void init_matrix( int N, magmaFloatComplex *h_A, magma_int_t lda ) { magma_int_t ione = 1, n2 = N*lda; magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for (int i = 0; i < N; ++i) { MAGMA_C_SET2REAL( h_A(i,i), MAGMA_C_REAL(h_A(i,i)) + N ); for (int j = 0; j < i; ++j) h_A(i, j) = MAGMA_C_CNJG( h_A(j, i) ); } }
extern "C" magma_int_t magma_cunmqr(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- MAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= CUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF. C (input/output) COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magmaFloatComplex c_one = MAGMA_C_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; /* Allocate work space on the GPU */ magmaFloatComplex_ptr dwork, dc; magma_malloc( &dc, (m)*(n)*sizeof(magmaFloatComplex) ); magma_malloc( &dwork, (m + n + 64)*64*sizeof(magmaFloatComplex) ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, c, 0, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_offset, i__4, lddwork; magma_int_t i__; magmaFloatComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; a_offset = 1 + lda; a -= a_offset; --tau; c_offset = 1 + ldc; c -= c_offset; *info = 0; left = lapackf77_lsame(lapack_const(side_), "L"); notran = lapackf77_lsame(lapack_const(trans_), "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(lapack_const(side_), "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(lapack_const(trans_), "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_C_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_cunmqr(lapack_const(side_), lapack_const(trans_), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i__ + 1; lapackf77_clarft("F", "C", &i__4, &ib, &a[i__ + i__ * lda], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ cpanel_to_q(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_csetmatrix( i__4, ib, &a[i__ + i__ * lda], 0, lda, dwork, 0, i__4, queue ); cq_to_panel(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i__ + 1; ic = i__; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i__ + 1; jc = i__; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, t, 0, ib, dwork, i__4*ib, ib, queue ); magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(ic + jc * m), m, dwork, (i__4*ib + ib*ib), lddwork, queue); } magma_cgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], 0, ldc, queue ); } MAGMA_C_SET2REAL( work[0], lwkopt ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_cunmqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); cuFloatComplex *h_A, *h_R, *h_work; float *rwork, *w1, *w2; magma_int_t *iwork; float gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, info; magma_int_t ione = 1, izero = 0; magma_int_t ISEED[4] = {0,0,0,1}; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVectorsStr; magma_int_t checkres; float result[3], eps = lapackf77_slamch( "E" ); if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); } else if ( strcmp("-JV", argv[i]) == 0 ) { jobz = MagmaVectorsStr; } else if ( strcmp("-JN", argv[i]) == 0 ) { jobz = MagmaNoVectorsStr; } } if (N>0) printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; if ( checkres && jobz[0] == MagmaNoVectors ) { printf( "Cannot check results when vectors are not computed (jobz='N')\n" ); checkres = false; } /* Query for workspace sizes */ cuFloatComplex aux_work[1]; float aux_rwork[1]; magma_int_t aux_iwork[1]; magma_cheevd( jobz[0], uplo[0], N, h_R, N, w1, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); magma_int_t lwork, lrwork, liwork; lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuFloatComplex, N*N ); TESTING_MALLOC( w1, float , N ); TESTING_MALLOC( w2, float , N ); TESTING_HOSTALLOC( h_R, cuFloatComplex, N*N ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); TESTING_MALLOC( rwork, float, lrwork ); TESTING_MALLOC( iwork, magma_int_t, liwork ); printf(" N CPU Time(s) GPU Time(s) \n"); printf("===================================\n"); for(i=0; i<8; i++){ if (argc==1){ N = size[i]; } n2 = N*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for( int i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* warm up run */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* query for optimal workspace sizes */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, -1, rwork, -1, iwork, -1, &info); int lwork_save = lwork; int lrwork_save = lrwork; int liwork_save = liwork; lwork = min( lwork, (magma_int_t) MAGMA_C_REAL( h_work[0] )); lrwork = min( lrwork, (magma_int_t) rwork[0] ); liwork = min( liwork, iwork[0] ); //printf( "lwork %d, query %d, used %d; liwork %d, query %d, used %d\n", // lwork_save, (magma_int_t) h_work[0], lwork, // liwork_save, iwork[0], liwork ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; lwork = lwork_save; lrwork = lrwork_save; liwork = liwork_save; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; cuFloatComplex *tau; lapackf77_chet21(&ione, uplo, &N, &izero, h_A, &N, w1, w1, h_R, &N, h_R, &N, tau, h_work, rwork, &result[0]); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); magma_cheevd('N', uplo[0], N, h_R, N, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cheevd(jobz, uplo, &N, h_A, &N, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of cheevd had an illegal value.\n", (int) -info); cpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); if ( checkres ){ printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %e\n", result[0]*eps); printf("(2) | I - U'U | / N = %e\n", result[1]*eps); printf("(3) | S(w/ U)-S(w/o U)|/ |S| = %e\n\n", result[2]); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_FREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
extern "C" magma_int_t magma_cgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dT, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGEHRD reduces a COMPLEX general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to CGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) COMPLEX array on the GPU, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices in dT, for later use in magma_cunghr. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define dA( i, j ) (dA + (i) + (j-ilo)*ldda) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magma_int_t nb = magma_get_cgehrd_nb(n); magma_int_t ldda = n; // assumed in clahru magma_int_t nh, iws; magma_int_t iinfo; magma_int_t ldwork; magma_int_t lquery; *info = 0; iws = n*nb; MAGMA_C_SET2REAL( work[0], (float) iws ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // GPU workspace is: // nb*ldda for dwork for clahru // nb*ldda for dV // n*ldda for dA magmaFloatComplex *dwork; if (MAGMA_SUCCESS != magma_cmalloc( &dwork, 2*nb*ldda + n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloatComplex *dV = dwork + nb*ldda; magmaFloatComplex *dA = dwork + nb*ldda*2; ldwork = n; magma_int_t i; magmaFloatComplex *T, *dTi; magma_cmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular czero_nbxnb_block(nb, dV, ldda); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for(i = 0; i < ilo; ++i) tau[i] = c_zero; for(i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; for(i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; magmablas_claset( 'F', nb, n, dT, nb ); // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // Copy the matrix to the GPU magma_csetmatrix( n, n-ilo, A(0,ilo), lda, dA, ldda ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_cgetmatrix( ihi-i, nb, dA(i,i), ldda, A (i,i), lda ); // add 1 to i for 1-based index magma_clahr2( ihi, i+1, nb, dA(0,i), dV, A (0,i), lda, &tau[i], T, nb, work, ldwork); // Copy T from the CPU to dT on the GPU dTi = dT + (i - ilo)*nb; magma_csetmatrix( nb, nb, T, nb, dTi, nb ); magma_clahru( n, ihi, i, nb, A (0,i), lda, dA(0,i), // dA dA(i,i), // dY, stored over current panel dV, dTi, dwork ); } // Copy remainder to host magma_cgetmatrix( n, n-i, dA(0,i), ldda, A (0,i), lda ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_cgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); MAGMA_C_SET2REAL( work[0], (float) iws ); magma_free( dwork ); magma_free_cpu( T ); return *info; } /* magma_cgehrd */
extern "C" magma_int_t magma_chetrd_he2hb_mgpu( char uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dAmgpu[], magma_int_t ldda, magmaFloatComplex *dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t streams[][20], magma_int_t nstream, magma_int_t threads, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHETRD_HE2HB reduces a complex Hermitian matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) COMPLEX array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a + ((a_2)-1)*( lda) + (a_1)-1) #define da_ref(a_1,a_2) (da + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define t_ref(a_1) (dT + ((a_1)-1)*(lddt)) #define Atest(a_1,a_2) ( Atest + ((a_2)-1)*( lda) + (a_1)-1) #define dttest(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt)) #define datest(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) char uplo_[2] = {uplo, 0}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex c_one = MAGMA_C_ONE ; magmaFloatComplex c_zero = MAGMA_C_ZERO; float d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nstream>=3); assert (nstream>=(ngpu+1)); *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { MAGMA_C_SET2REAL( work[0], lwkopt ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_int_t mklth = min(threads,16); magma_setlapack_numthreads(mklth); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n" , nbcmplx); #endif magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magmaFloatComplex *dspace[MagmaMaxGPUs]; magmaFloatComplex *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; magmaFloatComplex *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; magmaFloatComplex *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_cmalloc( &dspace[dev], devworksiz ); magma_cmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( streams[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_cmalloc_pinned ( &workngpu[ngpu], worksiz); magmaFloatComplex *worktest = NULL; //(magmaFloatComplex *) malloc(n*nb*sizeof(magmaFloatComplex)); // not used // ====================== magmaFloatComplex *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(magmaFloatComplex)); if (upper) { printf("CHETRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); }else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ){ // cpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. cpanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_cgetmatrix_async( (pm+pn), pn, datest(idev, i, di+1), ldda, a_ref ( i, i), lda, streams[ idev ][ nstream-1 ] ); /* magma_device_sync(); cudaMemcpy2DAsync(a_ref(i,i), lda*sizeof(magmaFloatComplex), datest(idev,i,di+1), ldda*sizeof(magmaFloatComplex), (pm+pn)*sizeof(magmaFloatComplex), pn, cudaMemcpyDeviceToHost, streams[ idev ][ nstream-1 ]); */ //magma_setdevice( 0 ); //printf("updating cher2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute CHER2K_MGPU magmablas_cher2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, streams, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( streams[idev][ nstream-1 ] ); //magma_setdevice( 0 ); cq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ lapackf77_cgeqrf(&pm, &pn, a_ref(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, a_ref(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ cpanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the CHER2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by cher2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_csetmatrix_async( pm, pk, a_ref(indi, indj), lda, dv[dev], pm, streams[dev][nstream-1] ); // Send the triangular factor T to the GPU magma_csetmatrix_async( pk, pk, hT, nb, dttest(dev, 1, i), lddt, streams[dev][nstream-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ nstream-1 ] ); magma_queue_sync( streams[dev][nstream-1] ); magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dttest(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nstream; ++s ) magma_queue_sync( streams[dev][s] ); } // compute CHEMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if(ngpu==1){ magmablasSetKernelStream( streams[ 0 ][ 0 ] ); magma_chemm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); }else{ magmablas_chemm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, streams, nstream-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of CHEMM has been done. // Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); // magma_queue_sync( streams[dev][0] ); magma_cgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ cq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next CHER2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( streams[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb){ /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( streams[ idev ][ nstream-1 ] ); //magma_queue_sync( streams[idev][0] ); removed because the sync has been done in the loop above magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev] , pm, c_one, datest(idev, indi, di+1), ldda); magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev] , pm, dv[idev], pm, c_one, datest(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( streams[ idev ][ 0 ] ); //printf("LAST CHER2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_cher2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev] , pm, d_one, datest(idev, indi, di+1), ldda); /* Send the last block to the CPU */ cpanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); magma_cgetmatrix( pk, pk, datest(idev, indi, di+1), ldda, a_ref(n-pk+1, n-pk+1), lda ); cq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for(i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { cudaEventDestroy(redevents[dev][e]); } } magma_free_pinned(workngpu[ngpu]); free(worktest); magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); MAGMA_C_SET2REAL( work[0], lwkopt ); magma_setlapack_numthreads(1); return *info; } /* chetrd_he2hb_ */
extern "C" magma_int_t magma_cheevx(char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *z, magma_int_t ldz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHEEVX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. ABSTOL (input) DOUBLE PRECISION The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ) , where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*SLAMCH('S'), not zero. If this routine returns with INFO>0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*SLAMCH('S'). See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) On normal exit, the first M elements contain the selected eigenvalues in ascending order. Z (output) COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = 'V', then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. If JOBZ = 'N', then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = 'V', the exact value of M is not known in advance and an upper bound must be used. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = 'V', LDZ >= max(1,N). WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,2*N-1). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the max of the blocksize for CHETRD and for CUNMTR as returned by ILAENV. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (7*N) IWORK (workspace) INTEGER array, dimension (5*N) IFAIL (output) INTEGER array, dimension (N) If JOBZ = 'V', then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = 'N', then IFAIL is not referenced. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, then i eigenvectors failed to converge. Their indices are stored in array IFAIL. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magma_int_t izero = 0; magma_int_t ione = 1; char order[1]; magma_int_t indd, inde; magma_int_t imax; magma_int_t lopt, itmp1, indee; magma_int_t lower, wantz; magma_int_t i, j, jj, i__1; magma_int_t alleig, valeig, indeig; magma_int_t iscale, indibl; magma_int_t indiwk, indisp, indtau; magma_int_t indrwk, indwrk; magma_int_t llwork, nsplit; magma_int_t lquery; magma_int_t iinfo; float safmin; float bignum; float smlnum; float eps, tmp1; float anrm; float sigma, d__1; float rmin, rmax; /* Function Body */ lower = lapackf77_lsame(uplo_, MagmaLowerStr); wantz = lapackf77_lsame(jobz_, MagmaVecStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -15; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb(n); lopt = n * (nb + 1); MAGMA_C_SET2REAL(work[0],(float)lopt); if (lwork < lopt && ! lquery) { *info = -17; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } *m = 0; /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevx(jobz_, range_, uplo_, &n, a, &lda, &vl, &vu, &il, &iu, &abstol, m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, info); return *info; } --w; --work; --rwork; --iwork; --ifail; /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, a, &lda, &rwork[1]); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { d__1 = 1.; lapackf77_clascl(uplo_, &izero, &izero, &d__1, &sigma, &n, &n, a, &lda, info); if (abstol > 0.) { abstol *= sigma; } if (valeig) { vl *= sigma; vu *= sigma; } } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ indd = 1; inde = indd + n; indrwk = inde + n; indtau = 1; indwrk = indtau + n; llwork = lwork - indwrk + 1; magma_chetrd(uplo, n, a, lda, &rwork[indd], &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); lopt = n + (magma_int_t)MAGMA_C_REAL(work[indwrk]); /* If all eigenvalues are desired and ABSTOL is less than or equal to zero, then call SSTERF or CUNGTR and CSTEQR. If this fails for some eigenvalue, then try SSTEBZ. */ if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) { blasf77_scopy(&n, &rwork[indd], &ione, &w[1], &ione); indee = indrwk + 2*n; if (! wantz) { i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_ssterf(&n, &w[1], &rwork[indee], info); } else { lapackf77_clacpy("A", &n, &n, a, &lda, z, &ldz); lapackf77_cungtr(uplo_, &n, z, &ldz, &work[indtau], &work[indwrk], &llwork, &iinfo); i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_csteqr(jobz_, &n, &w[1], &rwork[indee], z, &ldz, &rwork[indrwk], info); if (*info == 0) { for (i = 1; i <= n; ++i) { ifail[i] = 0; } } } if (*info == 0) { *m = n; } } /* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. */ if (*m == 0) { *info = 0; if (wantz) { *(unsigned char *)order = 'B'; } else { *(unsigned char *)order = 'E'; } indibl = 1; indisp = indibl + n; indiwk = indisp + n; lapackf77_sstebz(range_, order, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m, &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info); if (wantz) { lapackf77_cstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp], z, &ldz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info); /* Apply unitary matrix used in reduction to tridiagonal form to eigenvectors returned by CSTEIN. */ magma_cunmtr(MagmaLeft, uplo, MagmaNoTrans, n, *m, a, lda, &work[indtau], z, ldz, &work[indwrk], llwork, &iinfo); } } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = *m; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, &w[1], &ione); } /* If eigenvalues are not in order, then sort them, along with eigenvectors. */ if (wantz) { for (j = 1; j <= *m-1; ++j) { i = 0; tmp1 = w[j]; for (jj = j + 1; jj <= *m; ++jj) { if (w[jj] < tmp1) { i = jj; tmp1 = w[jj]; } } if (i != 0) { itmp1 = iwork[indibl + i - 1]; w[i] = w[j]; iwork[indibl + i - 1] = iwork[indibl + j - 1]; w[j] = tmp1; iwork[indibl + j - 1] = itmp1; blasf77_cswap(&n, z + (i-1)*ldz, &ione, z + (j-1)*ldz, &ione); if (*info != 0) { itmp1 = ifail[i]; ifail[i] = ifail[j]; ifail[j] = itmp1; } } } } /* Set WORK(1) to optimal complex workspace size. */ work[1] = MAGMA_C_MAKE((float) lopt, 0.); return *info; } /* magma_cheevx */
extern "C" magma_int_t magma_chetrd2_gpu(char uplo, magma_int_t n, magmaFloatComplex *da, magma_int_t ldda, float *d, float *e, magmaFloatComplex *tau, magmaFloatComplex *wa, magma_int_t ldwa, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dwork, magma_int_t ldwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHETRD2_GPU reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version passes a workspace that is used in an optimized GPU matrix-vector product. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. DA (device input/output) COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). D (output) COMPLEX array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) COMPLEX array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WA (workspace/output) COMPLEX array, dimension (LDA,N) On exit the diagonal, the upper part (UPLO='U') or the lower part (UPLO='L') are copies of DA LDWA (input) INTEGER The leading dimension of the array WA. LDWA >= max(1,N). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. DWORK (workspace/output) COMPLEX array on the GPU, dim (MAX(1,LDWORK)) LDWORK (input) INTEGER The dimension of the array DWORK. LDWORK >= (n*n+64-1)/64 + 2*n*nb, where nb = magma_get_chetrd_nb(n) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t nb = magma_get_chetrd_nb(n); magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; float d_one = MAGMA_D_ONE; magma_int_t kk, nx; magma_int_t i, j, i_n; magma_int_t iinfo; magma_int_t ldw, lddw, lwkopt; magma_int_t lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } else if (ldwa < max(1,n)) { *info = -9; } else if (lwork < 1 && ! lquery) { *info = -11; } /* Determine the block size. */ ldw = lddw = n; lwkopt = n * nb; if (*info == 0) { MAGMA_C_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } if (n < 1024) nx = n; else nx = 300; if (ldwork<(ldw*n+64-1)/64 + 2*ldw*nb) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (upper) { /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ kk = n - (n - nx + nb - 1) / nb * nb; for (i = n - nb; i >= kk; i -= nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel */ magma_cgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), ldwa ); magma_clatrd2(uplo, i+nb, nb, A(0, 0), ldwa, e, tau, work, ldw, dA(0, 0), ldda, dwork, lddw, dwork + 2*ldw*nb, ldwork - 2*ldw*nb); /* Update the unreduced submatrix A(0:i-2,0:i-2), using an update of the form: A := A - V*W' - W*V' */ magma_csetmatrix( i + nb, nb, work, ldw, dwork, lddw ); magma_cher2k(uplo, MagmaNoTrans, i, nb, c_neg_one, dA(0, i), ldda, dwork, lddw, d_one, dA(0, 0), ldda); /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { MAGMA_C_SET2REAL( *A(j-1, j), e[j - 1] ); d[j] = MAGMA_C_REAL( *A(j, j) ); } } magma_cgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), ldwa ); /* Use CPU code to reduce the last or only block */ lapackf77_chetrd(uplo_, &kk, A(0, 0), &ldwa, d, e, tau, work, &lwork, &iinfo); magma_csetmatrix( kk, kk, A(0, 0), ldwa, dA(0, 0), ldda ); } else { /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel */ magma_cgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), ldwa ); magma_clatrd2(uplo, n-i, nb, A(i, i), ldwa, &e[i], &tau[i], work, ldw, dA(i, i), ldda, dwork, lddw, dwork + 2*ldw*nb, ldwork - 2*ldw*nb); /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' */ magma_csetmatrix( n-i, nb, work, ldw, dwork, lddw ); magma_cher2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one, dA(i+nb, i), ldda, &dwork[nb], lddw, d_one, dA(i+nb, i+nb), ldda); /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { MAGMA_C_SET2REAL( *A(j+1, j), e[j] ); d[j] = MAGMA_C_REAL( *A(j, j) ); } } /* Use unblocked code to reduce the last or only block */ magma_cgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), ldwa ); i_n = n-i; lapackf77_chetrd(uplo_, &i_n, A(i, i), &ldwa, &d[i], &e[i], &tau[i], work, &lwork, &iinfo); magma_csetmatrix( n-i, n-i, A(i, i), ldwa, dA(i, i), ldda ); } MAGMA_C_SET2REAL( work[0], lwkopt ); return *info; } /* chetrd2_gpu */
extern "C" magma_int_t magma_cstedx(magma_vec_t range, magma_int_t n, float vl, float vu, magma_int_t il, magma_int_t iu, float* d, float* e, magmaFloatComplex* z, magma_int_t ldz, float* rwork, magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magmaFloat_ptr dwork, magma_int_t* info, magma_queue_t queue) { /* -- MAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 .. Scalar Arguments .. CHARACTER RANGE INTEGER IL, IU, INFO, LDZ, LIWORK, LRWORK, N REAL VL, VU .. .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), RWORK( * ), DWORK ( * ) COMPLEX Z( LDZ, * ) .. Purpose ======= CSTEDX computes some eigenvalues and eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. See SLAEX3 for details. Arguments ========= RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. N (input) INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. D (input/output) REAL array, dimension (N) On entry, the diagonal elements of the tridiagonal matrix. On exit, if INFO = 0, the eigenvalues in ascending order. E (input/output) REAL array, dimension (N-1) On entry, the subdiagonal elements of the tridiagonal matrix. On exit, E has been destroyed. Z (output) COMPLEX array, dimension (LDZ,N) On exit, if INFO = 0, Z contains the orthonormal eigenvectors of the symmetric tridiagonal matrix. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= max(1,N). RWORK (workspace/output) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. LRWORK must be at least 1 + 4*N + 2*N**2. Note that if N is less than or equal to the minimum divide size, usually 25, then LRWORK need only be max(1,2*(N-1)). If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. LIWORK must be at least 3 + 5*N . Note that if N is less than or equal to the minimum divide size, usually 25, then LIWORK need only be 1. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA ===================================================================== */ magma_vec_t range_ = range; magma_int_t alleig, indeig, valeig, lquery; magma_int_t i, j, smlsiz; magma_int_t liwmin, lrwmin; alleig = lapackf77_lsame(lapack_const(range_), "A"); valeig = lapackf77_lsame(lapack_const(range_), "V"); indeig = lapackf77_lsame(lapack_const(range_), "I"); lquery = lrwork == -1 || liwork == -1; *info = 0; if (! (alleig || valeig || indeig)) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldz < max(1,n)) { *info = -10; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -4; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -5; } else if (iu < min(n,il) || iu > n) { *info = -6; } } } if (*info == 0) { // Compute the workspace requirements smlsiz = get_cstedx_smlsize(); if( n <= 1 ){ lrwmin = 1; liwmin = 1; } else { lrwmin = 1 + 4*n + 2*n*n; liwmin = 3 + 5*n; } rwork[0] = lrwmin; iwork[0] = liwmin; if (lrwork < lrwmin && ! lquery) { *info = -12; } else if (liwork < liwmin && ! lquery) { *info = -14; } } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } // Quick return if possible if(n==0) return *info; if(n==1){ MAGMA_C_SET2REAL(*z,1.); return *info; } // If N is smaller than the minimum divide size (SMLSIZ+1), then // solve the problem with another solver. if (n < smlsiz){ char char_I[]= {'I', 0}; lapackf77_csteqr(char_I, &n, d, e, z, &ldz, rwork, info); } else { // We simply call SSTEDX instead. magma_sstedx(range, n, vl, vu, il, iu, d, e, rwork, n, rwork+n*n, lrwork-n*n, iwork, liwork, dwork, info, queue); for(j=0; j<n; ++j) for(i=0; i<n; ++i){ MAGMA_C_SET2REAL(*(z+i+ldz*j), *(rwork+i+n*j)); } } rwork[0] = lrwmin; iwork[0] = liwmin; return *info; } /* cstedx */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; cuFloatComplex *h_A, *h_R; cuFloatComplex *d_A; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info; const char *uplo = MagmaUpperStr; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotri_gpu -N %d\n\n", 1024); } /* Allocate host memory for the matrix */ n2 = size[9] * size[9]; ldda = ((size[9]+31)/32) * 32; TESTING_MALLOC( h_A, cuFloatComplex, n2); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2); TESTING_DEVALLOC( d_A, cuFloatComplex, ldda*size[9] ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS_CPOTRI( (float)N ) / 1000000; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*lda+i], ( MAGMA_C_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConjf(h_A[j*lda+i]); } } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //cublasSetMatrix( N, N, sizeof(cuFloatComplex), h_A, lda, d_A, ldda); //magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); /* factorize matrix */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); // check for exact singularity //magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); //magma_csetmatrix( N, N, h_R, lda, d_A, ldda ); start = get_current_time(); magma_cpotri_gpu(uplo[0], N, d_A, ldda, &info); end = get_current_time(); if (info != 0) printf("magma_cpotri_gpu returned error %d\n", (int) info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_cpotrf(uplo, &N, h_A, &lda, &info); start = get_current_time(); lapackf77_cpotri(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info != 0) printf("lapackf77_cpotri returned error %d\n", (int) info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_clange("f", &N, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
extern "C" magma_int_t magma_cgehrd2(magma_int_t n, magma_int_t ilo, magma_int_t ihi, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGEHRD2 reduces a COMPLEX general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to CGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magma_int_t nb = magma_get_cgehrd_nb(n); magma_int_t N = n, ldda = n; magma_int_t ib; magma_int_t nh, iws; magma_int_t nbmin, iinfo; magma_int_t ldwork; magma_int_t lquery; --tau; *info = 0; MAGMA_C_SET2REAL( work[0], (float) n * nb ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ nh = ihi - ilo + 1; if (nh <= 1) { work[0] = c_one; return *info; } magmaFloatComplex *da; if (MAGMA_SUCCESS != magma_cmalloc( &da, N*ldda + 2*N*nb + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloatComplex *d_A = da; magmaFloatComplex *d_work = da + (N+nb)*ldda; magma_int_t i__; magmaFloatComplex *t, *d_t; magma_cmalloc_cpu( &t, nb*nb ); if ( t == NULL ) { magma_free( da ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } d_t = d_work + nb * ldda; czero_nbxnb_block(nb, d_A+N*ldda, ldda); /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ for (i__ = 1; i__ < ilo; ++i__) tau[i__] = c_zero; for (i__ = max(1,ihi); i__ < n; ++i__) tau[i__] = c_zero; for(i__=0; i__< nb*nb; i__+=4) t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero; nbmin = 2; iws = 1; if (nb > 1 && nb < nh) { /* Determine when to cross over from blocked to unblocked code (last block is always handled by unblocked code) */ if (nb < nh) { /* Determine if workspace is large enough for blocked code */ iws = n * nb; if (lwork < iws) { /* Not enough workspace to use optimal NB: determine the minimum value of NB, and reduce NB or force use of unblocked code */ nbmin = nb; if (lwork >= n * nbmin) nb = lwork / n; else nb = 1; } } } ldwork = n; if (nb < nbmin || nb >= nh) { /* Use unblocked code below */ i__ = ilo; } else { /* Use blocked code */ /* Copy the matrix to the GPU */ magma_csetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda ); for (i__ = ilo; i__ < ihi - nb; i__ += nb) { /* Computing MIN */ ib = min(nb, ihi - i__); /* Reduce columns i:i+ib-1 to Hessenberg form, returning the matrices V and T of the block reflector H = I - V*T*V' which performs the reduction, and also the matrix Y = A*V*T */ /* Get the current panel (no need for the 1st iteration) */ magma_cgetmatrix( ihi-i__+1, ib, d_A + (i__ - ilo)*ldda + i__ - 1, ldda, a + (i__ - 1 )*lda + i__ - 1, lda ); magma_clahr2(ihi, i__, ib, d_A + (i__ - ilo)*ldda, d_A + N*ldda + 1, a + (i__ - 1 )*(lda) , lda, &tau[i__], t, nb, work, ldwork); /* Copy T from the CPU to D_T on the GPU */ magma_csetmatrix( nb, nb, t, nb, d_t, nb ); magma_clahru(n, ihi, i__ - 1, ib, a + (i__ - 1 )*(lda), lda, d_A + (i__ - ilo)*ldda, d_A + (i__ - ilo)*ldda + i__ - 1, d_A + N*ldda, d_t, d_work); } } /* Use unblocked code to reduce the rest of the matrix */ if (!(nb < nbmin || nb >= nh)) { magma_cgetmatrix( n, n-i__+1, d_A+ (i__-ilo)*ldda, ldda, a + (i__-1)*(lda), lda ); } lapackf77_cgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo); MAGMA_C_SET2REAL( work[0], (float) iws ); magma_free( da ); magma_free_cpu(t); return *info; } /* magma_cgehrd2 */
extern "C" magma_int_t magma_cungtr(char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CUNGTR generates a complex unitary matrix Q which is defined as the product of n-1 elementary reflectors of order N, as returned by CHETRD: if UPLO = 'U', Q = H(n-1) . . . H(2) H(1), if UPLO = 'L', Q = H(1) H(2) . . . H(n-1). Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A contains elementary reflectors from CHETRD; = 'L': Lower triangle of A contains elementary reflectors from CHETRD. N (input) INTEGER The order of the matrix Q. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the vectors which define the elementary reflectors, as returned by CHETRD. On exit, the N-by-N unitary matrix Q. LDA (input) INTEGER The leading dimension of the array A. LDA >= N. TAU (input) COMPLEX array, dimension (N-1) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CHETRD. WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N-1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. DT (input) COMPLEX array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i) as returned by magma_chetrd. NB (input) INTEGER This is the block size used in CHETRD, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(i,j) ( a + (j)*lda+ (i)) char uplo_[2] = {uplo, 0}; magma_int_t i__1; magma_int_t i, j; magma_int_t iinfo; magma_int_t upper, lwkopt, lquery; *info = 0; lquery = lwork == -1; upper = lapackf77_lsame(uplo_, "U"); if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else /* if(complicated condition) */ { /* Computing MAX */ if (lwork < max(1, n-1) && ! lquery) { *info = -7; } } lwkopt = max(1, n) * nb; if (*info == 0) { MAGMA_C_SET2REAL( work[0], lwkopt); } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { work[0] = MAGMA_C_ONE; return *info; } if (upper) { /* Q was determined by a call to CHETRD with UPLO = 'U' Shift the vectors which define the elementary reflectors one column to the left, and set the last row and column of Q to those of the unit matrix */ for (j = 0; j < n-1; ++j) { for (i = 0; i < j-1; ++i) *a_ref(i, j) = *a_ref(i, j + 1); *a_ref(n-1, j) = MAGMA_C_ZERO; } for (i = 0; i < n-1; ++i) { *a_ref(i, n-1) = MAGMA_C_ZERO; } *a_ref(n-1, n-1) = MAGMA_C_ONE; /* Generate Q(1:n-1,1:n-1) */ i__1 = n - 1; lapackf77_cungql(&i__1, &i__1, &i__1, a_ref(0,0), &lda, tau, work, &lwork, &iinfo); } else { /* Q was determined by a call to CHETRD with UPLO = 'L'. Shift the vectors which define the elementary reflectors one column to the right, and set the first row and column of Q to those of the unit matrix */ for (j = n-1; j > 0; --j) { *a_ref(0, j) = MAGMA_C_ZERO; for (i = j; i < n-1; ++i) *a_ref(i, j) = *a_ref(i, j - 1); } *a_ref(0, 0) = MAGMA_C_ONE; for (i = 1; i < n-1; ++i) *a_ref(i, 0) = MAGMA_C_ZERO; if (n > 1) { /* Generate Q(2:n,2:n) */ magma_cungqr(n-1, n-1, n-1, a_ref(1, 1), lda, tau, dT, nb, &iinfo); } } MAGMA_C_SET2REAL( work[0], lwkopt); return *info; } /* magma_cungtr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; magmaFloatComplex *h_A, *h_R, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif /* Matrix size */ float *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1};; magma_int_t info_ortho = 0; magma_int_t info_solution = 0; magma_int_t info_reduction = 0; magma_timestr_t start, end; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t ngpu = opts.ngpu; char jobz = opts.jobz; magma_int_t checkres = opts.check; char range = 'A'; char uplo = opts.uplo; magma_int_t itype = opts.itype; float f = opts.fraction; if (f != 1) range='I'; if ( checkres && jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); jobz = MagmaVec; } printf("using: itype = %d, jobz = %c, range = %c, uplo = %c, checkres = %d, fraction = %6.4f\n", (int) itype, jobz, range, uplo, (int) checkres, f); printf(" N M GPU Time(s) \n"); printf("==========================\n"); magma_int_t threads = magma_get_numthreads(); for( magma_int_t i = 0; i < opts.ntest; ++i ) { for( magma_int_t iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, magmaFloatComplex, n2); TESTING_MALLOC( w1, float , N); TESTING_MALLOC( w2, float , N); TESTING_HOSTALLOC(h_R, magmaFloatComplex, n2); TESTING_HOSTALLOC(h_work, magmaFloatComplex, lwork); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTALLOC( rwork, float, lrwork); #endif TESTING_MALLOC( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Make diagonal real */ for(int i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == 'I'){ il = 1; iu = (int) (f*N); } if(opts.warmup){ // ================================================================== // Warmup using MAGMA // ================================================================== lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); if(ngpu==1){ printf("calling cheevdx_2stage 1 GPU\n"); magma_cheevdx_2stage(jobz, range, uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); }else{ printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu); magma_cheevdx_2stage_m(ngpu, jobz, range, uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); start = get_current_time(); if(ngpu==1){ printf("calling cheevdx_2stage 1 GPU\n"); magma_cheevdx_2stage(jobz, range, uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); }else{ printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu); magma_cheevdx_2stage_m(ngpu, jobz, range, uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { float eps = lapackf77_slamch("E"); printf("\n"); printf("------ TESTS FOR MAGMA CHEEVD ROUTINE ------- \n"); printf(" Size of the Matrix %d by %d\n", (int) N, (int) N); printf("\n"); printf(" The matrix A is randomly generated for each test.\n"); printf("============\n"); printf(" The relative machine precision (eps) is %8.2e\n",eps); printf(" Computational tests pass if scaled residuals are less than 60.\n"); /* Check the orthogonality, reduction and the eigen solutions */ if (jobz == MagmaVec) { info_ortho = check_orthogonality(N, N, h_R, N, eps); info_reduction = check_reduction(uplo, N, 1, h_A, w1, N, h_R, eps); } printf("------ CALLING LAPACK CHEEVD TO COMPUTE only eigenvalue and verify elementswise ------- \n"); lapackf77_cheevd("N", "L", &N, h_A, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); info_solution = check_solution(N, w2, w1, eps); if ( (info_solution == 0) & (info_ortho == 0) & (info_reduction == 0) ) { printf("***************************************************\n"); printf(" ---- TESTING CHEEVD ...................... PASSED !\n"); printf("***************************************************\n"); } else { printf("************************************************\n"); printf(" - TESTING CHEEVD ... FAILED !\n"); printf("************************************************\n"); } } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %6.2f\n", (int) N, (int) m1, gpu_time); TESTING_FREE( h_A); TESTING_FREE( w1); TESTING_FREE( w2); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTFREE( rwork); #endif TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return 0; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaFloatComplex *hA, *hR; magmaFloatComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaFloatComplex mz_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaFloatComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_C_SET2REAL( hA(i,i), MAGMA_C_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_C_CNJG( hA(j,i) ); } } lapackf77_clacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_csetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_cpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_cpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_cpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_cpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_cpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_cgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_clange("f", &N, &N, hA, &lda, work); blasf77_caxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_clange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvd */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); cuFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; float *rwork, *w1, *w2; magma_int_t *iwork; float gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[4] = {1024,2048,4100,6001}; magma_int_t i, itype, info; magma_int_t ione = 1, izero = 0; magma_int_t five = 5; cuFloatComplex c_zero = MAGMA_C_ZERO; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float d_one = 1.; float d_neg_one = -1.; float d_ten = 10.; magma_int_t ISEED[4] = {0,0,0,1}; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVectorsStr; itype = 1; magma_int_t checkres; float result[4]; int flagN = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0){ printf(" testing_chegvd -N %d\n\n", (int) N); flagN=1; } else { printf("\nUsage: \n"); printf(" testing_chegvd -N %d\n\n", (int) N); exit(1); } } if (strcmp("-itype", argv[i])==0){ itype = atoi(argv[++i]); if (itype>0 && itype <= 3){ printf(" testing_chegvd -itype %d\n\n", (int) itype); } else { printf("\nUsage: \n"); printf(" testing_chegvd -itype %d\n\n", (int) itype); exit(1); } } if (strcmp("-L", argv[i])==0){ uplo = MagmaLowerStr; printf(" testing_chegvd -L"); } if (strcmp("-U", argv[i])==0){ uplo = MagmaUpperStr; printf(" testing_chegvd -U"); } } } else { printf("\nUsage: \n"); printf(" testing_chegvd -L/U -N %d -itype %d\n\n", 1024, 1); } if(!flagN) N = size[3]; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; n2 = N * N; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuFloatComplex, n2); TESTING_MALLOC( h_B, cuFloatComplex, n2); TESTING_MALLOC( w1, float , N); TESTING_MALLOC( w2, float , N); TESTING_HOSTALLOC(h_R, cuFloatComplex, n2); TESTING_HOSTALLOC(h_S, cuFloatComplex, n2); magma_int_t nb = magma_get_chetrd_nb(N); magma_int_t lwork = 2*N*nb + N*N; magma_int_t lrwork = 1 + 5*N +2*N*N; magma_int_t liwork = 3 + 5*N; TESTING_HOSTALLOC(h_work, cuFloatComplex, lwork); TESTING_MALLOC( rwork, float, lrwork); TESTING_MALLOC( iwork, magma_int_t, liwork); printf(" N CPU Time(s) GPU Time(s) \n"); printf("===================================\n"); for(i=0; i<4; i++){ if (!flagN){ N = size[i]; n2 = N*N; } /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); //lapackf77_clatms( &N, &N, "U", ISEED, "P", w1, &five, &d_ten, // &d_one, &N, &N, uplo, h_B, &N, h_work, &info); //lapackf77_claset( "A", &N, &N, &c_zero, &c_one, h_B, &N); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_C_SET2REAL( h_B[i*N+i], MAGMA_C_REAL(h_B[i*N+i]) + 1.*N ); MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_chegvd(itype, jobz[0], uplo[0], N, h_R, N, h_S, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_chegvd(itype, jobz[0], uplo[0], N, h_R, N, h_S, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | I - V V' B | / ( N ) (itype = 1,2) | B - V V' | / ( |B| N ) (itype = 3) (3) | S(with V) - S(w/o V) | / | S | =================================================================== */ float temp1, temp2; cuFloatComplex *tau; if (itype == 1 || itype == 2){ lapackf77_claset( "A", &N, &N, &c_zero, &c_one, h_S, &N); blasf77_cgemm("N", "C", &N, &N, &N, &c_one, h_R, &N, h_R, &N, &c_zero, h_work, &N); blasf77_chemm("R", uplo, &N, &N, &c_neg_one, h_B, &N, h_work, &N, &c_one, h_S, &N); result[1]= lapackf77_clange("1", &N, &N, h_S, &N, rwork) / N; } else if (itype == 3){ lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N); blasf77_cherk(uplo, "N", &N, &N, &d_neg_one, h_R, &N, &d_one, h_S, &N); result[1]= lapackf77_clanhe("1",uplo, &N, h_S, &N, rwork) / N / lapackf77_clanhe("1",uplo, &N, h_B, &N, rwork); } result[0] = 1.; result[0] /= lapackf77_clanhe("1",uplo, &N, h_A, &N, rwork); result[0] /= lapackf77_clange("1",&N , &N, h_R, &N, rwork); if (itype == 1){ blasf77_chemm("L", uplo, &N, &N, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", uplo, &N, &N, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &N, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_chemm("L", uplo, &N, &N, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", uplo, &N, &N, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_clange("1", &N, &N, h_R, &N, rwork)/N; } else if (itype == 3){ blasf77_chemm("L", uplo, &N, &N, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", uplo, &N, &N, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_clange("1", &N, &N, h_R, &N, rwork)/N; } /* lapackf77_chet21(&ione, uplo, &N, &izero, h_A, &N, w1, w1, h_R, &N, h_R, &N, tau, h_work, rwork, &result[0]); */ lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_chegvd(itype, 'N', uplo[0], N, h_R, N, h_S, N, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_chegvd(&itype, jobz, uplo, &N, h_A, &N, h_B, &N, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of chegvd had an illegal value.\n", (int) -info); cpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); if ( checkres ){ printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %e\n", result[0]); if(itype==1 || itype ==2) printf("(2) | I - Z Z' B | / N = %e\n", result[1]); else printf("(2) | B - Z Z' | / (|B| N) = %e\n", result[1]); printf("(3) | D(w/ Z)-D(w/o Z)|/ |D| = %e\n\n", result[2]); } if (flagN) break; } /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( h_B); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_FREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); TESTING_HOSTFREE( h_S); /* Shutdown */ TESTING_CUDA_FINALIZE(); }