/** Purpose ------- ZGEHRD reduces a COMPLEX_16 general matrix A to upper Hessenberg form H by a unitary similarity transformation: Q' * A * Q = H. This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to ZGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the unitary matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] T COMPLEX_16 array, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    ( a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (     a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (     h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (     v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (     v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (     v2  v3  v4  h   h   h )
    (                         a )    (                         a )
@endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices, for later use in magma_zunghr.
@ingroup magma_zgeev_comp ********************************************************************/ extern "C" magma_int_t magma_zgehrd_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magmaDoubleComplex *T, magma_int_t *info) { #define A( i, j ) (A + (i) + (j)*lda) #define dA( d, i, j ) (data.A[d] + (i) + (j)*ldda) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magma_int_t nb = magma_get_zgehrd_nb(n); magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, d; magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork; magma_int_t iinfo; magma_int_t lquery; struct zgehrd_data data; int ngpu = magma_num_gpus(); *info = 0; iws = n*(nb + nb*ngpu); work[0] = MAGMA_Z_MAKE( iws, 0 ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; // set T to zero lapackf77_zlaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb ); // set to null, to simplify cleanup code for( d = 0; d < ngpu; ++d ) { data.A[d] = NULL; data.streams[d] = NULL; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb >= nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // allocate memory on GPUs for A and workspaces ldda = ((n+31)/32)*32; min_lblocks = (n / nb) / ngpu; max_lblocks = ((n-1) / nb) / ngpu + 1; last_dev = (n / nb) % ngpu; // V and Vd need to be padded for copying in mzlahr2 data.ngpu = ngpu; data.ldda = ldda; data.ldv = nb*max_lblocks*ngpu; data.ldvd = nb*max_lblocks; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); nlocal = min_lblocks*nb; if ( d < last_dev ) { nlocal += nb; } else if ( d == last_dev ) { nlocal += (n % nb); } ldwork = nlocal*ldda // A + nb*data.ldv // V + nb*data.ldvd // Vd + nb*ldda // Y + nb*ldda // W + nb*nb; // Ti if ( MAGMA_SUCCESS != magma_zmalloc( &data.A[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } data.V [d] = data.A [d] + nlocal*ldda; data.Vd[d] = data.V [d] + nb*data.ldv; data.Y [d] = data.Vd[d] + nb*data.ldvd; data.W [d] = data.Y [d] + nb*ldda; data.Ti[d] = data.W [d] + nb*ldda; magma_queue_create( &data.streams[d] ); } // Copy the matrix to GPUs magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, data.A, ldda, ngpu, nb ); // round ilo down to block boundary ilo = (ilo/nb)*nb; for (i = ilo; i < ihi - 1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; if ( i > ilo ) { magma_setdevice( dpanel ); magma_zgetmatrix( ihi-i, nb, dA(dpanel, i, di), ldda, A(i,i), lda ); } // add 1 to i for 1-based index magma_zlahr2_m( ihi, i+1, nb, A(0,i), lda, &tau[i], &T[i*nb], nb, work, n, &data ); 
magma_zlahru_m( n, ihi, i, nb, A, lda, &data ); // copy first i rows above panel to host magma_setdevice( dpanel ); magma_zgetmatrix_async( i, nb, dA(dpanel, 0, di), ldda, A(0,i), lda, data.streams[dpanel] ); } // Copy remainder to host, block-by-block for( i2 = i; i2 < n; i2 += nb ) { ib = min( nb, n-i2 ); d = (i2 / nb) % ngpu; di = (i2 / nb) / ngpu * nb; magma_setdevice( d ); magma_zgetmatrix( n, ib, dA(d, 0, di), ldda, A(0,i2), lda ); } } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_zgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_Z_MAKE( iws, 0 ); CLEANUP: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( data.A[d] ); magma_queue_destroy( data.streams[d] ); } magma_setdevice( orig_dev ); return *info; } /* magma_zgehrd */
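/* A minimal usage sketch (not from the MAGMA distribution; the function name
   example_zgehrd_m is hypothetical): it illustrates the lwork = -1 workspace
   query documented above and the nb*n sizing of T, with ilo = 1 and ihi = n
   (no prior balancing). Error handling is elided. */
static void example_zgehrd_m( magma_int_t n )
{
    magma_int_t lda = n, info, ione = 1, n2 = n*n;
    magma_int_t ISEED[4] = {0, 0, 0, 1};
    magma_int_t nb = magma_get_zgehrd_nb( n );
    magmaDoubleComplex *hA, *tau, *T, *work, query;

    magma_zmalloc_cpu( &hA,  lda*n );
    magma_zmalloc_cpu( &tau, n-1   );
    magma_zmalloc_cpu( &T,   nb*n  );           // block T factors, as documented above
    lapackf77_zlarnv( &ione, ISEED, &n2, hA );  // random test matrix

    // workspace query: the optimal lwork comes back in query
    magma_zgehrd_m( n, 1, n, hA, lda, tau, &query, -1, T, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magma_zmalloc_cpu( &work, lwork );

    magma_zgehrd_m( n, 1, n, hA, lda, tau, work, lwork, T, &info );
    // hA now holds H plus the reflectors; T is ready for magma_zunghr

    magma_free_cpu( hA ); magma_free_cpu( tau );
    magma_free_cpu( T );  magma_free_cpu( work );
}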
extern "C" magma_int_t magma_zbicgstab_merge2( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE2; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaDoubleComplex alpha, beta, omega, rho_old, rho_new, *skp_h={0}; double nom, nom0, betanom, nomb; //double den; // some useful variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE; magma_int_t dofs = A.num_rows; // workspace magma_z_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR}; magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL; d1 = NULL; d2 = NULL; skp = NULL; CHECK( magma_zmalloc( &d1, dofs*(2) )); CHECK( magma_zmalloc( &d2, dofs*(2) )); // array for the parameters CHECK( magma_zmalloc( &skp, 8 )); // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] CHECK( magma_zvinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue )); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver setup magma_zscal( dofs, c_zero, x->dval, 1, queue ); // x = 0 CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue)); magma_zcopy( dofs, r.dval, 1, q(0), 1, queue ); // rr = r magma_zcopy( dofs, r.dval, 1, q(1), 1, queue ); // q = r betanom = nom0; nom = nom0*nom0; rho_new = magma_zdotc( dofs, r.dval, 1, r.dval, 1, queue ); // rho=<rr,r> rho_old = omega = alpha = MAGMA_Z_MAKE( 1.0, 0. 
); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters CHECK( magma_zmalloc_cpu( &skp_h, 8 )); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_Z_MAKE(nom, 0.0); skp_h[6]=c_zero; skp_h[7]=c_zero; // tmp1, tmp2 magma_zsetvector( 8, skp_h, 1, skp, 1, queue ); solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if( nom0 < solver_par->atol || nom0/nomb < solver_par->rtol ){ info = MAGMA_SUCCESS; goto cleanup; } CHECK( magma_z_spmv( c_one, A, r, c_zero, v, queue )); // v = A r //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes p=r+beta*(p-omega*v) CHECK( magma_zbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue )); CHECK( magma_zbicgmerge_spmv1( A, d1, d2, q(2), q(0), q(3), skp, queue )); solver_par->spmv_count++; CHECK( magma_zbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v CHECK( magma_zbicgmerge_spmv2( A, d1, d2, q(4), q(5), skp, queue )); solver_par->spmv_count++; CHECK( magma_zbicgmerge_xrbeta( dofs, d1, d2, q(0), q(1), q(2), q(4), q(5), x->dval, skp, queue )); // check stopping criterion (asynchronous copy) magma_zgetvector( 1, skp+5, 1, skp_h+5, 1, queue ); betanom = sqrt(MAGMA_Z_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) { break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidual( A, b, *x, &residual, queue )); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* zbicgstab_merge2 */
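/* For reference, a sketch (not MAGMA code; the helper name is hypothetical) of
   the vector update that magma_zbicgmerge1 above fuses into a single kernel,
   p = r + beta*(p - omega*v), written as the three unmerged BLAS-1 calls that
   the non-merged magma_zpbicgstab below issues. Fusing them saves two full
   passes over the vectors per iteration. */
static void zbicg_p_update_unfused(
    magma_int_t dofs, magmaDoubleComplex beta, magmaDoubleComplex omega,
    magmaDoubleComplex_ptr v, magmaDoubleComplex_ptr r,
    magmaDoubleComplex_ptr p, magma_queue_t queue )
{
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_zscal( dofs, beta, p, 1, queue );                        // p = beta*p
    magma_zaxpy( dofs, c_neg_one*omega*beta, v, 1, p, 1, queue );  // p = p - beta*omega*v
    magma_zaxpy( dofs, c_one, r, 1, p, 1, queue );                 // p = p + r
}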
/** Purpose ------- ZGEGQR orthogonalizes the N vectors given by a complex M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented, indicated by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_zgeqr2x3_gpu) and magma_zungqr. 3: Modified Gram-Schmidt (MGS). 4: Cholesky QR [ Note: this method uses the normal equations, which squares the condition number of A; therefore ||I - Q'Q|| < O(eps cond(A)^2) ]. @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalesced memory accesses, LDDA must be divisible by 16. @param dwork (GPU workspace) COMPLEX_16 array, dimension: n^2 for ikind = 1; 3 n^2 + min(m, n) + 2 for ikind = 2; 0 (not used) for ikind = 3; n^2 for ikind = 4. @param[out] work (CPU workspace) COMPLEX_16 array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: for ikind = 4, the normal equations were not positive definite, so the factorization could not be completed, and the solution has not been computed.
@ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dwork, magmaDoubleComplex *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; double cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if (ikind == 1) { // === Iterative, based on SVD ============================================================ magmaDoubleComplex *U, *VT, *vt, *R, *G, *hwork, *tau; double *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_zmalloc_cpu( &hwork, 32 + 2*n*n + 2*n ); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part of hwork; used as workspace in the SVD U = hwork + n*n + 32; // Size n*n S = (double*)(U + n*n); // Size n tau = U + n*n + n; // Size n #ifdef COMPLEX double *rwork; magma_dmalloc_cpu( &rwork, 5*n ); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n, queue ); magma_zgetmatrix( n, n, dwork, n, G, n, queue ); lapackf77_zgesvd( "n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, #ifdef COMPLEX rwork, #endif info ); mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_dsqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_zgeqrf( &n, &n, VT, &n, tau, hwork, &lwork, info ); if (i == 1) blasf77_zcopy( &n2, VT, &ione, R, &ione ); else blasf77_ztrmm( "l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n ); magma_zsetmatrix( n, n, VT, n, dwork, n, queue ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda, queue ); if (mins > 0.00001f) cn = maxs/mins; //fprintf( stderr, "Iteration %d, cond num = %f \n", i, cn ); } while (cn > 10.f); magma_free_cpu( hwork ); #ifdef COMPLEX magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; magmaDoubleComplex_ptr dtau = dwork + 2*n*n; magmaDoubleComplex_ptr d_T = dwork; magmaDoubleComplex_ptr ddA = dwork + n*n; magmaDoubleComplex *tau = work+n*n; magmablas_zlaset( MagmaFull, n, n, c_zero, c_zero, d_T, n, queue ); magma_zgeqr2x3_gpu( m, n, dA, ldda, dtau, d_T, ddA, (double*)(dwork+min_mn+2*n*n), info ); magma_zgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn, queue ); magma_zgetmatrix( n, n, ddA, n, work, n, queue ); magma_zungqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS
=================================================== for (j = 0; j < n; j++) { for (i = 0; i < j; i++) { *work(i, j) = magma_zdotc( m, dA(0,i), 1, dA(0,j), 1, queue ); magma_zaxpy( m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1, queue ); } for (i = j; i < n; i++) { *work(i, j) = MAGMA_Z_ZERO; } //*work(j,j) = MAGMA_Z_MAKE( magma_dznrm2( m, dA(0,j), 1), 0., queue ); *work(j,j) = magma_zdotc( m, dA(0,j), 1, dA(0,j), 1, queue ); *work(j,j) = MAGMA_Z_MAKE( sqrt(MAGMA_Z_REAL( *work(j,j) )), 0. ); magma_zscal( m, 1./ *work(j,j), dA(0,j), 1, queue ); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n, queue ); magma_zgetmatrix( n, n, dwork, n, work, n, queue ); lapackf77_zpotrf( "u", &n, work, &n, info ); magma_zsetmatrix( n, n, work, n, dwork, n, queue ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda, queue ); // ================== end of ikind == 4 =================================================== } magma_queue_destroy( queue ); return *info; } /* magma_zgegqr_gpu */
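/* The ikind == 4 (Cholesky QR) branch above is three calls: G = A^H*A (gemm),
   G = R^H*R (potrf), A = A*R^{-1} (trsm). Below is a hedged CPU-side sketch of
   the same algorithm (the helper name cholesky_qr_cpu is hypothetical), using
   the Fortran BLAS/LAPACK wrappers already used in this file. It makes the
   documented caveat concrete: forming the normal equations squares cond(A),
   so ||I - Q^H Q|| grows like O(eps * cond(A)^2). */
static magma_int_t cholesky_qr_cpu(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,   // m-by-n, overwritten by Q
    magmaDoubleComplex *R )                   // n-by-n, receives the R factor
{
    magma_int_t info = 0;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    // R := A^H * A (normal equations)
    blasf77_zgemm( "c", "n", &n, &n, &m, &c_one, A, &lda, A, &lda, &c_zero, R, &n );
    // R := upper Cholesky factor, A^H*A = R^H*R
    lapackf77_zpotrf( "u", &n, R, &n, &info );
    if (info != 0) return info;  // normal equations not positive definite
    // A := A * R^{-1}; the columns of A are now the orthonormal Q
    blasf77_ztrsm( "r", "u", "n", "n", &m, &n, &c_one, R, &n, A, &lda );
    return info;
}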
extern "C" void magma_zherk_mgpu2( magma_int_t num_gpus, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, double alpha, magmaDoubleComplex **db, magma_int_t lddb, magma_int_t offset_b, double beta, magmaDoubleComplex **dc, magma_int_t lddc, magma_int_t offset, magma_int_t num_streams, magma_queue_t stream[][10]) { #define dB(id, i, j) (db[(id)]+(j)*lddb + (i)+offset_b) #define dC(id, i, j) (dc[(id)]+(j)*lddc + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t i, id, ib, ii, kk, n1; magmaDoubleComplex z_alpha = MAGMA_Z_MAKE(alpha,0.0); magmaDoubleComplex z_beta = MAGMA_Z_MAKE(beta, 0.0); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); } if (uplo == MagmaUpper) { for( i=0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); n1 = i+ib; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); /* zgemm on diag and off-diagonal blocks */ magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, z_alpha, dB(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, 0, ii), lddc); } } else { for( i=0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); n1 = n-i; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); /* zgemm on diag and off-diagonal blocks */ magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, z_alpha, dB(id, i, 0), lddb, dB(id, i, 0), lddb, z_beta, dC(id, i+offset, ii), lddc); trace_gpu_end( id, kk ); } } // TODO: why not sync? //for( id=0; id < num_gpus; id++ ) { // magma_setdevice(id); // //for( kk=0; kk < num_streams; kk++ ) // // magma_queue_sync(stream[id][kk]); //} magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
/** Purpose ------- ZHEGVDX computes selected eigenvalues and, optionally, eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*B*x=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX_16 array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N.
If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_zhetrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: ZPOTRF or ZHEEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if ZHEEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. 
@ingroup magma_zhegv_driver ********************************************************************/ extern "C" magma_int_t magma_zhegvdx(magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *B, magma_int_t ldb, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *m, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex *dA=NULL, *dB=NULL; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; magma_trans_t trans; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin; magma_int_t liwmin; magma_int_t lrwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_zhetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (lrwork < lrwmin && ! lquery) { *info = -19; } else if (liwork < liwmin && ! lquery) { *info = -21; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zhegvd(&itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_zmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. 
*/ magma_zsetmatrix( n, n, B, ldb, dB, lddb ); magma_zsetmatrix_async( n, n, A, lda, dA, ldda, stream ); magma_timer_t time=0; timer_start( time ); magma_zpotrf_gpu(uplo, n, dB, lddb, info); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time zpotrf_gpu = %6.2f\n", time ); magma_queue_sync( stream ); magma_zgetmatrix_async( n, n, dB, lddb, B, ldb, stream ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_zhegst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info); timer_stop( time ); timer_printf( "time zhegst_gpu = %6.2f\n", time ); /* Simple fix to be able to run bigger sizes: we need a dwork here that can be used as dB and then passed to zheevd. */ if (n > 5000) { magma_queue_sync( stream ); magma_free( dB ); } timer_start( time ); magma_zheevdx_gpu(jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, m, w, A, lda, work, lwork, rwork, lrwork, iwork, liwork, info); timer_stop( time ); timer_printf( "time zheevdx_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (n > 5000) { if (MAGMA_SUCCESS != magma_zmalloc( &dB, n*lddb ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zsetmatrix( n, n, B, ldb, dB, lddb ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaConjTrans; } else { trans = MagmaNoTrans; } magma_ztrsm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dA, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaConjTrans; } magma_ztrmm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dA, ldda); } magma_zgetmatrix( n, *m, dA, ldda, A, lda ); /* free dB */ if (n > 5000) { magma_free( dB ); } timer_stop( time ); timer_printf( "time ztrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; magma_free( dA ); if (n <= 5000) { magma_free( dB ); } return *info; } /* magma_zhegvdx */
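/* Hedged usage sketch (not from the MAGMA testers; example_zhegvdx is a
   hypothetical name): the two-phase call pattern for magma_zhegvdx documented
   above. Phase one queries the optimal workspace sizes with
   lwork = lrwork = liwork = -1; phase two allocates and solves itype = 1
   (A*x = lambda*B*x) for eigenvalues il..iu with eigenvectors. A, B, and w
   are caller-supplied host arrays; error handling is elided. */
static void example_zhegvdx(
    magma_int_t n, magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *B, magma_int_t ldb,
    magma_int_t il, magma_int_t iu, double *w )
{
    magmaDoubleComplex wquery;
    double rwquery;
    magma_int_t iwquery, m, info;

    // workspace query: optimal sizes come back in the first entries
    magma_zhegvdx( 1, MagmaVec, MagmaRangeI, MagmaLower, n, A, lda, B, ldb,
                   0., 0., il, iu, &m, w,
                   &wquery, -1, &rwquery, -1, &iwquery, -1, &info );
    magma_int_t lwork  = (magma_int_t) MAGMA_Z_REAL( wquery );
    magma_int_t lrwork = (magma_int_t) rwquery;
    magma_int_t liwork = iwquery;

    magmaDoubleComplex *work;
    double *rwork;
    magma_int_t *iwork;
    magma_zmalloc_cpu( &work,  lwork  );
    magma_dmalloc_cpu( &rwork, lrwork );
    magma_imalloc_cpu( &iwork, liwork );

    magma_zhegvdx( 1, MagmaVec, MagmaRangeI, MagmaLower, n, A, lda, B, ldb,
                   0., 0., il, iu, &m, w,
                   work, lwork, rwork, lrwork, iwork, liwork, &info );
    // on success, w[0..m-1] holds the eigenvalues, A the eigenvectors

    magma_free_cpu( work ); magma_free_cpu( rwork ); magma_free_cpu( iwork );
}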
extern "C" magma_int_t magma_zpbicgstab( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PBICGSTAB; solver_par->numiter = 0; solver_par->spmv_count = 0; // some useful variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t dofs = A.num_rows*b.num_cols; // workspace magma_z_matrix r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR}, ms={Magma_CSR}, mt={Magma_CSR}, y={Magma_CSR}, z={Magma_CSR}; CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &rr,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &ms,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &mt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver variables magmaDoubleComplex alpha, beta, omega, rho_old, rho_new; double nom, betanom, nom0, r0, res, nomb; res=0; //double den; // solver setup CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue)); magma_zcopy( dofs, r.dval, 1, rr.dval, 1, queue ); // rr = r betanom = nom0; nom = nom0*nom0; rho_new = omega = alpha = MAGMA_Z_MAKE( 1.0, 0. 
); solver_par->init_res = nom0; nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } if ( nom < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2, tempop1, tempop2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; rho_old = rho_new; // rho_old=rho rho_new = magma_zdotc( dofs, rr.dval, 1, r.dval, 1, queue ); // rho=<rr,r> beta = rho_new/rho_old * alpha/omega; // beta=rho/rho_old *alpha/omega if( magma_z_isnan_inf( beta ) ){ info = MAGMA_DIVERGENCE; break; } magma_zscal( dofs, beta, p.dval, 1, queue ); // p = beta*p magma_zaxpy( dofs, c_neg_one * omega * beta, v.dval, 1 , p.dval, 1, queue ); // p = p-omega*beta*v magma_zaxpy( dofs, c_one, r.dval, 1, p.dval, 1, queue ); // p = p+r // preconditioner tempop1 = magma_sync_wtime( queue ); CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, p, &mt, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, mt, &y, precond_par, queue )); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; CHECK( magma_z_spmv( c_one, A, y, c_zero, v, queue )); // v = A*y, y = M^{-1}*p solver_par->spmv_count++; alpha = rho_new / magma_zdotc( dofs, rr.dval, 1, v.dval, 1, queue ); if( magma_z_isnan_inf( alpha ) ){ info = MAGMA_DIVERGENCE; break; } magma_zcopy( dofs, r.dval, 1 , s.dval, 1, queue ); // s=r magma_zaxpy( dofs, c_neg_one * alpha, v.dval, 1 , s.dval, 1, queue ); // s=s-alpha*v // preconditioner tempop1 = magma_sync_wtime( queue ); CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, s, &ms, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, ms, &z, precond_par, queue )); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; CHECK( magma_z_spmv( c_one, A, z, c_zero, t, queue )); // t = A*z, z = M^{-1}*s solver_par->spmv_count++; // omega = <s,t>/<t,t> omega = magma_zdotc( dofs, t.dval, 1, s.dval, 1, queue ) / magma_zdotc( dofs, t.dval, 1, t.dval, 1, queue ); magma_zaxpy( dofs, alpha, y.dval, 1 , x->dval, 1, queue ); // x = x + alpha*y magma_zaxpy( dofs, omega, z.dval, 1 , x->dval, 1, queue ); // x = x + omega*z magma_zcopy( dofs, s.dval, 1 , r.dval, 1, queue ); // r=s magma_zaxpy( dofs, c_neg_one * omega, t.dval, 1 , r.dval, 1, queue ); // r=r-omega*t res = betanom = magma_dznrm2( dofs, r.dval, 1, queue ); nom = betanom*betanom; if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue)); solver_par->final_res = residual; solver_par->iter_res = res; if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if (
(solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&r, queue ); magma_zmfree(&rr, queue ); magma_zmfree(&p, queue ); magma_zmfree(&v, queue ); magma_zmfree(&s, queue ); magma_zmfree(&t, queue ); magma_zmfree(&ms, queue ); magma_zmfree(&mt, queue ); magma_zmfree(&y, queue ); magma_zmfree(&z, queue ); solver_par->info = info; return info; } /* magma_zbicgstab */
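/* A hypothetical helper (not part of MAGMA) that factors out the stopping
   test used by both BiCGSTAB variants above: the iteration stops once the
   residual norm passes either the absolute tolerance or the tolerance
   relative to ||b||. */
static inline bool zsolver_converged(
    double res, double nomb, const magma_z_solver_par *solver_par )
{
    return ( res <= solver_par->atol || res/nomb <= solver_par->rtol );
}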
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; double *w1i, *w2i; magmaDoubleComplex *w1copy, *w2copy; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_dlamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_dgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by dget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( w1i, double, N ); TESTING_MALLOC_CPU( w2i, double, N ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( VL, double, n2 ); TESTING_MALLOC_PIN( VR, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, opts.queue, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. 
* * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ double vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_dget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_dnrm2( N, &VR[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VR[j*lda], ione ), magma_cblas_dnrm2( N, &VR[(j+1)*lda], ione )); result[1] = max( result[1], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_D_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_D_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_dget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_dnrm2( N, &VL[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VL[j*lda], ione ), magma_cblas_dnrm2( N, &VL[(j+1)*lda], ione )); result[3] = max( result[3], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_D_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_D_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) 
result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times double *LRE, DUM; TESTING_MALLOC_PIN( LRE, double, n2 ); lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_dgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. //lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_dgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, opts.queue, &info ); //if (info != 0) // printf("magma_dgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_dgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_D_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_dgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_D_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. 
=================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgeev( lapack_vec_const(opts.jobvl), lapack_vec_const(opts.jobvr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_Z_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_Z_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], lessthan ); std::sort( w2copy, &w2copy[N], lessthan ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaDoubleComplex diff = MAGMA_Z_SUB( w1copy[j], w2copy[j2] ); double diff2 = magma_dzlapy2( diff ) / max( magma_dzlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_zaxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = magma_cblas_dznrm2( N, w1copy, 1 ); error /= magma_cblas_dznrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %8.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? "ok" : "failed")); status += ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? "ok" : "failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? "ok" : "failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? "ok" : "failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? "ok" : "failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? "ok" : "failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? "ok" : "failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? "ok" : "failed")); } int newline = 0; if ( result[0] != -1 ) { status += ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status += ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status += ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status += ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status += ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status += ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status += ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status += ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status += ! 
(result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); if ( opts.uplo == MagmaUpper ) { printf("** for uplo=MagmaUpper, magmablas_zhemv simply calls cublas_zhemv.\n"); } printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); magma_time = magma_sync_wtime( 0 ); magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); /* 
===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
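/*
  The accuracy check in the tester above follows the usual BLAS-testing
  pattern: compute a CPU reference with blasf77_zhemv, subtract it from the
  GPU result with zaxpy, and report the max-norm of the difference divided
  by N against tol = opts.tolerance * eps. A minimal self-contained sketch
  of the same check in plain C, using real arithmetic and illustrative
  names (not the MAGMA API):
*/
#include <math.h>

/* returns max_i |y_test[i] - y_ref[i]| / n; compare against tol */
static double check_hemv( int n, const double *y_test, const double *y_ref )
{
    double err = 0.0;
    for (int i = 0; i < n; ++i) {
        double d = fabs( y_test[i] - y_ref[i] );
        if (d > err) err = d;
    }
    return err / n;
}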
extern "C" magma_int_t magma_zhetrd_gpu(char uplo, magma_int_t n, magmaDoubleComplex *da, magma_int_t ldda, double *d, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *wa, magma_int_t ldwa, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHETRD_GPU reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. DA (device input/output) COMPLEX_16 array on the GPU, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). D (output) COMPLEX_16 array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) COMPLEX_16 array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. TAU (output) COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WA (workspace/output) COMPLEX_16 array, dimension (LDA,N) On exit the diagonal, the upper part (UPLO='U') or the lower part (UPLO='L') are copies of DA LDWA (input) INTEGER The leading dimension of the array WA. LDWA >= max(1,N). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_zhetrd_nb(). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). 
Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t nb = magma_get_zhetrd_nb(n); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; double d_one = MAGMA_D_ONE; magma_int_t kk, nx; magma_int_t i, j, i_n; magma_int_t iinfo; magma_int_t ldw, lddw, lwkopt; magma_int_t lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } else if (ldwa < max(1,n)) { *info = -9; } else if (lwork < nb*n && ! lquery) { *info = -11; } /* Determine the block size. */ ldw = lddw = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magmaDoubleComplex *dwork; if (n < 2048) nx = n; else nx = 512; if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (ldw*nb) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (upper) { /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. 
*/ kk = n - (n - nx + nb - 1) / nb * nb; for (i = n - nb; i >= kk; i -= nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel */ magma_zgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), ldwa ); magma_zlatrd(uplo, i+nb, nb, A(0, 0), ldwa, e, tau, work, ldw, dA(0, 0), ldda, dwork, lddw); /* Update the unreduced submatrix A(0:i-2,0:i-2), using an update of the form: A := A - V*W' - W*V' */ magma_zsetmatrix( i + nb, nb, work, ldw, dwork, lddw ); magma_zher2k(uplo, MagmaNoTrans, i, nb, c_neg_one, dA(0, i), ldda, dwork, lddw, d_one, dA(0, 0), ldda); /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { *A(j-1,j) = MAGMA_Z_MAKE( e[j - 1], 0 ); d[j] = MAGMA_Z_REAL( *A(j, j) ); } } magma_zgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), ldwa ); /* Use CPU code to reduce the last or only block */ lapackf77_zhetrd(uplo_, &kk, A(0, 0), &ldwa, d, e, tau, work, &lwork, &iinfo); magma_zsetmatrix( kk, kk, A(0, 0), ldwa, dA(0, 0), ldda ); } else { /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel */ magma_zgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), ldwa ); magma_zlatrd(uplo, n-i, nb, A(i, i), ldwa, &e[i], &tau[i], work, ldw, dA(i, i), ldda, dwork, lddw); /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' */ magma_zsetmatrix( n-i, nb, work, ldw, dwork, lddw ); magma_zher2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one, dA(i+nb, i), ldda, &dwork[nb], lddw, d_one, dA(i+nb, i+nb), ldda); /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { *A(j+1,j) = MAGMA_Z_MAKE( e[j], 0 ); d[j] = MAGMA_Z_REAL( *A(j, j) ); } } /* Use unblocked code to reduce the last or only block */ magma_zgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), ldwa ); i_n = n-i; lapackf77_zhetrd(uplo_, &i_n, A(i, i), &ldwa, &d[i], &e[i], &tau[i], work, &lwork, &iinfo); magma_zsetmatrix( n-i, n-i, A(i, i), ldwa, dA(i, i), ldda ); } magma_free( dwork ); work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); return *info; } /* zhetrd_gpu */
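/*
  In the upper-triangular branch above, the split between GPU-blocked panels
  and the final unblocked CPU block is
      kk = n - ceil((n - nx)/nb) * nb,
  i.e. columns 0:kk-1 go to LAPACK's unblocked zhetrd on the CPU while the
  trailing columns are reduced in nb-wide panels on the GPU, walking from the
  bottom panel upward. A small sketch that just prints this partition
  (plain C, illustrative only):
*/
#include <stdio.h>

static void print_hetrd_split( int n, int nb, int nx )
{
    /* same integer arithmetic as in the routine above */
    int kk = n - (n - nx + nb - 1) / nb * nb;
    printf( "unblocked CPU block: columns 0:%d\n", kk - 1 );
    for (int i = n - nb; i >= kk; i -= nb)
        printf( "GPU panel:           columns %d:%d\n", i, i + nb - 1 );
}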
extern "C" magma_int_t magma_zgeqrf3(magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.6.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2015 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ==================================================================== */ cuDoubleComplex c_one = MAGMA_Z_ONE; int k, ib; magma_qr_params *qr_params = (magma_qr_params *)cntxt->params; *info = 0; int lwkopt = n * qr_params->nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); long int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) return MAGMA_SUCCESS; k = min(m,n); if (k == 0) { work[0] = c_one; return MAGMA_SUCCESS; } int M=qr_params->nthreads*qr_params->ob; int N=qr_params->nthreads*qr_params->ob; if (qr_params->m > qr_params->n) M = qr_params->m - (qr_params->n-qr_params->nthreads*qr_params->ob); /* Use MAGMA code to factor left portion of matrix, waking up threads along the way to perform updates on the right portion of matrix */ magma_zgeqrf2(cntxt, m, n - qr_params->nthreads * qr_params->ob, a, lda, tau, work, lwork, info); /* Wait for all update threads to finish */ for (k = 0; k < qr_params->nthreads; k++) { while (qr_params->sync1[k] == 0) { sched_yield(); } } /* Unzero upper part of each panel */ for (k = 0; k < qr_params->np_gpu-1; k++) { ib = min(qr_params->nb, (n-qr_params->nthreads*qr_params->ob)-qr_params->nb*k); zq_to_panel(MagmaUpper, ib, a + k*qr_params->nb*lda + k*qr_params->nb, lda, qr_params->w+qr_params->nb*qr_params->nb*k); } /* Use final blocking size */ qr_params->nb = qr_params->fb; /* Flag MAGMA code to internally unzero upper part of each panel */ qr_params->flag = 1; /* Use MAGMA code to perform final factorization if necessary */ if (qr_params->m > (qr_params->n - (qr_params->nthreads*qr_params->ob))) { if (M > (qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads)))) M = qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads)); } magma_zgeqrf2(cntxt, M, N, a + (n-qr_params->nthreads*qr_params->ob)*m+ (n-qr_params->nthreads*qr_params->ob), lda, &tau[n-qr_params->nthreads*qr_params->ob], work, lwork, info); /* Prepare for next run */ for (k = 0; k < qr_params->np_gpu; k++) { qr_params->p[k] = NULL; } for (k = 0; k < qr_params->nthreads; k++) { qr_params->sync1[k] = 0; } /* Infrastructure for next run is not in place yet */ qr_params->sync0 = 0; /* Signal update threads to get in position for next run */ qr_params->sync2 = 1; return MAGMA_SUCCESS; }
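/*
  magma_zgeqrf3 follows the LAPACK workspace-query convention used
  throughout this file: calling with LWORK = -1 only writes the optimal
  LWORK into WORK[0] and returns. A sketch of the resulting two-call
  pattern against the plain LAPACKE zgeqrf interface (assuming a standard
  LAPACKE installation; error handling trimmed):
*/
#include <stdlib.h>
#include <complex.h>
#include <lapacke.h>

static lapack_int qr_with_query( lapack_int m, lapack_int n,
                                 lapack_complex_double *a, lapack_int lda,
                                 lapack_complex_double *tau )
{
    lapack_complex_double query;
    /* first call: lwork = -1 is a workspace query; the size lands in query */
    lapack_int info = LAPACKE_zgeqrf_work( LAPACK_COL_MAJOR, m, n, a, lda,
                                           tau, &query, -1 );
    if (info != 0) return info;
    lapack_int lwork = (lapack_int) creal( query );
    lapack_complex_double *work = malloc( lwork * sizeof(*work) );
    /* second call: the actual factorization with the optimal workspace */
    info = LAPACKE_zgeqrf_work( LAPACK_COL_MAJOR, m, n, a, lda,
                                tau, work, lwork );
    free( work );
    return info;
}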
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m(magma_int_t nrgpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T; magma_zmalloc_pinned(&T, nb*nb); //printf("calling zunmqr_m with nb=%d\n", (int) nb); magmaDoubleComplex* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band_stream( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_zsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_zgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . .
H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magmablasSetKernelStream(NULL); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_zunmqr */
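/*
  The multi-GPU path above lays C out in a 1-D block-cyclic column
  distribution: column block i of width nb_l belongs to GPU i % ngpu and
  starts at local column (i / ngpu) * nb_l there, which is exactly the
  dC(igpu, 0, i/nrgpu*nb_l) indexing in the copy loops. A tiny sketch of the
  owner/offset map (plain C, illustrative only):
*/
#include <stdio.h>

static void block_owner( int i, int nb_l, int ngpu, int *gpu, int *local_col )
{
    *gpu       = i % ngpu;           /* round-robin owner of block i   */
    *local_col = (i / ngpu) * nb_l;  /* where the block starts locally */
}

/* example: 7 column blocks of width 256 spread over 3 GPUs */
int main( void )
{
    for (int i = 0; i < 7; ++i) {
        int gpu, col;
        block_owner( i, 256, 3, &gpu, &col );
        printf( "block %d -> GPU %d, local column %d\n", i, gpu, col );
    }
    return 0;
}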
extern "C" magma_int_t magma_zcg( magma_z_sparse_matrix A, magma_z_vector b, magma_z_vector *x, magma_z_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_CG; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE; magma_int_t dofs = A.num_rows; // GPU workspace magma_z_vector r, p, q; magma_z_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_z_vinit( &p, Magma_DEV, dofs, c_zero, queue ); magma_z_vinit( &q, Magma_DEV, dofs, c_zero, queue ); // solver variables magmaDoubleComplex alpha, beta; double nom, nom0, r0, betanom, betanomsq, den; // solver setup magma_zscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_zcopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_zcopy( dofs, b.dval, 1, p.dval, 1 ); // p = b nom0 = betanom = magma_dznrm2( dofs, r.dval, 1 ); nom = nom0 * nom0; // nom = r' * r magma_z_spmv( c_one, A, p, c_zero, q, queue ); // q = A p den = MAGMA_Z_REAL( magma_zdotc(dofs, p.dval, 1, q.dval, 1) );// den = p dot q solver_par->init_res = nom0; if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. (Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { alpha = MAGMA_Z_MAKE(nom/den, 0.); magma_zaxpy(dofs, alpha, p.dval, 1, x->dval, 1); // x = x + alpha p magma_zaxpy(dofs, -alpha, q.dval, 1, r.dval, 1); // r = r - alpha q betanom = magma_dznrm2(dofs, r.dval, 1); // betanom = || r || betanomsq = betanom * betanom; // betanoms = r' * r if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } beta = MAGMA_Z_MAKE(betanomsq/nom, 0.); // beta = betanoms/nom magma_zscal(dofs, beta, p.dval, 1); // p = beta*p magma_zaxpy(dofs, c_one, r.dval, 1, p.dval, 1); // p = p + r magma_z_spmv( c_one, A, p, c_zero, q, queue ); // q = A p den = MAGMA_Z_REAL(magma_zdotc(dofs, p.dval, 1, q.dval, 1)); // den = p dot q nom = betanomsq; } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_zresidual( A, b, *x, &residual, queue ); solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( 
(solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_z_vfree(&r, queue ); magma_z_vfree(&p, queue ); magma_z_vfree(&q, queue ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_zcg */
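/*
  Stripped of the MAGMA vector plumbing, magma_zcg above is the textbook
  conjugate gradient iteration. A minimal dense, real-valued sketch of the
  same structure (plain C; A must be symmetric positive definite; all names
  are illustrative):
*/
#include <math.h>
#include <string.h>

static void matvec( int n, const double *A, const double *x, double *y )
{   /* y = A*x for a column-major dense A */
    for (int i = 0; i < n; ++i) y[i] = 0.0;
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < n; ++i)
            y[i] += A[i + j*n] * x[j];
}

static double dot( int n, const double *x, const double *y )
{
    double s = 0.0;
    for (int i = 0; i < n; ++i) s += x[i]*y[i];
    return s;
}

/* solves A x = b; r, p, q are caller-provided length-n work buffers;
   returns the iteration count, or -1 if maxiter is exhausted */
static int cg( int n, const double *A, const double *b, double *x,
               double *r, double *p, double *q, int maxiter, double tol )
{
    memset( x, 0, n*sizeof(double) );   /* x = 0                */
    memcpy( r, b, n*sizeof(double) );   /* r = b - A*x = b      */
    memcpy( p, r, n*sizeof(double) );   /* p = r                */
    double nom = dot( n, r, r );
    for (int iter = 1; iter <= maxiter; ++iter) {
        matvec( n, A, p, q );                        /* q = A*p          */
        double alpha = nom / dot( n, p, q );
        for (int i = 0; i < n; ++i) { x[i] += alpha*p[i]; r[i] -= alpha*q[i]; }
        double betanom = dot( n, r, r );
        if ( sqrt(betanom) < tol ) return iter;      /* converged        */
        double beta = betanom / nom;
        for (int i = 0; i < n; ++i) p[i] = r[i] + beta*p[i];
        nom = betanom;
    }
    return -1;
}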
extern "C" magma_int_t magma_zgelqf( magma_int_t m, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZGELQF computes an LQ factorization of a COMPLEX_16 M-by-N matrix A: A = L * Q. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -10 internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) magmaDoubleComplex *dA, *dAT; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t maxm, maxn, maxdim, nb; magma_int_t iinfo, ldda; int lquery; /* Function Body */ *info = 0; nb = magma_get_zgelqf_nb(m); work[0] = MAGMA_Z_MAKE( (double)(m*nb), 0 ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min(m, n) == 0) { work[0] = c_one; return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); if (maxdim*maxdim < 2*maxm*maxn) { ldda = maxdim; if (MAGMA_SUCCESS != magma_zmalloc( &dA, maxdim*maxdim )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zsetmatrix( m, n, a, lda, dA, ldda ); dAT = dA; magmablas_ztranspose_inplace( ldda, dAT, ldda ); } else { ldda = maxn; if (MAGMA_SUCCESS != magma_zmalloc( &dA, 2*maxn*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zsetmatrix( m, n, a, lda, dA, maxm ); dAT = dA + maxn * maxm; magmablas_ztranspose2( dAT, ldda, dA, maxm, m, n ); } magma_zgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo); if (maxdim*maxdim < 2*maxm*maxn) { magmablas_ztranspose_inplace( ldda, dAT, ldda ); magma_zgetmatrix( m, n, dA, ldda, a, lda ); } else { magmablas_ztranspose2( dA, maxm, dAT, ldda, n, m ); magma_zgetmatrix( m, n, dA, maxm, a, lda ); } magma_free( dA ); return *info; } /* magma_zgelqf */
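/*
  magma_zgelqf obtains the LQ factorization by transposing A on the GPU,
  running the QR factorization magma_zgeqrf2_gpu on the transpose, and
  transposing back. The device buffer is picked by comparing one square
  maxdim x maxdim buffer (transposed in place) against two rectangular
  maxm x maxn buffers (transposed out of place), whichever needs less
  memory. A sketch of just that size decision (plain C, illustrative):
*/
#include <stdio.h>

static void lq_buffer_choice( int m, int n )
{
    int maxm   = ((m + 31)/32)*32;   /* pad dimensions to multiples of 32 */
    int maxn   = ((n + 31)/32)*32;
    int maxdim = (maxm > maxn ? maxm : maxn);
    if ( maxdim*maxdim < 2*maxm*maxn )
        printf( "in-place: one %d x %d buffer, %d elements\n",
                maxdim, maxdim, maxdim*maxdim );
    else
        printf( "out-of-place: two %d x %d buffers, %d elements\n",
                maxm, maxn, 2*maxm*maxn );
}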
/** Purpose ------- ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= LQ2 + N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= LQ2 + 2*N + N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by magma_bulge_get_lq2. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. 
@param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_zheev_driver ********************************************************************/ extern "C" magma_int_t magma_zheevdx_2stage_m(magma_int_t nrgpu, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *m, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { #define A( i_,j_) (A + (i_) + (j_)*lda) #define A2(i_,j_) (A2 + (i_) + (j_)*lda2) const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magmaDoubleComplex c_one = MAGMA_Z_ONE; double d_one = 1.; magma_int_t ione = 1; magma_int_t izero = 0; double d__1; double eps; double anrm; magma_int_t imax; double rmin, rmax; double sigma; //magma_int_t iinfo; magma_int_t lwmin, lrwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; double safmin; double bignum; double smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t len; /* determine the number of threads */ magma_int_t parallel_threads = magma_get_parallel_numthreads(); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! 
(lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_zbulge_nb(n, parallel_threads); magma_int_t Vblksiz = magma_zbulge_get_Vblksiz(n, nb, parallel_threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_zbulge_get_lq2(n, parallel_threads); if (wantz) { lwmin = lq2 + 2*n + n*n; lrwmin = 1 + 5*n + 2*n*n; liwmin = 5*n + 3; } else { lwmin = lq2 + n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL(A[0]); if (wantz) { A[0] = MAGMA_Z_ONE; } return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); timer_printf("using %d parallel_threads\n", (int) parallel_threads); /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if ( ( ntiles < 2 ) || ( n <= 128 ) ) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zheevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe("M", uplo_, &n, A, &lda, rwork); iscale = 0; if (anrm > 0. 
&& anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } magma_int_t indT2 = 0; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; magma_int_t indwk2 = indwrk + n*n; magma_int_t llwork = lwork - indwrk; magma_int_t llwrk2 = lwork - indwk2; magma_int_t inde = 0; magma_int_t indrwk = inde + n; magma_int_t llrwk = lrwork - indrwk; magma_timer_t time=0, time_total=0, time_alloc=0, time_dist=0, time_band=0; timer_start( time_total ); #ifdef HE2HB_SINGLEGPU magmaDoubleComplex *dT1; if (MAGMA_SUCCESS != magma_zmalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } timer_start( time_band ); magma_zhetrd_he2hb(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, dT1, info); timer_stop( time_band ); timer_printf( " 1 GPU seq code time zhetrd_he2hb only = %7.4f\n", time_band ); magma_free(dT1); #else magma_int_t nstream = max(3,nrgpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs], *dT1[MagmaMaxGPUs]; magma_int_t ldda = ((n+31)/32)*32; magma_int_t ver = 0; magma_int_t distblk = max(256, 4*nb); #ifdef ENABLE_DEBUG printf("voici ngpu %d distblk %d NB %d nstream %d version %d \n ", nrgpu, distblk, nb, nstream, ver); #endif timer_start( time_alloc ); for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_int_t mlocal = ((n / distblk) / nrgpu + 1) * distblk; magma_setdevice( dev ); // TODO check malloc magma_zmalloc(&da[dev], ldda*mlocal ); magma_zmalloc(&dT1[dev], (n*nb) ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } timer_stop( time_alloc ); timer_start( time_dist ); magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, da, ldda, nrgpu, distblk ); magma_setdevice(0); timer_stop( time_dist ); timer_start( time_band ); if (ver == 30) { magma_zhetrd_he2hb_mgpu_spec(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info); } else { magma_zhetrd_he2hb_mgpu(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info); } timer_stop( time_band ); timer_printf(" time alloc %7.4f, ditribution %7.4f, zhetrd_he2hb only = %7.4f\n", time_alloc, time_dist, time_band ); for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_setdevice( dev ); magma_free( da[dev] ); magma_free( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } #endif // not HE2HB_SINGLEGPU timer_stop( time_total ); timer_printf( " time zhetrd_he2hb_mgpu = %6.2f\n", time_total ); timer_start( time_total ); timer_start( time ); /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); magmaDoubleComplex* A2 = &work[indwrk]; memset(A2, 0, n*lda2*sizeof(magmaDoubleComplex)); for (magma_int_t j = 0; j < n-nb; j++) { len = nb+1; blasf77_zcopy( &len, A(j,j), &ione, A2(0,j), &ione ); memset(A(j,j), 0, (nb+1)*sizeof(magmaDoubleComplex)); *A(nb+j,j) = c_one; } for (magma_int_t j = 0; j < nb; j++) { len = nb-j; blasf77_zcopy( &len, A(j+n-nb,j+n-nb), &ione, A2(0,j+n-nb), &ione ); memset(A(j+n-nb,j+n-nb), 0, (nb-j)*sizeof(magmaDoubleComplex)); } timer_stop( time ); 
timer_printf( " time zhetrd_convert = %6.2f\n", time ); timer_start( time ); magma_zhetrd_hb2st(uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); timer_stop( time ); timer_stop( time_total ); timer_printf( " time zhetrd_hb2st = %6.2f\n", time ); timer_printf( " time zhetrd = %6.2f\n", time_total ); /* For eigenvalues only, call DSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { timer_start( time ); lapackf77_dsterf(&n, w, &rwork[inde], info); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); timer_stop( time ); timer_printf( " time dstedc = %6.2f\n", time ); } else { timer_start( time_total ); timer_start( time ); magma_zstedx_m(nrgpu, range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, info); timer_stop( time ); timer_printf( " time zstedx_m = %6.2f\n", time ); timer_start( time ); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); /* magmaDoubleComplex *dZ; magma_int_t lddz = n; if (MAGMA_SUCCESS != magma_zmalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zbulge_back(uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); magma_zgetmatrix( n, *m, dZ, lddz, &work[indwrk], n); magma_free(dZ); */ magma_zbulge_back_m(nrgpu, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); timer_stop( time ); timer_printf( " time zbulge_back_m = %6.2f\n", time ); timer_start( time ); magma_zunmqr_m(nrgpu, MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, A+nb, lda, &work[indtau1], &work[indwrk + n * (il-1) + nb], n, &work[indwk2], llwrk2, info); lapackf77_zlacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, A, &lda); timer_stop( time ); timer_stop( time_total ); timer_printf( " time zunmqr_m + copy = %6.2f\n", time ); timer_printf( " time eigenvectors backtransf. = %6.2f\n", time_total ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; magma_setdevice( orig_dev ); return *info; } /* magma_zheevdx_2stage_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlaset Code is very similar to testing_zlacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex *d_A; magmaDoubleComplex offdiag = MAGMA_Z_MAKE( 1.2000, 6.7000 ); magmaDoubleComplex diag = MAGMA_Z_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldb, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("==================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldb = lda; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(magmaDoubleComplex) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(magmaDoubleComplex) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, size ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, size ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_Z_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); //magmablas_zlaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_zlaset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_zlaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_zlaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_zgetmatrix( M, N, d_A, ldda, h_R, lda ); blasf77_zaxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work); printf("%4c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
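/*
  The "TODO wrong for trapezoid" note above concerns the byte count: the
  0.5*N*(N+1) element count is exact only when M == N. For an M x N matrix,
  the elements laset touches in the Lower/Upper cases form a triangle of
  side min(M,N) plus a rectangular remainder. A hedged sketch of the general
  count (my own arithmetic, not taken from the tester):
*/
static double laset_elements( int M, int N, char uplo )
{
    double k = (M < N ? M : N);                  /* diagonal length       */
    if ( uplo == 'L' ) return k*M - k*(k-1)/2;   /* on and below diagonal */
    if ( uplo == 'U' ) return k*N - k*(k-1)/2;   /* on and above diagonal */
    return (double) M * N;                       /* full matrix           */
}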
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *hA, *hR; magmaDoubleComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 }; magma_int_t i, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zpotrf2_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( &device, 2, &num ); if ( err != 0 or num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( device, &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( device, &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { hA(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(hA(i,i)) + N, 0 ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_Z_CNJG( hA(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, lda, dA, 0, ldda, queue1); clFinish(queue1); magma_zpotrf2_gpu( MagmaLower, N, dA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( N, N, hA, lda, dA, 0, ldda, queue1 ); clFinish(queue1); gpu_time = magma_wtime(); magma_zpotrf2_gpu( MagmaLower, N, dA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zpotrf2 had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zpotrf( MagmaLowerStr, &N, hA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_zgetmatrix( N, N, dA, 0, ldda, hR, lda, queue1 ); matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work); blasf77_zaxpy(&n2, &mz_one, hA, 
&ione, hR, &ione); diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_PIN( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { TESTING_INIT(); magma_queue_t queue; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magma_z_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; magma_int_t *pntre; magmaDoubleComplex c_one = MAGMA_Z_MAKE(1.0, 0.0); magmaDoubleComplex c_zero = MAGMA_Z_MAKE(0.0, 0.0); magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_zspmv" " [ --blocksize %d --alignment %d (for SELLP) ]" " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); magma_zm_5stencil( laplace_size, &hA, queue ); } else { // file-matrix test magma_z_csr_mtx( &hA, argv[i], queue ); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) hA.num_rows, (int) hA.num_cols, (int) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; magma_z_vector hx, hy, dx, dy, hrefvec, hcheck; // init CPU vectors magma_z_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue ); magma_z_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue ); // init DEV vectors magma_z_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue ); magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); #ifdef MAGMA_WITH_MKL // calling MKL with CSR pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) ); pntre[0] = 0; for (j=0; j<hA.num_rows; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT *col; TESTING_MALLOC_CPU( col, MKL_INT, nnz ); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_MALLOC_CPU( row, MKL_INT, num_rows ); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.row[ t ]; } start = magma_wtime(); for (j=0; j<10; j++ ) { mkl_zcsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); TESTING_FREE_CPU( row ); TESTING_FREE_CPU( col ); free(pntre); #endif // MAGMA_WITH_MKL // copy matrix to GPU magma_z_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); // SpMV on GPU (CSR) -- this is the reference!
start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_z_spmv( c_one, dA, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_z_mfree(&dA, queue ); magma_z_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ); // convert to ELL and copy to GPU magma_z_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue ); magma_z_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue ); magma_z_mfree(&hA_ELL, queue ); magma_z_vfree( &dy, queue ); magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (ELL) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_z_spmv( c_one, dA_ELL, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_z_mfree(&dA_ELL, queue ); magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]); if ( res < .000001 ) printf("# tester spmv ELL: ok\n"); else printf("# tester spmv ELL: failed\n"); magma_z_vfree( &hcheck, queue ); // convert to SELLP and copy to GPU magma_z_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ); magma_z_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ); magma_z_mfree(&hA_SELLP, queue ); magma_z_vfree( &dy, queue ); magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_z_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv SELL-P: ok\n"); else printf("# tester spmv SELL-P: failed\n"); magma_z_vfree( &hcheck, queue ); magma_z_mfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); magmaDoubleComplex alpha = c_one; magmaDoubleComplex beta = c_zero; magma_z_vfree( &dy, queue ); magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // copy matrix to GPU magma_z_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseZcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); cusparseMatDescr_t descrA; cusparseStatus = cusparseCreateMatDescr(&descrA); if (cusparseStatus != 0) printf("error\n"); cusparseHybMat_t hybA; cusparseStatus = cusparseCreateHybMat( &hybA ); if (cusparseStatus != 0) printf("error\n"); magma_z_vtransfer( dy, &hcheck , 
Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res = res + fabs( MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]) ); printf("# |x-y|_1 = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE CSR: ok\n"); else printf("# tester spmv cuSPARSE CSR: failed\n"); magma_z_vfree( &hcheck, queue ); magma_z_vfree( &dy, queue ); magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); cusparseZcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA, dA.dval, dA.drow, dA.dcol, hybA, 0, CUSPARSE_HYB_PARTITION_AUTO); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseZhybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA, hybA, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res = res + fabs( MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]) ); printf("# |x-y|_1 = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE HYB: ok\n"); else printf("# tester spmv cuSPARSE HYB: failed\n"); magma_z_vfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroyMatDescr( descrA ); cusparseDestroyHybMat( hybA ); cusparseDestroy( cusparseHandle ); magma_z_mfree(&dA, queue ); printf("\n\n"); // free CPU memory magma_z_mfree(&hA, queue ); magma_z_vfree(&hx, queue ); magma_z_vfree(&hy, queue ); magma_z_vfree(&hrefvec, queue ); // free GPU memory magma_z_vfree(&dx, queue ); magma_z_vfree(&dy, queue ); i++; } magma_queue_destroy( queue ); TESTING_FINALIZE(); return 0; }
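/* ----------------------------------------------------------------------------
   Usage sketch (illustration only, not part of the tester): the bare CSR
   SpMV sequence that the reference timing above wraps in a loop. All
   routines and signatures are the ones already used in this file; the
   function name example_zspmv is hypothetical, and error checking is
   omitted. Assumes magma.h / magmasparse.h are included.
   ---------------------------------------------------------------------- */
static void example_zspmv( const char *matrix_file )
{
    magma_queue_t queue;
    magma_queue_create( &queue );

    magma_z_sparse_matrix hA, dA;
    magma_z_vector dx, dy;
    magmaDoubleComplex c_one  = MAGMA_Z_MAKE(1.0, 0.0);
    magmaDoubleComplex c_zero = MAGMA_Z_MAKE(0.0, 0.0);

    magma_z_csr_mtx( &hA, matrix_file, queue );                  // read CSR matrix from file
    magma_z_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );   // copy it to the GPU
    magma_z_vinit( &dx, Magma_DEV, hA.num_rows, c_one,  queue ); // x = ones
    magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // y = 0

    magma_z_spmv( c_one, dA, dx, c_zero, dy, queue );            // y = 1*A*x + 0*y

    magma_z_mfree( &dA, queue );  magma_z_mfree( &hA, queue );
    magma_z_vfree( &dx, queue );  magma_z_vfree( &dy, queue );
    magma_queue_destroy( queue );
}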
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; magmaDoubleComplex *A, *B, *C, *C2, *LU; magmaDoubleComplex *dA, *dB, *dC1, *dC2; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_zmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_zmalloc( &dA, size ); assert( err == 0 ); err = magma_zmalloc( &dB, size ); assert( err == 0 ); err = magma_zmalloc( &dC1, size ); assert( err == 0 ); err = magma_zmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); lapackf77_zlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test ZSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetmatrix( m, n, A, ld, dB, ld ); magma_zswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_zswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasZaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_zgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; printf( "zswap diff %.2g\n", error ); } else { printf( "zswap skipped for n < 3\n" ); } // ----- test IZAMAX // get argmax of column of A magma_zsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_izamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIzamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "izamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test ZGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetvector( maxn, B, 1, dB, 1 ); magma_zsetvector( maxn, C, 1, dC1, 1 ); magma_zsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemv(
trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasZaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMV( m, n ) / 1e9; printf( "zgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZHEMV // c = alpha*A*b + beta*c, with A m*m Hermitian; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetvector( m, B, 1, dB, 1 ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMV( m ) / 1e9; printf( "zhemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_zsetmatrix( m, m, LU, ld, dA, ld ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test ZGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_zsetmatrix( (nta ? m : k), (nta ? k : m), A, ld, dA, ld ); magma_zsetmatrix( (ntb ? k : n), (ntb ?
n : k), B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMM( m, n, k ) / 1e9; printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHEMM // C = alpha*A*B + beta*C (left) with A m*m Hermitian; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n Hermitian; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetmatrix( m, n, B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9; printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHERK // C = alpha*A*A^H + beta*C (no-trans) with A n*k and C n*n Hermitian; or // C = alpha*A^H*A + beta*C (trans) with A k*n and C n*n Hermitian // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_zsetmatrix( (nt ? n : k), (nt ? k : n), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHERK( k, n ) / 1e9; printf( "zherk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n Hermitian; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n Hermitian // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_zsetmatrix( (nt ? n : k), (nt ?
n : k), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHER2K( k, n ) / 1e9; printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9; printf( "ztrmm( %c, %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test ZTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9; printf( "ztrsm( %c, %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0.
) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
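/* ----------------------------------------------------------------------------
   Illustration (hypothetical helper, not in the original tester): the
   check repeated throughout above, factored out. It overwrites dC2 with
   the difference of the MAGMA and CUBLAS results on the GPU and returns
   the Frobenius norm of that difference; since the wrappers dispatch to
   the same kernels, the norm should be exactly zero. C2 is a host
   scratch buffer with leading dimension ld and at least n columns.
   ---------------------------------------------------------------------- */
static double zdiff_norm( magma_int_t m, magma_int_t n, magma_int_t ld,
                          magmaDoubleComplex *dC1, magmaDoubleComplex *dC2,
                          magmaDoubleComplex *C2, double *work )
{
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );   // dC2 -= dC1, on the GPU
    magma_zgetmatrix( m, n, dC2, ld, C2, ld );        // copy the difference to the host
    return lapackf77_zlange( "F", &m, &n, C2, &ld, work );
}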
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgemv_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; double magma_error, magma_err, Ynorm, work[1]; magma_int_t M, N, Xm, Ym, lda, ldda; magma_int_t sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magmaDoubleComplex *h_A, *h_X, *h_Y, *h_Ymagma; magmaDoubleComplex *d_A, *d_X, *d_Y; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.48, 0.38 ); magmaDoubleComplex **A_array = NULL; magmaDoubleComplex **X_array = NULL; magmaDoubleComplex **Y_array = NULL; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; opts.lapack |= opts.check; //double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_ZGEMV( M, N ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N*batchCount; sizeX = incx*Xm*batchCount; sizeY = incy*Ym*batchCount; ldda = ((lda+31)/32)*32; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( h_Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( h_Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( d_Y, magmaDoubleComplex, sizeY ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&X_array, batchCount * sizeof(*X_array)); magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, h_X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, h_Y ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( M, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetvector( Xm*batchCount, h_X, incx, d_X, incx ); magma_zsetvector( Ym*batchCount, h_Y, incy, d_Y, incy ); zset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream); zset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream); zset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ); magmablas_zgemv_batched(opts.transA, M, N, alpha, A_array, ldda, X_array, incx, beta, Y_array, incy, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++)
{ blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, h_A + i*lda*N, &lda, h_X + i*Xm, &incx, &beta, h_Y + i*Ym, &incy ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute the relative error of magma with respect to lapack, // |Y_magma - Y_lapack| / |Y_lapack|, per batch entry (each Y has length Ym) magma_error = 0.0; for(int s=0; s<batchCount; s++) { Ynorm = lapackf77_zlange( "M", &Ym, &ione, h_Y + s*Ym, &Ym, work ); blasf77_zaxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione ); magma_err = lapackf77_zlange( "M", &Ym, &ione, h_Ymagma + s*Ym, &Ym, work ) / Ynorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); } printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e \n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error); } else { printf("%10d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Y ); TESTING_FREE_CPU( h_Ymagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_Y ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( X_array ); TESTING_FREE_DEV( Y_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
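/* ----------------------------------------------------------------------------
   Illustration (hypothetical helper): a host-side model of what
   zset_pointer, as called above, computes in device memory -- inferred
   from its use in this tester, not taken verbatim from the MAGMA sources.
   Entry i points at element (row, column) of the i-th matrix in one
   contiguous batch allocation, so zset_pointer(A_array, d_A, ldda, 0, 0,
   ldda*N, batchCount, queue) yields A_array[i] = d_A + i*ldda*N.
   ---------------------------------------------------------------------- */
static void zset_pointer_host_model(
    magmaDoubleComplex **A_array, magmaDoubleComplex *d_A,
    magma_int_t lda, magma_int_t row, magma_int_t column,
    magma_int_t batchOffset, magma_int_t batchCount )
{
    for( magma_int_t i = 0; i < batchCount; ++i )
        A_array[i] = d_A + row + column*lda + i*batchOffset;
}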
/** Purpose ------- ZHEEVX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array, dimension (LDDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[in] abstol DOUBLE PRECISION The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ), \n where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. \n Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*DLAMCH('S'), not zero. If this routine returns with INFO > 0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*DLAMCH('S'). \n See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) On normal exit, the first M elements contain the selected eigenvalues in ascending order. @param[out] dZ COMPLEX_16 array, dimension (LDDZ, max(1,M)) If JOBZ = MagmaVec, then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. If JOBZ = MagmaNoVec, then Z is not referenced. 
Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = MagmaRangeV, the exact value of M is not known in advance and an upper bound must be used. \n (workspace) If FAST_HEMV is defined, DZ must have dimension (LDDZ, max(1,N)) in both cases, since it is then also used as workspace. @param[in] lddz INTEGER The leading dimension of the array DZ. LDDZ >= 1, and if JOBZ = MagmaVec, LDDZ >= max(1,N). @param wA (workspace) COMPLEX_16 array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param wZ (workspace) COMPLEX_16 array, dimension (LDWZ, max(1,M)) @param[in] ldwz INTEGER The leading dimension of the array wZ. LDWZ >= 1, and if JOBZ = MagmaVec, LDWZ >= max(1,N). @param[out] work (workspace) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (NB+1)*N, where NB is the max of the blocksize for ZHETRD. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) DOUBLE PRECISION array, dimension (7*N) @param iwork (workspace) INTEGER array, dimension (5*N) @param[out] ifail INTEGER array, dimension (N) If JOBZ = MagmaVec, then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = MagmaNoVec, then IFAIL is not referenced. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, then i eigenvectors failed to converge. Their indices are stored in array IFAIL. @ingroup magma_zheev_driver ********************************************************************/ extern "C" magma_int_t magma_zheevx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, double vl, double vu, magma_int_t il, magma_int_t iu, double abstol, magma_int_t *m, double *w, magmaDoubleComplex *dZ, magma_int_t lddz, magmaDoubleComplex *wA, magma_int_t ldwa, magmaDoubleComplex *wZ, magma_int_t ldwz, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); const char* range_ = lapack_range_const( range ); magma_int_t ione = 1; const char* order_; magma_int_t indd, inde; magma_int_t imax; magma_int_t lopt, itmp1, indee; magma_int_t lower, wantz; magma_int_t i, j, jj, i__1; magma_int_t alleig, valeig, indeig; magma_int_t iscale, indibl; magma_int_t indiwk, indisp, indtau; magma_int_t indrwk, indwrk; magma_int_t llwork, nsplit; magma_int_t lquery; magma_int_t iinfo; double safmin; double bignum; double smlnum; double eps, tmp1; double anrm; double sigma, d__1; double rmin, rmax; double *dwork; /* Function Body */ lower = (uplo == MagmaLower); wantz = (jobz == MagmaVec); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (!
(lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (lddz < 1 || (wantz && lddz < n)) { *info = -15; } else if (ldwa < max(1,n)) { *info = -17; } else if (ldwz < 1 || (wantz && ldwz < n)) { *info = -19; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_zhetrd_nb(n); lopt = n * (nb + 1); work[0] = MAGMA_Z_MAKE( lopt, 0 ); if (lwork < lopt && ! lquery) { *info = -21; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } *m = 0; /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif magmaDoubleComplex *a; magma_zmalloc_cpu( &a, n*n ); magma_zgetmatrix(n, n, dA, ldda, a, n); lapackf77_zheevx(jobz_, range_, uplo_, &n, a, &n, &vl, &vu, &il, &iu, &abstol, m, w, wZ, &ldwz, work, &lwork, rwork, iwork, ifail, info); magma_zsetmatrix( n, n, a, n, dA, ldda); magma_zsetmatrix( n, *m, wZ, ldwz, dZ, lddz); magma_free_cpu(a); return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n )) { fprintf (stderr, "!!!! device memory allocation error (magma_zheevx_gpu)\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } --w; --work; --rwork; --iwork; --ifail; /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_zlanhe(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { d__1 = 1.; magmablas_zlascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); if (abstol > 0.) { abstol *= sigma; } if (valeig) { vl *= sigma; vu *= sigma; } } /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */ indd = 1; inde = indd + n; indrwk = inde + n; indtau = 1; indwrk = indtau + n; llwork = lwork - indwrk + 1; #ifdef FAST_HEMV magma_zhetrd2_gpu(uplo, n, dA, ldda, &rwork[indd], &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dZ, lddz*n, &iinfo); #else magma_zhetrd_gpu (uplo, n, dA, ldda, &rwork[indd], &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif lopt = n + (magma_int_t)MAGMA_Z_REAL(work[indwrk]); /* If all eigenvalues are desired and ABSTOL is less than or equal to zero, then call DSTERF or ZUNGTR and ZSTEQR. If this fails for some eigenvalue, then try DSTEBZ. */ if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) { blasf77_dcopy(&n, &rwork[indd], &ione, &w[1], &ione); indee = indrwk + 2*n; if (! 
wantz) { i__1 = n - 1; blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_dsterf(&n, &w[1], &rwork[indee], info); } else { lapackf77_zlacpy("A", &n, &n, wA, &ldwa, wZ, &ldwz); lapackf77_zungtr(uplo_, &n, wZ, &ldwz, &work[indtau], &work[indwrk], &llwork, &iinfo); i__1 = n - 1; blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_zsteqr(jobz_, &n, &w[1], &rwork[indee], wZ, &ldwz, &rwork[indrwk], info); if (*info == 0) { for (i = 1; i <= n; ++i) { ifail[i] = 0; } magma_zsetmatrix( n, n, wZ, ldwz, dZ, lddz ); } } if (*info == 0) { *m = n; } } /* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. */ if (*m == 0) { *info = 0; if (wantz) { order_ = "B"; } else { order_ = "E"; } indibl = 1; indisp = indibl + n; indiwk = indisp + n; lapackf77_dstebz(range_, order_, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m, &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info); if (wantz) { lapackf77_zstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp], wZ, &ldwz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info); magma_zsetmatrix( n, *m, wZ, ldwz, dZ, lddz ); /* Apply unitary matrix used in reduction to tridiagonal form to eigenvectors returned by ZSTEIN. */ magma_zunmtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dZ, lddz, wA, ldwa, &iinfo); } } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = *m; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal(&imax, &d__1, &w[1], &ione); } /* If eigenvalues are not in order, then sort them, along with eigenvectors. */ if (wantz) { for (j = 1; j <= *m-1; ++j) { i = 0; tmp1 = w[j]; for (jj = j + 1; jj <= *m; ++jj) { if (w[jj] < tmp1) { i = jj; tmp1 = w[jj]; } } if (i != 0) { itmp1 = iwork[indibl + i - 1]; w[i] = w[j]; iwork[indibl + i - 1] = iwork[indibl + j - 1]; w[j] = tmp1; iwork[indibl + j - 1] = itmp1; magma_zswap(n, dZ + (i-1)*lddz, ione, dZ + (j-1)*lddz, ione); if (*info != 0) { itmp1 = ifail[i]; ifail[i] = ifail[j]; ifail[j] = itmp1; } } } } /* Set WORK[0] to optimal complex workspace size. */ work[1] = MAGMA_Z_MAKE( lopt, 0 ); return *info; } /* magma_zheevx_gpu */
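/* ----------------------------------------------------------------------------
   Usage sketch (hypothetical, for illustration): computing the iu smallest
   eigenpairs of a Hermitian matrix already resident on the GPU, using the
   LWORK = -1 workspace query documented above. Workspace sizes follow the
   Arguments section (rwork 7*N, iwork 5*N, ifail N); error checking is
   omitted.
   ---------------------------------------------------------------------- */
static magma_int_t example_zheevx_gpu(
    magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda,
    magmaDoubleComplex *dZ, magma_int_t lddz )
{
    magma_int_t m, info, il = 1, iu = min( 10, n );
    double vl = 0., vu = 0.;                      // not referenced for MagmaRangeI
    double abstol = 2. * lapackf77_dlamch("S");   // tolerance recommended above
    double *w, *rwork;
    magma_int_t *iwork, *ifail;
    magmaDoubleComplex *wA, *wZ, *work, query;

    magma_dmalloc_cpu( &w,     n   );
    magma_dmalloc_cpu( &rwork, 7*n );
    magma_imalloc_cpu( &iwork, 5*n );
    magma_imalloc_cpu( &ifail, n   );
    magma_zmalloc_cpu( &wA,    n*n );
    magma_zmalloc_cpu( &wZ,    n*n );

    // workspace query: lwork = -1 returns the optimal size in query
    magma_zheevx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                      vl, vu, il, iu, abstol, &m, w, dZ, lddz,
                      wA, n, wZ, n, &query, -1, rwork, iwork, ifail, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magma_zmalloc_cpu( &work, lwork );

    magma_zheevx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                      vl, vu, il, iu, abstol, &m, w, dZ, lddz,
                      wA, n, wZ, n, work, lwork, rwork, iwork, ifail, &info );
    // on success: w[0..m-1] holds the eigenvalues, dZ the eigenvectors

    magma_free_cpu( work );  magma_free_cpu( wZ );    magma_free_cpu( wA );
    magma_free_cpu( ifail ); magma_free_cpu( iwork ); magma_free_cpu( rwork );
    magma_free_cpu( w );
    return info;
}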
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaDoubleComplex *d_A, *d_B; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. 
*/ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasZtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex)); magmaDoubleComplex alpha2 = MAGMA_Z_DIV( c_one, alpha ); blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); double norm1 = lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work ); double normx = lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work ); double normA = lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex)); blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! 
(magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
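/* ----------------------------------------------------------------------------
   Illustration (hypothetical helper): the residual check used above,
   factored out. Given a computed solution X of A*X = alpha*B, it forms
   (1/alpha)*A*X - B with a triangular multiply and measures
   ||(1/alpha)*A*X - B|| / (||A|| * ||X||) in the max norm. h_X must hold
   a scratch copy of the solution and is overwritten.
   ---------------------------------------------------------------------- */
static double ztrsm_residual(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag,
    magma_int_t M, magma_int_t N, magmaDoubleComplex alpha,
    magmaDoubleComplex *h_A, magma_int_t lda, magma_int_t Ak,
    magmaDoubleComplex *h_B, magmaDoubleComplex *h_X, magma_int_t ldb,
    double *work )
{
    magma_int_t ione = 1, sizeB = ldb*N;
    magmaDoubleComplex c_one = MAGMA_Z_ONE, c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha2 = MAGMA_Z_DIV( c_one, alpha );
    double normx = lapackf77_zlange( "M", &M, &N, h_X, &ldb, work );
    blasf77_ztrmm( lapack_side_const(side), lapack_uplo_const(uplo),
                   lapack_trans_const(transA), lapack_diag_const(diag),
                   &M, &N, &alpha2, h_A, &lda, h_X, &ldb );      // h_X = (1/alpha)*A*X
    blasf77_zaxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); // h_X -= B
    double norm1 = lapackf77_zlange( "M", &M, &N, h_X, &ldb, work );
    double normA = lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
    return norm1 / (normx * normA);
}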
/** Purpose ------- ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_zhetrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_zheev_driver ********************************************************************/ extern "C" magma_int_t magma_zheevd( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *w, magmaDoubleComplex *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; magma_int_t inde; double anrm; magma_int_t imax; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; double smlnum; magma_int_t lquery; double* dwork; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_zhetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0 ); rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((lrwork < lrwmin) && ! lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL( A[0] ); if (wantz) { A[0] = MAGMA_Z_ONE; } return *info; } /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { lapackf77_zheevd( jobz_, uplo_, &n, A, &lda, w, work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, info ); return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt( smlnum ); rmax = magma_dsqrt( bignum ); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe( "M", uplo_, &n, A, &lda, rwork ); iscale = 0; if (anrm > 0. 
&& anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl( uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info ); } /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */ // zhetrd rwork: e (n) // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2) ==> 1 + 5n + 2n^2 inde = 0; indrwk = inde + n; llrwk = lrwork - indrwk; // zhetrd work: tau (n) + llwork (n*nb) ==> n + n*nb // zstedx work: tau (n) + z (n^2) // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb) ==> 2n + n^2, or n + n*nb + n^2 indtau = 0; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); magma_zhetrd( uplo, n, A, lda, w, &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo ); timer_stop( time ); timer_printf( "time zhetrd = %6.2f\n", time ); /* For eigenvalues only, call DSTERF. For eigenvectors, first call * ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the * tridiagonal matrix, then call ZUNMTR to multiply it to the Householder * transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf( &n, w, &rwork[inde], info ); } else { timer_start( time ); if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info ); magma_free( dwork ); timer_stop( time ); timer_printf( "time zstedx = %6.2f\n", time ); timer_start( time ); magma_zunmtr( MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, &iinfo ); lapackf77_zlacpy( "A", &n, &n, &work[indwrk], &n, A, &lda ); timer_stop( time ); timer_printf( "time zunmtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal( &imax, &d__1, w, &ione ); } work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0 ); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; return *info; } /* magma_zheevd */
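/* ----------------------------------------------------------------------------
   Usage sketch (hypothetical, for illustration): the combined
   LWORK = LRWORK = LIWORK = -1 workspace query documented above, followed
   by the actual eigendecomposition. The rwork/lrwork arguments appear
   because this is a complex precision. On success, w holds the eigenvalues
   in ascending order and A is overwritten with the eigenvectors. Error
   checking is omitted.
   ---------------------------------------------------------------------- */
static magma_int_t example_zheevd(
    magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *w )
{
    magma_int_t info, iwkopt;
    magmaDoubleComplex zwkopt;
    double rwkopt;

    // query the optimal workspace sizes
    magma_zheevd( MagmaVec, MagmaLower, n, A, lda, w,
                  &zwkopt, -1, &rwkopt, -1, &iwkopt, -1, &info );
    magma_int_t lwork  = (magma_int_t) MAGMA_Z_REAL( zwkopt );
    magma_int_t lrwork = (magma_int_t) rwkopt;
    magma_int_t liwork = iwkopt;

    magmaDoubleComplex *work;  double *rwork;  magma_int_t *iwork;
    magma_zmalloc_cpu( &work,  lwork  );
    magma_dmalloc_cpu( &rwork, lrwork );
    magma_imalloc_cpu( &iwork, liwork );

    magma_zheevd( MagmaVec, MagmaLower, n, A, lda, w,
                  work, lwork, rwork, lrwork, iwork, liwork, &info );

    magma_free_cpu( iwork );  magma_free_cpu( rwork );  magma_free_cpu( work );
    return info;
}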
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs over which the matrix is distributed. num_gpus > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX_16 array of pointers on the GPU, dimension (num_gpus) On entry, the Hermitian matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalesced memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_mgpu_right(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (num_streams > 1 ? 1+((i)/nb)%(num_streams-1) : 0) magmaDoubleComplex z_one = MAGMA_Z_MAKE( 1.0, 0.0 ); magmaDoubleComplex mz_one = MAGMA_Z_MAKE( -1.0, 0.0 ); double one = 1.0; double m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows, num_streams = 5; magmaDoubleComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; magmaDoubleComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, ngpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t stream[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_zpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code.
magma_zgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_zpotrf( uplo_, &n, panel, &ldpanel, info); magma_zsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < num_gpus; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / num_gpus) * nb; if (d < (n / nb) % num_gpus) n_local[d] += nb; else if (d == (n / nb) % num_gpus) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_zmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < num_streams; j++ ) { magma_queue_create( &stream[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported yet ===\n" ); } else { blkid = -1; if (num_gpus == 4) crosspoint = n; else if (num_gpus == 3) crosspoint = n; else if (num_gpus == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; // n --> gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = get_time(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_zgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, stream[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = get_time()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % num_gpus; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( stream[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = get_time(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ?
tmppanel0 : tmppanel1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = get_time() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = get_time(); #endif lapackf77_zpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = get_time() - tcchol; ttot_cchol += tcchol; tctrsm = get_time(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_ztrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = get_time() - tctrsm; ttot_ctrsm += tctrsm; tctm = get_time(); #endif d = (id + 1) % num_gpus; // send current panel to gpus for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_zsetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, stream[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % num_gpus; for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } #if defined (ENABLE_TIMER) tctm = get_time() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % num_gpus; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * num_gpus) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = get_time(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream(stream[d][STREAM_ID(j_local2)]); #define ZHERK_ON_DIAG #ifdef ZHERK_ON_DIAG magma_zherk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = get_time() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( stream[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_zgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, stream[d][0] ); tmppanel = (blkid % 2 == 0) ? 
tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < num_gpus; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) therk[d] = get_time(); #endif //magmablasSetKernelStream(stream[d]); //magma_zherk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef ZHERK_ON_DIAG magma_zherk_mgpu #else magma_zherk_mgpu2 #endif (num_gpus, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, num_streams, stream ); #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = get_time() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; prevj = j; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % num_gpus, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( id=0; id < num_streams; id++ ) { magma_queue_sync( stream[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % num_gpus; // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = get_time(); #endif magma_zgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_zpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_zsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = get_time() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < num_streams; j++ ) { magma_queue_destroy( 
stream[d][j] ); } magma_free( d_lP[d] ); } } // end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zpotrf_mgpu_right */
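/*
    Editor's usage sketch (not part of the library): one plausible way to call
    magma_zpotrf_mgpu_right. The matrix is distributed over ngpu devices in the
    1-D block-cyclic column layout that the routine above assumes; the local
    column count uses the same formula as n_local above. The distribution
    helpers and the signature of magma_zpotrf_mgpu_right follow MAGMA's usual
    conventions, but should be checked against the headers of your release.
*/
static magma_int_t example_zpotrf_mgpu_right( magma_int_t ngpu, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda )
{
    magma_int_t info = 0;
    magma_int_t nb   = magma_get_zpotrf_nb( n );
    magma_int_t ldda = ((n+31)/32)*32;           // padded leading dimension, as in the routine above
    magmaDoubleComplex *d_lA[MagmaMaxGPUs];

    for( magma_int_t d = 0; d < ngpu; ++d ) {
        // columns owned by device d in the block-cyclic layout
        magma_int_t nlocal = ((n/nb)/ngpu)*nb;
        if      ( d <  (n/nb) % ngpu ) nlocal += nb;
        else if ( d == (n/nb) % ngpu ) nlocal += n % nb;
        magma_setdevice( d );
        if ( MAGMA_SUCCESS != magma_zmalloc( &d_lA[d], nlocal*ldda )) {
            return MAGMA_ERR_DEVICE_ALLOC;       // cleanup of earlier allocations omitted for brevity
        }
    }
    magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, d_lA, ldda, ngpu, nb );
    magma_zpotrf_mgpu_right( ngpu, MagmaLower, n, d_lA, ldda, &info );
    magma_zgetmatrix_1D_col_bcyclic( n, n, d_lA, ldda, A, lda, ngpu, nb );
    for( magma_int_t d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( d_lA[d] );
    }
    return info;
}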
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= M. @param[in] tau COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. @param[in] dT COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrs_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex_ptr dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && !
lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_zunmqr_gpu, hwork contains // the last block of A and B (i.e., C in zunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ztrsm and drop the zsetmatrix. if ( nrhs == 1 ) { blasf77_ztrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ztrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >= 0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_zgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_zcopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
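/*
    Editor's sketch of the intended call sequence: factor on the GPU with
    magma_zgeqrf_gpu, then solve min || A*X - B || with magma_zgeqrs_gpu.
    The dT and hwork sizes follow the formulas documented above (the dT
    size mirrors the computation in the magma_zgels_gpu driver later in
    this file); the wrapper name is hypothetical and error handling is
    simplified.
*/
static magma_int_t example_zgeqrs( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb )
{
    magma_int_t info  = 0;
    magma_int_t k     = min( m, n );
    magma_int_t nb    = magma_get_zgeqrf_nb( m );
    magma_int_t lwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    magma_int_t ldt   = ( 2*k + ((n+31)/32)*32 ) * max( nb, nrhs );   // dT layout documented above
    magmaDoubleComplex_ptr dT = NULL;
    magmaDoubleComplex *tau = NULL, *hwork = NULL;

    if ( MAGMA_SUCCESS != magma_zmalloc( &dT, ldt ) ||
         MAGMA_SUCCESS != magma_zmalloc_cpu( &tau, k ) ||
         MAGMA_SUCCESS != magma_zmalloc_cpu( &hwork, lwork )) {
        magma_free( dT );
        magma_free_cpu( tau );
        magma_free_cpu( hwork );
        return MAGMA_ERR_HOST_ALLOC;             // simplified; distinguish host/device in real code
    }

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    if ( info == 0 )                             // solution X overwrites the first n rows of dB
        magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, &info );

    magma_free( dT );
    magma_free_cpu( tau );
    magma_free_cpu( hwork );
    return info;
}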
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, sizeA ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_zgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
magma_error = lapackf77_zlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- ZUNMLQ overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k)**H . . . H(2)**H H(1)**H as returned by ZGELQF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,M) if SIDE = MagmaLeft, (LDA,N) if SIDE = MagmaRight. The i-th row must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGELQF in the first k rows of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,K). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGELQF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgelqf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmlq( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define dV(i_,j_) (dV + (i_) + (j_)*ib) #define dT(i_,j_) (dT + (i_) + (j_)*ib) #define dwork(i_) (dwork + (i_)) magmaDoubleComplex *T, *T2; magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; magma_trans_t transt; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (!
left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,k)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_zgelqf_nb( min( m, n )); lwkopt = max(1,nw)*nb; work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_Z_ONE; return *info; } ldwork = nw; if (nb >= k) { /* Use CPU code */ lapackf77_zunmlq( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = ((m+31)/32)*32; magmaDoubleComplex_ptr dwork, dV, dT, dC; magma_zmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_zmalloc_cpu( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, C, ldc, dC(0,0), lddc ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb)*nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } if (notran) { transt = Magma_ConjTrans; } else { transt = MagmaNoTrans; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i + 1) . . . H(i + ib-1) */ nq_i = nq - i; lapackf77_zlarft("Forward", "Rowwise", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) set upper triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 ); magma_zsetmatrix( ib, nq_i, A(i,i), lda, dV(0,0), ib ); zq_to_panel( MagmaLower, ib, A(i,i), lda, T2 ); if (left) { /* H or H**H is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H**H is applied to C(1:m,i:n) */ ni = n - i; jc = i; } /* Apply H or H**H; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dT(0,0), ib ); magma_zlarfb_gpu( side, transt, MagmaForward, MagmaRowwise, mi, ni, ib, dV(0,0), ib, dT(0,0), ib, dC(ic,jc), lddc, dwork(0), ldwork ); } magma_zgetmatrix( m, n, dC(0,0), lddc, C, ldc ); magma_free( dwork ); magma_free_cpu( T ); } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); return *info; } /* magma_zunmlq */
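/*
    Editor's usage sketch: compute an LQ factorization of a k-by-n matrix
    (k <= n assumed) with magma_zgelqf, then apply Q**H from the left to an
    n-by-nrhs matrix C via the routine above. The workspace size uses the
    optimal-performance values quoted in the documentation. The helper name
    is hypothetical, and the magma_zgelqf signature is an assumption to be
    checked against your release.
*/
static magma_int_t example_zunmlq( magma_int_t k, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *C, magma_int_t ldc )
{
    magma_int_t info  = 0;
    magma_int_t nb    = magma_get_zgelqf_nb( min( k, n ));
    magma_int_t lwork = max( k, nrhs ) * nb;     // covers ZGELQF (k*nb) and left-side ZUNMLQ (nrhs*nb)
    magmaDoubleComplex *tau = NULL, *work = NULL;

    if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &tau, min( k, n )) ||
         MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork )) {
        magma_free_cpu( tau );
        magma_free_cpu( work );
        return MAGMA_ERR_HOST_ALLOC;
    }

    magma_zgelqf( k, n, A, lda, tau, work, lwork, &info );
    if ( info == 0 )                             // C := Q**H * C, with Q of order n and k reflectors
        magma_zunmlq( MagmaLeft, Magma_ConjTrans, n, nrhs, k, A, lda, tau, C, ldc, work, lwork, &info );

    magma_free_cpu( tau );
    magma_free_cpu( work );
    return info;
}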
/** Purpose ------- Solves the overdetermined least squares problem min || A*X - C || using the QR factorization of A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= M. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zgels_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaDoubleComplex_ptr dT; magmaDoubleComplex *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, TRANS = MagmaNoTrans is the only case handled */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handled for now */ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
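/*
    Editor's sketch of the LWORK = -1 workspace-query idiom documented above:
    query the optimal hwork size, allocate it, then solve. The wrapper name
    is hypothetical.
*/
static magma_int_t example_zgels_query( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb )
{
    magma_int_t info = 0;
    magmaDoubleComplex query;
    // workspace query: only query is set; no factorization is performed
    magma_zgels_gpu( MagmaNoTrans, m, n, nrhs, dA, ldda, dB, lddb, &query, -1, &info );

    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magmaDoubleComplex *hwork = NULL;
    if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &hwork, lwork ))
        return MAGMA_ERR_HOST_ALLOC;

    magma_zgels_gpu( MagmaNoTrans, m, n, nrhs, dA, ldda, dB, lddb, hwork, lwork, &info );
    magma_free_cpu( hwork );
    return info;
}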
/** Purpose ------- ZLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, ZLATRD2 reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, ZLATRD2 reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] e DOUBLE PRECISION array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i).
The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_aux ********************************************************************/ extern "C" magma_int_t magma_zlatrd2(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *W, magma_int_t ldw, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dW, magma_int_t lddw, magmaDoubleComplex *dwork, magma_int_t ldwork) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_zmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside zlatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_zlacgv(&i_n, A(i, i+1), &lda); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, A(i, i+1), &lda); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_zsetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_zhemv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_zhemv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3.
Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); lapackf77_zlacgv(&i, A(i, 0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_zsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_zhemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_zhemv_work(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* magma_zlatrd */
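/*
    Editor's sketch (not the actual ZHETRD2_GPU code) of how a caller uses the
    panel produced by this routine: after zlatrd2 reduces the first nb columns
    (lower case), the unreduced trailing submatrix is updated with the
    Hermitian rank-2k form A := A - V*W' - W*V' described above, where
    V = A(nb:n, 0:nb) holds the reflectors and W is the returned matrix.
*/
static void example_trailing_update( magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *W, magma_int_t ldw )
{
    magma_int_t i_n = n - nb;                    // order of the unreduced trailing submatrix
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double d_one = 1.0;
    // A(nb:n, nb:n) := A(nb:n, nb:n) - V*W' - W*V'  (lower triangle only)
    blasf77_zher2k( MagmaLowerStr, MagmaNoTransStr, &i_n, &nb,
                    &c_neg_one, A + nb,          &lda,
                                W + nb,          &ldw,
                    &d_one,     A + nb + nb*lda, &lda );
}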
int main(int argc, char **argv) { TESTING_INIT(); const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Yatomics, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_zlaset( "Lower", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[1], &lda ); } else { lapackf77_zlaset( "Upper", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[lda], &lda ); } lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zhemv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== 
Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_zsetvector( N, Y, incy, dY, incy ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_zgetvector( N, dY, incy, Yatomics, incy ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_zlange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- running magma_zjacobi */ int main( int argc, char** argv) { TESTING_INIT(); magma_z_solver_par solver_par; magma_z_preconditioner precond_par; solver_par.maxiter = 1000; solver_par.verbose = 0; solver_par.num_eigenvalues = 0; int format = 0; magma_z_sparse_matrix A, B, B_d; magma_z_vector x, b; B.blocksize = 8; B.alignment = 8; int scale = 0; magma_scale_t scaling = Magma_NOSCALE; magmaDoubleComplex one = MAGMA_Z_MAKE(1.0, 0.0); magmaDoubleComplex zero = MAGMA_Z_MAKE(0.0, 0.0); B.storage_type = Magma_CSR; int i; for( i = 1; i < argc; ++i ) { if ( strcmp("--format", argv[i]) == 0 ) { format = atoi( argv[++i] ); switch( format ) { case 0: B.storage_type = Magma_CSR; break; case 1: B.storage_type = Magma_ELL; break; case 2: B.storage_type = Magma_ELLRT; break; case 3: B.storage_type = Magma_SELLP; break; } }else if ( strcmp("--mscale", argv[i]) == 0 ) { scale = atoi( argv[++i] ); switch( scale ) { case 0: scaling = Magma_NOSCALE; break; case 1: scaling = Magma_UNITDIAG; break; case 2: scaling = Magma_UNITROW; break; } }else if ( strcmp("--blocksize", argv[i]) == 0 ) { B.blocksize = atoi( argv[++i] ); }else if ( strcmp("--alignment", argv[i]) == 0 ) { B.alignment = atoi( argv[++i] ); } else if ( strcmp("--maxiter", argv[i]) == 0 ){ // && i+1< argc ) { solver_par.maxiter = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_zjacobi" " [ --format %d (0=CSR, 1=ELL 2=ELLRT, 3=SELLP)" " [ --blocksize %d --alignment %d ]" " --mscale %d (0=no, 1=unitdiag, 2=unitrownrm)" " --maxiter %d ]" " matrices \n\n", format, (int) B.blocksize, (int) B.alignment, (int) scale, (int) solver_par.maxiter); magma_zsolverinfo_init( &solver_par, &precond_par ); while( i < argc ){ magma_z_csr_mtx( &A, argv[i] ); printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) A.num_rows,(int) A.num_cols,(int) A.nnz ); // scale matrix magma_zmscale( &A, scaling ); magma_z_mconvert( A, &B, Magma_CSR, B.storage_type ); magma_z_mtransfer( B, &B_d, Magma_CPU, Magma_DEV ); // vectors and initial guess magma_z_vinit( &b, Magma_DEV, A.num_cols, one ); magma_z_vinit( &x, Magma_DEV, A.num_cols, one ); magma_z_spmv( one, B_d, x, zero, b ); // b = A x magma_z_vfree(&x); magma_z_vinit( &x, Magma_DEV, A.num_cols, zero ); magma_zjacobi( B_d, b, &x, &solver_par ); magma_zsolverinfo( &solver_par, &precond_par ); magma_z_mfree(&B_d); magma_z_mfree(&B); magma_z_mfree(&A); magma_z_vfree(&x); magma_z_vfree(&b); i++; } magma_zsolverinfo_free( &solver_par, &precond_par ); TESTING_FINALIZE(); return 0; }