inline static void magma_clarfxsym_v2( magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ dtmp = magma_cblas_cdotc(n, work, ione, V, ione); /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
extern "C" void magma_clarfxsym( magma_int_t N, magmaFloatComplex *A, magma_int_t LDA, magmaFloatComplex *V, magmaFloatComplex *TAU) { magma_int_t IONE=1; magmaFloatComplex dtmp; magmaFloatComplex Z_ZERO = MAGMA_C_ZERO; //magmaFloatComplex Z_ONE = MAGMA_C_ONE; magmaFloatComplex Z_MONE = MAGMA_C_NEG_ONE; magmaFloatComplex Z_HALF = MAGMA_C_HALF; //magmaFloatComplex WORK[N]; magmaFloatComplex *WORK; magma_cmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_clarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_chemv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_cdotc(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_caxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
/** Purpose ------- CLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, CLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, CLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by CHETRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA TODO: dimension (ldda, n)? @param ldda TODO: ldda >= n? @param dW TODO: dimension (lddw, ??) @param lddw TODO: lddw >= n ?? @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_aux ********************************************************************/ extern "C" magma_int_t magma_clatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, float *e, magmaFloatComplex *tau, magmaFloatComplex *W, magma_int_t ldw, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dW, magma_int_t lddw, magma_queue_t queue ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) /* Constants */ const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magma_int_t ione = 1; /* Local variables */ magmaFloatComplex alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #ifdef COMPLEX lapackf77_clacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_cgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_clacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_cgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_clarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_C_REAL( alpha ); *A(i-1,i) = MAGMA_C_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_csetvector( i, A(0, i), 1, dA(0, i), 1, queue ); magma_chemv( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, queue ); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, queue ); if (i < n-1) { blasf77_cgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( queue ); if (i < n-1) { blasf77_cgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_cgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_cgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_cscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_cdotc( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_caxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #ifdef COMPLEX lapackf77_clacgv( &i, W(i, 0), &ldw ); #endif blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i, W(i, 0), &ldw ); lapackf77_clacgv( &i, A(i, 0), &lda ); #endif blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_clarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = MAGMA_C_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1, queue ); magma_chemv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, queue ); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, queue ); blasf77_cgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, work, &ione ); blasf77_cgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here is where we need it magma_queue_sync( queue ); if (i != 0) blasf77_caxpy( &i_n, &c_one, work, &ione, W(i+1, i), &ione ); blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_cscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_cdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_caxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } return info; } /* magma_clatrd */
extern "C" magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t stat = 0; // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); magma_int_t stat_cpu = 0, stat_dev = 0; // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace //magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; stat_cpu += magma_cmalloc_pinned( &H, (ldh+1)*ldh ); stat_cpu += magma_cmalloc_pinned( &y, ldh ); stat_cpu += magma_cmalloc_pinned( &HH, ldh*ldh ); stat_cpu += magma_cmalloc_pinned( &h1, ldh ); if( stat_cpu != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_HOST_ALLOC; } // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue ); q_t.memory_location = Magma_DEV; q_t.dval = NULL; q_t.num_rows = q_t.nnz = dofs; q_t.num_cols = 1; magmaFloatComplex *dy = NULL, *dH = NULL; stat_dev += magma_cmalloc( &dy, ldh ); stat_dev += magma_cmalloc( &dH, (ldh+1)*ldh ); if( stat_dev != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magma_free( dH ); magma_free( dy ); magma_free( dH ); magma_free( dy ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_DEVICE_ALLOC; } // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); //magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->dval, 1 ); // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ){ r0 = solver_par->epsilon; } if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { for(k=1; k<=restart; k++) { magma_ccopy(dofs, r.dval, 1, q(k-1), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(k,k-1), q(k-1), 1); // (to be fused) q_t.dval = q(k-1); //magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r, queue ); // r = A q[k] // if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.dval, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.dval, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = ||r|| /*} else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.dval, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1, queue ); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = sqrt(r . r) if ( k<solver_par->restart ) { magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif }*/ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { HH(k,i) = magma_cblas_cdotc( i+1, &H(1,k), 1, &H(1,i), 1 ); } h1[k] = H(1,k)*H(1,0); if (k != 1) { for (i=1; i<k; i++) { HH(k,i) = HH(k,i)/HH(i,i);// for (m=i+1; m<=k; m++) { HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i); } h1[k] -= h1[i] * HH(k,i); } } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); }/* Minimization done */ // compute solution approximation magma_csetmatrix(m, 1, y+1, m, dy, m ); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->dval, 1); // compute residual magma_c_spmv( c_mone, A, *x, c_zero, r, queue ); // r = - A * x magma_caxpy(dofs, c_one, b.dval, 1, r.dval, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r, queue ); magma_c_vfree(&q, queue ); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); //magmablasSetKernelStream(NULL); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_cgmres */
magma_int_t magma_clatrsd( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_bool_t normin, magma_int_t n, const magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex lambda, magmaFloatComplex *x, float *scale, float *cnorm, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) /* constants */ const magma_int_t ione = 1; const float d_half = 0.5; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; /* System generated locals */ magma_int_t len; magmaFloatComplex ztmp; /* Local variables */ magma_int_t i, j; float xj, rec, tjj; magma_int_t jinc; float xbnd; magma_int_t imax; float tmax; magmaFloatComplex tjjs; float xmax, grow; float tscal; magmaFloatComplex uscal; magma_int_t jlast; magmaFloatComplex csumj; float bignum; magma_int_t jfirst; float smlnum; /* Function Body */ *info = 0; magma_int_t upper = (uplo == MagmaUpper); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t nounit = (diag == MagmaNonUnit); /* Test the input parameters. */ if ( ! upper && uplo != MagmaLower ) { *info = -1; } else if (! notran && trans != MagmaTrans && trans != MagmaConjTrans) { *info = -2; } else if ( ! nounit && diag != MagmaUnit ) { *info = -3; } else if ( ! (normin == MagmaTrue) && ! (normin == MagmaFalse) ) { *info = -4; } else if ( n < 0 ) { *info = -5; } else if ( lda < max(1,n) ) { *info = -7; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) { return *info; } /* Determine machine dependent parameters to control overflow. */ smlnum = lapackf77_slamch( "Safe minimum" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum /= lapackf77_slamch( "Precision" ); bignum = 1. / smlnum; *scale = 1.; if ( normin == MagmaFalse ) { /* Compute the 1-norm of each column, not including the diagonal. */ if ( upper ) { /* A is upper triangular. */ cnorm[0] = 0.; for( j = 1; j < n; ++j ) { cnorm[j] = magma_cblas_scasum( j, A(0,j), ione ); } } else { /* A is lower triangular. */ for( j = 0; j < n-1; ++j ) { cnorm[j] = magma_cblas_scasum( n-(j+1), A(j+1,j), ione ); } cnorm[n-1] = 0.; } } /* Scale the column norms by TSCAL if the maximum element in CNORM is */ /* greater than BIGNUM/2. */ imax = blasf77_isamax( &n, &cnorm[0], &ione ) - 1; tmax = cnorm[imax]; if ( tmax <= bignum * 0.5 ) { tscal = 1.; } else { tscal = 0.5 / (smlnum * tmax); blasf77_sscal( &n, &tscal, &cnorm[0], &ione ); } /* ================================================================= */ /* Compute a bound on the computed solution vector to see if the */ /* Level 2 BLAS routine CTRSV can be used. */ xmax = 0.; for( j = 0; j < n; ++j ) { xmax = max( xmax, 0.5*MAGMA_C_ABS1( x[j] )); } xbnd = xmax; if ( notran ) { /* ---------------------------------------- */ /* Compute the growth in A * x = b. */ if ( upper ) { jfirst = n-1; jlast = 0; jinc = -1; } else { jfirst = 0; jlast = n; jinc = 1; } if ( tscal != 1. ) { grow = 0.; goto L60; } /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, G(0) = max{x(i), i=1,...,n}. */ grow = 0.5 / max( xbnd, smlnum ); xbnd = grow; for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Exit the loop if the growth factor is too small. */ if ( grow <= smlnum ) { goto L60; } if ( nounit ) { tjjs = *A(j,j) - lambda; } else { tjjs = c_one - lambda; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj >= smlnum ) { /* M(j) = G(j-1) / abs(A(j,j)) */ xbnd = min( xbnd, min(1.,tjj)*grow ); } else { /* M(j) could overflow, set XBND to 0. */ xbnd = 0.; } if ( tjj + cnorm[j] >= smlnum ) { /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */ grow *= (tjj / (tjj + cnorm[j])); } else { /* G(j) could overflow, set GROW to 0. */ grow = 0.; } } grow = xbnd; L60: ; } else { /* ---------------------------------------- */ /* Compute the growth in A**T * x = b or A**H * x = b. */ if ( upper ) { jfirst = 0; jlast = n; jinc = 1; } else { jfirst = n-1; jlast = 0; jinc = -1; } if ( tscal != 1. ) { grow = 0.; goto L90; } /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, M(0) = max{x(i), i=1,...,n}. */ grow = 0.5 / max( xbnd, smlnum ); xbnd = grow; for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Exit the loop if the growth factor is too small. */ if ( grow <= smlnum ) { goto L90; } /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */ xj = 1. + cnorm[j]; grow = min( grow, xbnd / xj ); if ( nounit ) { tjjs = *A(j,j) - lambda; } else { tjjs = c_one - lambda; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj >= smlnum ) { /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */ if ( xj > tjj ) { xbnd *= (tjj / xj); } } else { /* M(j) could overflow, set XBND to 0. */ xbnd = 0.; } } grow = min( grow, xbnd ); L90: ; } /* ================================================================= */ /* Due to modified diagonal, we can't use regular BLAS ctrsv. */ /* Use a Level 1 BLAS solve, scaling intermediate results. */ if ( xmax > bignum * 0.5 ) { /* Scale X so that its components are less than or equal to */ /* BIGNUM in absolute value. */ *scale = (bignum * 0.5) / xmax; blasf77_csscal( &n, scale, &x[0], &ione ); xmax = bignum; } else { xmax *= 2.; } if ( notran ) { /* ---------------------------------------- */ /* Solve A * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */ xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = (*A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L110; } } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale x by 1/b(j). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; xj = MAGMA_C_ABS1( x[j] ); } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */ /* to avoid overflow when dividing by A(j,j). */ rec = (tjj * bignum) / xj; if ( cnorm[j] > 1. ) { /* Scale by 1/CNORM(j) to avoid overflow when */ /* multiplying x(j) times column j. */ rec /= cnorm[j]; } blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; xj = MAGMA_C_ABS1( x[j] ); } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A*x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; xj = 1.; *scale = 0.; xmax = 0.; } L110: /* Scale x if necessary to avoid overflow when adding a */ /* multiple of column j of A. */ if ( xj > 1. ) { rec = 1. / xj; if ( cnorm[j] > (bignum - xmax) * rec ) { /* Scale x by 1/(2*abs(x(j))). */ rec *= 0.5; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; } } else if ( xj * cnorm[j] > bignum - xmax ) { /* Scale x by 1/2. */ blasf77_csscal( &n, &d_half, &x[0], &ione ); *scale *= 0.5; } if ( upper ) { if ( j > 0 ) { /* Compute the update */ /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */ len = j; ztmp = -tscal * x[j]; blasf77_caxpy( &len, &ztmp, A(0,j), &ione, &x[0], &ione ); i = blasf77_icamax( &len, &x[0], &ione ) - 1; xmax = MAGMA_C_ABS1( x[i] ); } } else { if ( j < n-1 ) { /* Compute the update */ /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */ len = n - (j+1); ztmp = -tscal * x[j]; blasf77_caxpy( &len, &ztmp, A(j+1,j), &ione, &x[j + 1], &ione ); i = j + blasf77_icamax( &len, &x[j + 1], &ione ); xmax = MAGMA_C_ABS1( x[i] ); } } } } else if ( trans == MagmaTrans ) { /* ---------------------------------------- */ /* Solve A**T * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = MAGMA_C_ABS1( x[j] ); uscal = MAGMA_C_MAKE( tscal, 0. ); rec = 1. / max( xmax, 1. ); if ( cnorm[j] > (bignum - xj) * rec ) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= 0.5; if ( nounit ) { tjjs = (*A(j,j) - lambda) * tscal; } else { tjjs = (c_one - lambda) * tscal; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > 1. ) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ rec = min( 1., rec * tjj ); uscal = uscal / tjjs; } if ( rec < 1. ) { blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } csumj = c_zero; if ( uscal == c_one ) { /* If the scaling needed for A in the dot product is 1, */ /* call CDOTU to perform the dot product. */ if ( upper ) { csumj = magma_cblas_cdotu( j, A(0,j), ione, &x[0], ione ); } else if ( j < n-1 ) { csumj = magma_cblas_cdotu( n-(j+1), A(j+1,j), ione, &x[j+1], ione ); } } else { /* Otherwise, use in-line code for the dot product. */ if ( upper ) { for( i = 0; i < j; ++i ) { csumj += (*A(i,j) * uscal) * x[i]; } } else if ( j < n-1 ) { for( i = j+1; i < n; ++i ) { csumj += (*A(i,j) * uscal) * x[i]; } } } if ( uscal == MAGMA_C_MAKE( tscal, 0. )) { /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= csumj; xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = (*A(j,j) - lambda) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L160; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale X by 1/abs(x(j)). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = (tjj * bignum) / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0 and compute a solution to A**T *x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; *scale = 0.; xmax = 0.; } L160: ; } else { /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = (x[j] / tjjs) - csumj; } xmax = max( xmax, MAGMA_C_ABS1( x[j] )); } } else { /* ---------------------------------------- */ /* Solve A**H * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = MAGMA_C_ABS1( x[j] ); uscal = MAGMA_C_MAKE( tscal, 0. ); rec = 1. / max(xmax, 1.); if ( cnorm[j] > (bignum - xj) * rec ) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= 0.5; if ( nounit ) { tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > 1. ) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ rec = min( 1., rec * tjj ); uscal = uscal / tjjs; } if ( rec < 1. ) { blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } csumj = c_zero; if ( uscal == c_one ) { /* If the scaling needed for A in the dot product is 1, */ /* call CDOTC to perform the dot product. */ if ( upper ) { csumj = magma_cblas_cdotc( j, A(0,j), ione, &x[0], ione ); } else if ( j < n-1 ) { csumj = magma_cblas_cdotc( n-(j+1), A(j+1,j), ione, &x[j+1], ione ); } } else { /* Otherwise, use in-line code for the dot product. */ if ( upper ) { for( i = 0; i < j; ++i ) { csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i]; } } else if ( j < n-1 ) { for( i = j + 1; i < n; ++i ) { csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i]; } } } if ( uscal == tscal ) { /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= csumj; xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L210; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale X by 1/abs(x(j)). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = (tjj * bignum) / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0 and compute a solution to A**H *x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; *scale = 0.; xmax = 0.; } L210: ; } else { /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = (x[j] / tjjs) - csumj; } xmax = max( xmax, MAGMA_C_ABS1( x[j] )); } } *scale /= tscal; /* Scale the column norms by 1/TSCAL for return. */ if ( tscal != 1. ) { float d = 1. / tscal; blasf77_sscal( &n, &d, &cnorm[0], &ione ); } return *info; } /* end clatrsd */
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaFloatComplex *A, *B; float diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaFloatComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively float x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); float total_diff = 0.; float total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_cmalloc_pinned( &A, size ); assert( A != NULL ); magma_cmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test SCASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scasum( m, A(0,j), incx ); x_c = cblas_scasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scasum", diff, error ); total_diff += diff; total_error += error; } // ----- test SCNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scnrm2( m, A(0,j), incx ); x_c = cblas_scnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test CDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test CDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_cdotc and cblas_cdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_cdotc and blasf77_cdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
/** Purpose ------- CLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, CLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, CLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by CHETRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_aux ********************************************************************/ extern "C" magma_int_t magma_clatrd(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, float *e, magmaFloatComplex *tau, magmaFloatComplex *W, magma_int_t ldw, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dW, magma_int_t lddw) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex value = MAGMA_C_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaFloatComplex alpha; magmaFloatComplex *f; // TODO check arguments magma_int_t info = 0; if (n <= 0) { return info; } magma_queue_t stream; magma_queue_create( &stream ); magma_cmalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); lapackf77_clacgv(&i_n, A(i, i+1), &lda); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, A(i, i+1), &lda); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_clarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_C_REAL( alpha ); *A(i-1,i) = MAGMA_C_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_csetvector( i, A(0, i), 1, dA(0, i), 1 ); magma_chemv(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_cscal(&i, &tau[i - 1], W(0, iw), &ione); value = magma_cblas_cdotc( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_caxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); lapackf77_clacgv(&i, A(i, 0), &lda); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_clarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = MAGMA_C_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); magma_chemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_caxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_cscal(&i_n, &tau[i], W(i+1,i), &ione); value = magma_cblas_cdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_caxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return info; } /* magma_clatrd */