inline static void magma_clarfxsym_v2(magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = c_zero; for (magma_int_t j = 0; j < n; j++) dtmp = dtmp + MAGMA_C_CNJG(work[j]) * V[j]; //cblas_cdotc_sub(n, work, ione, V, ione, &dtmp); #else dtmp = cblas_cdotc(n, work, ione, V, ione); #endif /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
/** Purpose ------- CLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, CLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, CLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by CHETRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_aux ********************************************************************/ extern "C" magma_int_t magma_clatrd2(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, float *e, magmaFloatComplex *tau, magmaFloatComplex *W, magma_int_t ldw, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dW, magma_int_t lddw, magmaFloatComplex *dwork, magma_int_t ldwork) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex value = MAGMA_C_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaFloatComplex alpha; magmaFloatComplex *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_cmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside clatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); lapackf77_clacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_clarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_C_REAL( alpha ); *A(i-1,i) = MAGMA_C_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_csetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_chemv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_chemv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_cscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_cdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_caxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); lapackf77_clacgv(&i, A(i, 0), &lda); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_clarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = MAGMA_C_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_chemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_chemv_work(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_caxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_cscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_cdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_caxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* magma_clatrd */
magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; magma_cmalloc_pinned( &H, (ldh+1)*ldh ); magma_cmalloc_pinned( &y, ldh ); magma_cmalloc_pinned( &HH, ldh*ldh ); magma_cmalloc_pinned( &h1, ldh ); // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; magmaFloatComplex *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_cmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_cmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_ccopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_ccopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r ); // r = A q[k] if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_cdotc(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); } magma_csetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->val, 1); magma_c_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_caxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r); magma_c_vfree(&q); // free GPU streams and events //magma_queue_destroy( stream[0] ); //magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_cgmres */
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaFloatComplex *A, *B; float diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaFloatComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively float x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); float total_diff = 0.; float total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_cmalloc_pinned( &A, size ); assert( A != NULL ); magma_cmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test SCASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scasum( m, A(0,j), incx ); x_c = cblas_scasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scasum", diff, error ); total_diff += diff; total_error += error; } // ----- test SCNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scnrm2( m, A(0,j), incx ); x_c = cblas_scnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test CDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test CDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_cdotc and cblas_cdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_cdotc and blasf77_cdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }