/***************************************************************************//** Purpose ------- CGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w COMPLEX array, dimension (N) W contains the computed eigenvalues. @param[out] VL COMPLEX array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR COMPLEX array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. 
@param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (1 + nb + nb*ngpu)*N. For optimal performance, LWORK >= (1 + 2*nb + nb*ngpu)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. @ingroup magma_geev *******************************************************************************/ extern "C" magma_int_t magma_cgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, #ifdef COMPLEX magmaFloatComplex *w, #else float *wr, float *wi, #endif magmaFloatComplex *VL, magma_int_t ldvl, magmaFloatComplex *VR, magma_int_t ldvr, magmaFloatComplex *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; float d__1, d__2; magmaFloatComplex tmp; float scl; float dum[1], eps; float anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, optwrk, irwork, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; magma_int_t ngpu = magma_num_gpus(); irwork = 0; *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! 
wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ nb = magma_get_cgehrd_nb( n ); if (*info == 0) { minwrk = (1 + nb + nb*ngpu)*n; optwrk = (1 + 2*nb + nb*ngpu)*n; work[0] = magma_cmake_lwork( optwrk ); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) magmaFloatComplex *dT; if (MAGMA_SUCCESS != magma_cmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version5) magmaFloatComplex *T; if (MAGMA_SUCCESS != magma_cmalloc_cpu( &T, nb*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_slamch( "P" ); smlnum = lapackf77_slamch( "S" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum = magma_ssqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_clange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. 
&& anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_clascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (CWorkspace: none) * (RWorkspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_cgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr ); /* Reduce to upper Hessenberg form * (CWorkspace: need 2*N, prefer N + N*NB + NB*NGPU) * (RWorkspace: N) * - added NB*NGPU needed for multi-GPU magma_cgehrd_m * - including N reserved for gebal/gebak, unused by cgehrd */ itau = 0; iwrk = itau + n; liwrk = lwork - iwrk; #if defined(Version1) // Version 1 - LAPACK lapackf77_cgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_cgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_cgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version5) // Version 4 - Multi-GPU, T on host magma_cgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); #endif if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_clacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate unitary matrix in VL * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VL, ldvl, &work[itau], 
T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VL * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w, VL, &ldvl, &work[iwrk], &liwrk, info ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_clacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_clacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate unitary matrix in VR * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VR * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } else { /* Compute eigenvalues only * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "E", "N", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } /* If INFO > 0 from CHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } if (wantvl || wantvr) { /* 
Compute left and/or right eigenvectors * (CWorkspace: need 2*N) * (RWorkspace: need 2*N) * - including N reserved for gebal/gebak, unused by ctrevc */ irwork = ibal + n; #if TREVC_VERSION == 1 lapackf77_ctrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr ); #elif TREVC_VERSION == 2 liwrk = lwork - iwrk; lapackf77_ctrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 3 magma_ctrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 4 magma_ctrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 5 magma_ctrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #else #error Unknown TREVC_VERSION #endif } if (wantvl) { /* Undo balancing of left eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. 
/ magma_cblas_scnrm2( n, VL(0,i), 1 ); blasf77_csscal( &n, &scl, VL(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VL(k,i) ); d__2 = MAGMA_C_IMAG( *VL(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CONJ( *VL(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VL(0,i), &ione ); *VL(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VL(k,i) ), 0 ); } } if (wantvr) { /* Undo balancing of right eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / magma_cblas_scnrm2( n, VR(0,i), 1 ); blasf77_csscal( &n, &scl, VR(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VR(k,i) ); d__2 = MAGMA_C_IMAG( *VR(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CONJ( *VR(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VR(0,i), &ione ); *VR(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VR(k,i) ), 0 ); } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in WR[i+1:n] and WI[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. nval = ilo - 1; lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w, &n, &ierr ); } } #if defined(Version3) magma_free( dT ); #endif #if defined(Version5) magma_free_cpu( T ); #endif work[0] = magma_cmake_lwork( minwrk ); // TODO use optwrk as in dgeev return *info; } /* magma_cgeev */
/***************************************************************************//**
    Purpose
    -------
    CLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    dA      COMPLEX array, dimension (LDA,N)
            Copy of A on the GPU.
            Portions of A are updated on the CPU;
            portions of dA are updated on the GPU. See code for details.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. LDDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    COMPLEX array, dimension (NB)
            Auxiliary vector.

    @param[in,out]
    F       COMPLEX array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @param[in,out]
    dF      COMPLEX array, dimension (LDDF,NB)
            Copy of F on the GPU. See code for details.

    @param[in]
    lddf    INTEGER
            The leading dimension of the array dF. LDDF >= max(1,N).

    @return MAGMA_SUCCESS (the routine has no other return path).

    @ingroup magma_laqps
*******************************************************************************/
extern "C" magma_int_t
magma_claqps(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaFloatComplex *A,  magma_int_t lda,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2,
    magmaFloatComplex *auxv,
    magmaFloatComplex *F,  magma_int_t ldf,
    magmaFloatComplex_ptr dF, magma_int_t lddf)
{
/* Pointer to element (i,j), 0-based, of each column-major array. */
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    magmaFloatComplex c_zero    = MAGMA_C_MAKE( 0.,0.);
    magmaFloatComplex c_one     = MAGMA_C_MAKE( 1.,0.);
    magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    float d__1;
    magmaFloatComplex z__1;

    magma_int_t j, k, rk;
    magmaFloatComplex Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    /* lsticc: head of a linked list (threaded through vn2) of columns whose
     * norm must be recomputed; 0 means the list is empty. */
    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    /* Private queue on the current device; destroyed before returning. */
    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    lsticc = 0;
    k = 0;
    /* Factor one column per iteration; stop early as soon as a column is
     * flagged for norm recomputation (lsticc != 0). */
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran isamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1;

        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU */
                magma_cgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, queue );
            }

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU */
                magma_queue_sync( queue );

                /* 2. Swap as usual on CPU */
                blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU */
                magma_csetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, queue );
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /* Row k of F is conjugated in place for the gemv and restored
             * (conjugated again) right after it. */
            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_C_CONJ( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_C_CONJ( *F(k,j) );
            }
            #endif
        }

        /* Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_clarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_clarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        /* Save the diagonal entry and temporarily set it to one; it is
         * restored at the end of this iteration. */
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue );

            /* Multiply on GPU */
            // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            /* The product is split three ways:
             *   i__3 = nb-k-1 : columns k+1 .. nb-1, done by the first CPU gemv below;
             *   i__4 = n-nb   : columns nb .. n-1;
             *   i__5 = nb-k   : rows rk .. offset+nb-1, done on the CPU for
             *                   columns nb.. by the second CPU gemv;
             * the GPU gemv covers rows offset+nb.. of columns nb... */
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione, queue );

            /* Start bringing the GPU part of F(:,k) back to the host */
            magma_cgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, queue );

            /* CPU part for columns k+1 .. nb-1, issued before the sync below */
            blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );

            /* Wait for the GPU result, then add the CPU contribution of the
             * top i__5 rows to it (beta = 1). */
            magma_queue_sync( queue );
            blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k  ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_C_NEGATE( tau[k] );
            blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_cgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        /* Push column j onto the recompute list: vn2[j]
                         * stores the previous head, j becomes the new head. */
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }

        /* Restore the diagonal entry saved before the reflector was applied */
        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) -
                                  A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        magma_csetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2, queue );

        magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda, queue );
    }

    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        /* Next list entry: vn2[lsticc] rounded to the nearest integer */
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        /* NOTE(review): lsticc is a 0-based column index (it is set from j in
         * the norm update above), while the panel tests earlier in this
         * routine use pvt < nb; confirm whether `<=` is intended here. */
        if (lsticc <= nb) {
            vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione );
        }
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;

            /* NOTE(review): r2 reads m-offset-nb entries starting at row
             * offset+nb+1, i.e. through row index m, whereas the transfers in
             * the pivoting step start at row offset+nb; confirm the intended
             * split between the CPU part (r1) and the GPU part (r2). */
            r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_scnrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue );

            //vn1[lsticc] = magma_scnrm2( i__1, dA(rk + 1, lsticc), ione, queue );
            /* Combine the two partial 2-norms */
            vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( queue );

    return MAGMA_SUCCESS;
} /* magma_claqps */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; float *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; float *w1i, *w2i; magmaFloatComplex *w1copy, *w2copy; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_slamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_sgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by sget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( w1i, float, N ); TESTING_MALLOC_CPU( w2i, float, N ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( VL, float, n2 ); 
TESTING_MALLOC_PIN( VR, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. 
* * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ float vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_sget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_snrm2( N, &VR[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_slapy2( magma_cblas_snrm2( N, &VR[j*lda], ione ), magma_cblas_snrm2( N, &VR[(j+1)*lda], ione )); result[1] = max( result[1], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_S_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) 
result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_sget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_snrm2( N, &VL[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_slapy2( magma_cblas_snrm2( N, &VL[j*lda], ione ), magma_cblas_snrm2( N, &VL[(j+1)*lda], ione )); result[3] = max( result[3], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_S_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times float *LRE, DUM; TESTING_MALLOC_PIN( LRE, float, n2 ); lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_sgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. 
//lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_sgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, &info ); //if (info != 0) // printf("magma_sgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! 
MAGMA_S_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeev( lapack_vec_const(opts.jobvl), lapack_vec_const(opts.jobvr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_C_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_C_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], lessthan ); std::sort( w2copy, &w2copy[N], lessthan ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaFloatComplex diff = MAGMA_C_SUB( w1copy[j], w2copy[j2] ); float diff2 = magma_szlapy2( diff ) / max( magma_szlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_caxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = magma_cblas_scnrm2( N, w1copy, 1 ); error /= magma_cblas_scnrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %8.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? "ok" : "failed")); status += ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? 
"ok" : "failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? "ok" : "failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? "ok" : "failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? "ok" : "failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? "ok" : "failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? "ok" : "failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? "ok" : "failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? "ok" : "failed")); } int newline = 0; if ( result[0] != -1 ) { status += ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status += ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status += ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status += ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status += ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status += ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status += ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status += ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status += ! 
(result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaFloatComplex *A, *B; float diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaFloatComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively float x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); float total_diff = 0.; float total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_cmalloc_pinned( &A, size ); assert( A != NULL ); magma_cmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices 
lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test SCASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scasum( m, A(0,j), incx ); x_c = cblas_scasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scasum", diff, error ); total_diff += diff; total_error += error; } // ----- test SCNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scnrm2( m, A(0,j), incx ); x_c = cblas_scnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test CDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test CDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! 
defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_cdotc and cblas_cdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_cdotc and blasf77_cdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }