void SVDMatrix_magma(Tensor_core<complex<double>,2>& U, Tensor_core<double,1>& D, Tensor_core<complex<double>,2>& V)
{
    if( U.rank(0)!=U.rank(1) || U.rank(1)!=D.rank(0) || D.rank(0)!=V.rank(0) || V.rank(0)!=V.rank(1) )
    {
        cout<<"Sizes are not consistent in SVDMatrix_magma! Only square matrices are supported."<<endl;
        exit(1);
    }

    magma_int_t m=U.rank(0); magma_int_t n=V.rank(0);
    magma_vec_t jobz(MagmaOverwriteVec);   // left singular vectors overwrite U
    magma_int_t lda=m;
    magmaDoubleComplex* u=nullptr; magma_int_t ldu=1;   // u is not referenced with MagmaOverwriteVec
    magma_int_t ldv=n;
    magmaDoubleComplex work_test[1]; magma_int_t lwork=-1;   // lwork = -1: workspace query
    double* rwork; magma_int_t* iwork;
    magma_dmalloc_cpu( &rwork, 5*m*m+7*m );
    magma_imalloc_cpu( &iwork, 8*m );
    magma_int_t info;

    // workspace query
    magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu,
                 (magmaDoubleComplex *) V.data(), ldv, work_test, lwork, rwork, iwork, &info);

    lwork = lround( MAGMA_Z_REAL(work_test[0]) );
    magmaDoubleComplex* work; magma_zmalloc_cpu(&work, lwork);

    // actual SVD
    magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu,
                 (magmaDoubleComplex *) V.data(), ldv, work, lwork, rwork, iwork, &info);

    magma_free_cpu(work); magma_free_cpu(rwork); magma_free_cpu(iwork);

    if(info!=0)
    {
        cout<<"SVDMatrix_magma was not successful, info= "<<info<<endl;
        exit(1);
    }
}
// ------------------------------------------------------------
// Solve A * X = B, where A and X are stored in CPU host memory.
// Internally, MAGMA transfers data to the GPU device
// and uses a hybrid CPU + GPU algorithm.
void cpu_interface( magma_int_t n, magma_int_t nrhs )
{
    magmaDoubleComplex *A=NULL, *X=NULL;
    magma_int_t *ipiv=NULL;
    magma_int_t lda  = n;
    magma_int_t ldx  = lda;
    magma_int_t info = 0;

    // magma_*malloc_cpu routines for CPU memory are type-safe and align to memory boundaries,
    // but you can use malloc or new if you prefer.
    magma_zmalloc_cpu( &A, lda*n );
    magma_zmalloc_cpu( &X, ldx*nrhs );
    magma_imalloc_cpu( &ipiv, n );
    if ( A == NULL || X == NULL || ipiv == NULL ) {
        fprintf( stderr, "malloc failed\n" );
        goto cleanup;
    }

    // Replace these with your code to initialize A and X
    zfill_matrix( n, n, A, lda );
    zfill_rhs( n, nrhs, X, ldx );

    magma_zgesv( n, nrhs, A, lda, ipiv, X, ldx, &info );
    if ( info != 0 ) {
        fprintf( stderr, "magma_zgesv failed with info=%d\n", (int) info );
    }

    // TODO: use result in X

cleanup:
    magma_free_cpu( A );
    magma_free_cpu( X );
    magma_free_cpu( ipiv );
}
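// ------------------------------------------------------------
// A minimal driver sketch (not part of the original example): initialize
// MAGMA, run the host-memory interface above, and shut down. Compile as its
// own program; the problem sizes here are arbitrary placeholders.
int main( int argc, char** argv )
{
    magma_init();                      // must precede any other MAGMA call
    magma_int_t n = 1000, nrhs = 4;
    cpu_interface( n, nrhs );
    magma_finalize();                  // release MAGMA resources
    return 0;
}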
/**
    Purpose
    -------
    CPOSV computes the solution to a complex system of linear equations
        A * X = B,
    where A is an N-by-N Hermitian positive definite matrix and X and B
    are N-by-NRHS matrices.
    The Cholesky decomposition is used to factor A as
        A = U**H * U,  if UPLO = MagmaUpper, or
        A = L * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is a lower triangular matrix.
    The factored form of A is then used to solve the system of equations
        A * X = B.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper: Upper triangle of A is stored;
      -     = MagmaLower: Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A. N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B. NRHS >= 0.

    @param[in,out]
    dA_array    Array of pointers, dimension (batchCount).
            Each is a COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, each pointer is a Hermitian matrix A.
            If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A
            contains the upper triangular part of the matrix A, and the strictly
            lower triangular part of dA is not referenced.
            If UPLO = MagmaLower, the leading N-by-N lower triangular part of A
            contains the lower triangular part of the matrix A, and the strictly
            upper triangular part of A is not referenced.
    \n
            On exit, if the corresponding entry in dinfo_array = 0, each pointer
            is the factor U or L from the Cholesky factorization
            A = U**H*U or A = L*L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of each array A. LDDA >= max(1,N).

    @param[in,out]
    dB_array    Array of pointers, dimension (batchCount).
            Each is a COMPLEX array on the GPU, dimension (LDDB,NRHS).
            On entry, each pointer is a right hand side matrix B.
            On exit, each pointer is the corresponding solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of each array B. LDDB >= max(1,N).

    @param[out]
    dinfo_array    Array of INTEGERs, dimension (batchCount), for corresponding matrices.
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    @param[in]
    batchCount  INTEGER
            The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.
    @ingroup magma_cposv_driver
********************************************************************/
extern "C" magma_int_t
magma_cposv_batched(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs,
    magmaFloatComplex **dA_array, magma_int_t ldda,
    magmaFloatComplex **dB_array, magma_int_t lddb,
    magma_int_t *dinfo_array,
    magma_int_t batchCount, magma_queue_t queue)
{
    /* Local variables */
    magma_int_t info = 0;

    if ( uplo != MagmaUpper && uplo != MagmaLower )
        info = -1;
    if ( n < 0 )
        info = -2;
    if ( nrhs < 0 )
        info = -3;
    if ( ldda < max(1, n) )
        info = -5;
    if ( lddb < max(1, n) )
        info = -7;
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    /* Quick return if possible */
    if ( (n == 0) || (nrhs == 0) ) {
        return info;
    }

    info = magma_cpotrf_batched( uplo, n, dA_array, ldda, dinfo_array, batchCount, queue);
    if ( info != MAGMA_SUCCESS ) {
        return info;
    }

#ifdef CHECK_INFO
    // check correctness of results through "dinfo_array" and correctness of arguments through "info"
    magma_int_t *cpu_info = NULL;
    magma_imalloc_cpu( &cpu_info, batchCount );
    magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1);
    for (magma_int_t i=0; i < batchCount; i++)
    {
        if (cpu_info[i] != 0 ) {
            printf("magma_cpotrf_batched matrix %lld returned error %lld\n",
                   (long long) i, (long long) cpu_info[i] );
            info = cpu_info[i];
            magma_free_cpu (cpu_info);
            return info;
        }
    }
    magma_free_cpu (cpu_info);
#endif

    info = magma_cpotrs_batched( uplo, n, nrhs, dA_array, ldda, dB_array, lddb, batchCount, queue );
    return info;
}
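// ------------------------------------------------------------
// A hedged usage sketch (not from the MAGMA sources) for the batched driver
// above: solve batchCount independent Hermitian positive definite systems.
// dA/dB hold the matrices back-to-back on the device; hA_array/hB_array are
// illustrative host-side pointer tables copied into dA_array/dB_array.
void example_cposv_batched( magma_int_t n, magma_int_t nrhs, magma_int_t batchCount )
{
    magma_int_t ldda = n, lddb = n;
    magma_queue_t queue;
    magma_queue_create( 0, &queue );

    magmaFloatComplex *dA, *dB, **dA_array, **dB_array;
    magma_int_t *dinfo;
    magma_cmalloc( &dA, ldda*n*batchCount );
    magma_cmalloc( &dB, lddb*nrhs*batchCount );
    magma_malloc( (void**)&dA_array, batchCount*sizeof(magmaFloatComplex*) );
    magma_malloc( (void**)&dB_array, batchCount*sizeof(magmaFloatComplex*) );
    magma_imalloc( &dinfo, batchCount );

    // build the per-matrix pointer tables on the host, then copy to the device
    magmaFloatComplex **hA_array, **hB_array;
    magma_malloc_cpu( (void**)&hA_array, batchCount*sizeof(magmaFloatComplex*) );
    magma_malloc_cpu( (void**)&hB_array, batchCount*sizeof(magmaFloatComplex*) );
    for( magma_int_t s = 0; s < batchCount; ++s ) {
        hA_array[s] = dA + s*ldda*n;
        hB_array[s] = dB + s*lddb*nrhs;
    }
    magma_setvector( batchCount, sizeof(magmaFloatComplex*), hA_array, 1, dA_array, 1 );
    magma_setvector( batchCount, sizeof(magmaFloatComplex*), hB_array, 1, dB_array, 1 );

    // ... fill dA with HPD matrices and dB with right-hand sides here ...

    magma_cposv_batched( MagmaLower, n, nrhs, dA_array, ldda, dB_array, lddb,
                         dinfo, batchCount, queue );

    magma_queue_destroy( queue );
    magma_free( dA );        magma_free( dB );
    magma_free( dA_array );  magma_free( dB_array );  magma_free( dinfo );
    magma_free_cpu( hA_array );  magma_free_cpu( hB_array );
}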
/***************************************************************************//**
    Purpose
    -------
    DGESV solves a system of linear equations
        A * X = B,
    where A is a general N-by-N matrix and X and B are N-by-NRHS matrices.
    The LU decomposition with partial pivoting and row interchanges is
    used to factor A as
        A = P * L * U,
    where P is a permutation matrix, L is unit lower triangular, and U is
    upper triangular. The factored form of A is then used to solve the
    system of equations A * X = B.

    This is a batched version that solves batchCount N-by-N matrices in parallel.
    dA, dB, ipiv, and info become arrays with one entry per matrix.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A. N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B. NRHS >= 0.

    @param[in,out]
    dA_array    Array of pointers, dimension (batchCount).
            Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N).
            On entry, each pointer is an N-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of each array A. LDDA >= max(1,N).

    @param[out]
    dipiv_array    Array of pointers, dimension (batchCount), for corresponding matrices.
            Each is an INTEGER array, dimension (N).
            The pivot indices; for 1 <= i <= N, row i of the
            matrix was interchanged with row IPIV(i).

    @param[in,out]
    dB_array    Array of pointers, dimension (batchCount).
            Each is a DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS).
            On entry, each pointer is a right hand side matrix B.
            On exit, each pointer is the solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of each array B. LDDB >= max(1,N).

    @param[out]
    dinfo_array    Array of INTEGERs, dimension (batchCount), for corresponding matrices.
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value,
                 or another error occurred, such as a failed memory allocation.
      -     > 0: if INFO = i, U(i,i) is exactly zero. The factorization
                 has been completed, but the factor U is exactly
                 singular, and division by zero will occur if it is used
                 to solve a system of equations.

    @param[in]
    batchCount  INTEGER
            The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.
    @ingroup magma_gesv_batched
*******************************************************************************/
extern "C" magma_int_t
magma_dgesv_batched(
    magma_int_t n, magma_int_t nrhs,
    double **dA_array, magma_int_t ldda,
    magma_int_t **dipiv_array,
    double **dB_array, magma_int_t lddb,
    magma_int_t *dinfo_array,
    magma_int_t batchCount, magma_queue_t queue)
{
    /* Local variables */
    magma_int_t info;
    info = 0;
    if (n < 0) {
        info = -1;
    } else if (nrhs < 0) {
        info = -2;
    } else if (ldda < max(1,n)) {
        info = -4;
    } else if (lddb < max(1,n)) {
        info = -6;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    /* Quick return if possible */
    if (n == 0 || nrhs == 0) {
        return info;
    }

    info = magma_dgetrf_batched( n, n, dA_array, ldda, dipiv_array, dinfo_array, batchCount, queue);
    if ( info != MAGMA_SUCCESS ) {
        return info;
    }

#ifdef CHECK_INFO
    // check correctness of results through "dinfo_array" and correctness of arguments through "info"
    magma_int_t *cpu_info = NULL;
    magma_imalloc_cpu( &cpu_info, batchCount );
    magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1);
    for (magma_int_t i=0; i < batchCount; i++)
    {
        if (cpu_info[i] != 0 ) {
            printf("magma_dgetrf_batched matrix %lld returned error %lld\n",
                   (long long) i, (long long) cpu_info[i] );
            info = cpu_info[i];
            magma_free_cpu (cpu_info);
            return info;
        }
    }
    magma_free_cpu (cpu_info);
#endif

    info = magma_dgetrs_batched( MagmaNoTrans, n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, batchCount, queue );
    return info;
}
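// ------------------------------------------------------------
// Sketch of the extra setup magma_dgesv_batched needs beyond the Cholesky
// driver sketch above: a device array of per-matrix pivot pointers. Names
// (dipiv, hipiv_array) are illustrative; dA_array, dB_array, dinfo, and queue
// are assumed laid out as in the previous sketch, in double precision.
magma_int_t *dipiv, **dipiv_array, **hipiv_array;
magma_imalloc( &dipiv, n*batchCount );   // n pivot indices per matrix
magma_malloc( (void**)&dipiv_array, batchCount*sizeof(magma_int_t*) );
magma_malloc_cpu( (void**)&hipiv_array, batchCount*sizeof(magma_int_t*) );
for( magma_int_t s = 0; s < batchCount; ++s )
    hipiv_array[s] = dipiv + s*n;
magma_setvector( batchCount, sizeof(magma_int_t*), hipiv_array, 1, dipiv_array, 1 );

magma_dgesv_batched( n, nrhs, dA_array, ldda, dipiv_array,
                     dB_array, lddb, dinfo, batchCount, queue );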
//////////////////////////////////////////////////////////////
//              CSTEDX Divide and Conquer for tridiag
//////////////////////////////////////////////////////////////
extern "C" void magma_cstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float * E, magmaFloatComplex *Z, magma_int_t LDZ)
{
    float *RWORK;
    float *dwork;
    magma_int_t *IWORK;
    magma_int_t LIWORK, LRWORK;
    magma_int_t INFO;

    //LWORK = N;
    LRWORK = 2*N*N + 4*N + 1 + 256*N;
    LIWORK = 256*N;

    magma_smalloc_cpu( &RWORK, LRWORK );
    magma_imalloc_cpu( &IWORK, LIWORK );

    if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) {
        printf("=================================================\n");
        printf("CSTEDX ERROR OCCURRED IN CUDAMALLOC\n");
        printf("=================================================\n");
        return;
    }
    printf("using magma_cstedx\n");

    magma_timer_t time=0;
    timer_start( time );

    magma_range_t job = MagmaRangeI;
    if (NE == N)
        job = MagmaRangeAll;

    magma_cstedx(job, N, 0., 0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO);

    if (INFO != 0) {
        printf("=================================================\n");
        printf("CSTEDX ERROR OCCURRED. HERE IS INFO %d \n ", (int) INFO);
        printf("=================================================\n");
        //assert(INFO == 0);
    }

    timer_stop( time );
    timer_printf( "time cstedx = %6.2f\n", time );

    magma_free( dwork );
    magma_free_cpu( IWORK );
    magma_free_cpu( RWORK );
}
//////////////////////////////////////////////////////////////
//              SSTEDX Divide and Conquer for tridiag
//////////////////////////////////////////////////////////////
extern "C" void magma_sstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float * E, float *Z, magma_int_t LDZ)
{
    float *WORK;
    float *dwork;
    magma_int_t *IWORK;
    magma_int_t LWORK, LIWORK;
    magma_int_t INFO;

    LWORK  = N*N+4*N+1;
    LIWORK = 3 + 5*N;

    magma_smalloc_cpu( &WORK, LWORK );
    magma_imalloc_cpu( &IWORK, LIWORK );

    if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) {
        printf("=================================================\n");
        printf("SSTEDX ERROR OCCURRED IN CUDAMALLOC\n");
        printf("=================================================\n");
        return;
    }
    printf("using magma_sstedx\n");

    magma_timer_t time=0;
    timer_start( time );

    //magma_range_t job = MagmaRangeI;
    //if (NE == N)
    //    job = MagmaRangeAll;

    magma_sstedx(MagmaRangeI, N, 0., 0., 1, NE, D, E, Z, LDZ, WORK, LWORK, IWORK, LIWORK, dwork, &INFO);

    if (INFO != 0) {
        printf("=================================================\n");
        printf("SSTEDX ERROR OCCURRED. HERE IS INFO %d \n ", (int) INFO);
        printf("=================================================\n");
        //assert(INFO == 0);
    }

    timer_stop( time );
    timer_printf( "time sstedx = %6.2f\n", time );

    magma_free( dwork );
    magma_free_cpu( IWORK );
    magma_free_cpu( WORK );
}
//////////////////////////////////////////////////////////////
//              CSTEDC Divide and Conquer for tridiag
//////////////////////////////////////////////////////////////
extern "C" void magma_cstedc_withZ(magma_vec_t JOBZ, magma_int_t N, float *D, float * E, magmaFloatComplex *Z, magma_int_t LDZ)
{
    magmaFloatComplex *WORK;
    float *RWORK;
    magma_int_t *IWORK;
    magma_int_t LWORK, LIWORK, LRWORK;
    magma_int_t INFO;

    // use log() as log2() is not defined everywhere (e.g., Windows)
    const float log_2 = 0.6931471805599453;
    if (JOBZ == MagmaVec) {
        LWORK  = N*N;
        LRWORK = 1 + 3*N + 3*N*((magma_int_t)(log( (float)N )/log_2) + 1) + 4*N*N + 256*N;
        LIWORK = 6 + 6*N + 6*N*((magma_int_t)(log( (float)N )/log_2) + 1) + 256*N;
    } else if (JOBZ == MagmaIVec) {
        LWORK  = N;
        LRWORK = 2*N*N + 4*N + 1 + 256*N;
        LIWORK = 256*N;
    } else if (JOBZ == MagmaNoVec) {
        LWORK  = N;
        LRWORK = 256*N + 1;
        LIWORK = 256*N;
    } else {
        printf("ERROR: unsupported JOBZ = %d\n", (int) JOBZ);
        exit(-1);
    }

    magma_smalloc_cpu( &RWORK, LRWORK );
    magma_cmalloc_cpu( &WORK, LWORK );
    magma_imalloc_cpu( &IWORK, LIWORK );

    lapackf77_cstedc( lapack_vec_const(JOBZ), &N, D, E, Z, &LDZ, WORK, &LWORK, RWORK, &LRWORK, IWORK, &LIWORK, &INFO);

    if (INFO != 0) {
        printf("=================================================\n");
        printf("CSTEDC ERROR OCCURRED. HERE IS INFO %d \n ", (int) INFO);
        printf("=================================================\n");
        //assert(INFO == 0);
    }

    magma_free_cpu( IWORK );
    magma_free_cpu( WORK );
    magma_free_cpu( RWORK );
}
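// ------------------------------------------------------------
// A hedged usage sketch (not from the original source) for the wrapper above:
// eigenvalues and eigenvectors of a small Hermitian tridiagonal matrix given
// by its diagonal D and off-diagonal E. The 4x4 test values are arbitrary.
// JOBZ = MagmaIVec asks CSTEDC to compute the eigenvectors of the tridiagonal
// matrix itself, so Z needs no initialization.
void example_cstedc_withZ()
{
    magma_int_t N = 4, LDZ = 4;
    float D[4] = {  2.f,  2.f,  2.f, 2.f };
    float E[3] = { -1.f, -1.f, -1.f };
    magmaFloatComplex *Z;
    magma_cmalloc_cpu( &Z, LDZ*N );

    magma_cstedc_withZ( MagmaIVec, N, D, E, Z, LDZ );
    // on return, D holds the eigenvalues in ascending order and Z the eigenvectors

    magma_free_cpu( Z );
}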
void eigen_magma(Tensor_core<double,2>& A, Tensor_core<double,1>& W, char JOBZ, char UPLO)
{
    if( A.rank(0) != A.rank(1) ) { cout<<"Input matrix for eigen_magma is not square!"<<endl; exit(1); }
    if( A.rank(0) != W.rank(0) ) { cout<<"Size of W is not consistent with A in eigen_magma!"<<endl; exit(1); }

    magma_vec_t jobz  = magma_vec_const(JOBZ);
    magma_uplo_t uplo = magma_uplo_const(UPLO);
    magma_int_t N = A.rank(0);
    magma_int_t info;

    // workspace query (lwork = liwork = -1)
    double work_test[1];
    magma_int_t iwork_test[1];
    magma_int_t lwork  = -1;
    magma_int_t liwork = -1;
    magma_dsyevd( jobz, uplo, N, A.data(), N, W.data(), work_test, lwork, iwork_test, liwork, &info );

    lwork  = lround(work_test[0]);
    liwork = iwork_test[0];

    double* work;
    magma_int_t* iwork;
    magma_dmalloc_cpu(&work, lwork);
    magma_imalloc_cpu(&iwork, liwork);

    magma_dsyevd( jobz, uplo, N, A.data(), N, W.data(), work, lwork, iwork, liwork, &info );

    magma_free_cpu(work);
    magma_free_cpu(iwork);

    if(info != 0) { cout<<"magma_dsyevd failed: info= "<<info<<endl; exit(1); }
}
// ------------------------------------------------------------
// Solve dA * dX = dB, where dA and dX are stored in GPU device memory.
// Internally, MAGMA uses a hybrid CPU + GPU algorithm.
void gpu_interface( magma_int_t n, magma_int_t nrhs )
{
    magmaDoubleComplex *dA=NULL, *dX=NULL;
    magma_int_t *ipiv=NULL;
    magma_int_t ldda = magma_roundup( n, 32 );  // round up to multiple of 32 for best GPU performance
    magma_int_t lddx = ldda;
    magma_int_t info = 0;
    magma_int_t dev  = 0;
    magma_queue_t queue=NULL;

    // magma_*malloc routines for GPU memory are type-safe,
    // but you can use cudaMalloc if you prefer.
    magma_zmalloc( &dA, ldda*n );
    magma_zmalloc( &dX, lddx*nrhs );
    magma_imalloc_cpu( &ipiv, n );  // ipiv always on CPU
    if ( dA == NULL || dX == NULL || ipiv == NULL ) {
        fprintf( stderr, "malloc failed\n" );
        goto cleanup;
    }

    magma_queue_create( dev, &queue );

    // Replace these with your code to initialize A and X
    zfill_matrix_gpu( n, n, dA, ldda, queue );
    zfill_rhs_gpu( n, nrhs, dX, lddx, queue );

    magma_zgesv_gpu( n, nrhs, dA, ldda, ipiv, dX, lddx, &info );
    if ( info != 0 ) {
        fprintf( stderr, "magma_zgesv_gpu failed with info=%d\n", (int) info );
    }

    // TODO: use result in dX

cleanup:
    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free( dX );
    magma_free_cpu( ipiv );
}
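// ------------------------------------------------------------
// One way to realize the "use result in dX" TODO above (a hedged sketch):
// copy the solution back into host memory with magma_zgetmatrix. X is a
// host buffer introduced here for illustration.
magmaDoubleComplex *X;
magma_zmalloc_cpu( &X, n*nrhs );
magma_zgetmatrix( n, nrhs, dX, lddx, X, n, queue );   // device -> host copy
// ... consume X ...
magma_free_cpu( X );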
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. 
lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? 
for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
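// ------------------------------------------------------------
// Hedged calling sketch for magma_dsgesv_gpu: the documentation above fixes
// the workspace sizes, dworkd of length N*NRHS (double) and dworks of length
// N*(N+NRHS) (single). dA, dB, dX, ipiv, and dipiv are assumed allocated as
// documented; this 1.4-era interface takes `trans` as a plain char.
double *dworkd;  float *dworks;
magma_int_t iter, info;
magma_dmalloc( &dworkd, n*nrhs );
magma_smalloc( &dworks, n*(n+nrhs) );

magma_dsgesv_gpu( 'N', n, nrhs, dA, ldda, ipiv, dipiv,
                  dB, lddb, dX, lddx, dworkd, dworks, &iter, &info );
// iter >= 0: mixed precision succeeded in `iter` refinement steps;
// iter <  0: the solver fell back to a full double precision factorization.

magma_free( dworkd );
magma_free( dworks );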
/**
    Purpose
    -------
    ZCGESV computes the solution to a complex system of linear equations
        A * X = B,  A**T * X = B,  or  A**H * X = B,
    where A is an N-by-N matrix and X and B are N-by-NRHS matrices.

    ZCGESV first attempts to factorize the matrix in complex SINGLE PRECISION
    and use this factorization within an iterative refinement procedure to
    produce a solution with complex DOUBLE PRECISION norm-wise backward error
    quality (see below). If the approach fails the method switches to a
    complex DOUBLE PRECISION factorization and solve.

    The iterative refinement is not going to be a winning strategy if
    the ratio of complex SINGLE PRECISION performance over complex DOUBLE
    PRECISION performance is too small. A reasonable strategy should take the
    number of right-hand sides and the size of the matrix into account.
    This might be done with a call to ILAENV in the future. Up to now, we
    always try iterative refinement.

    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
    where
        o ITER is the number of the current iteration in the iterative
          refinement process
        o RNRM is the infinity-norm of the residual
        o XNRM is the infinity-norm of the solution
        o ANRM is the infinity-operator-norm of the matrix A
        o EPS is the machine epsilon returned by DLAMCH('Epsilon')
    The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.

    Arguments
    ---------
    @param[in]
    trans   magma_trans_t
            Specifies the form of the system of equations:
      -     = MagmaNoTrans:    A    * X = B  (No transpose)
      -     = MagmaTrans:      A**T * X = B  (Transpose)
      -     = MagmaConjTrans:  A**H * X = B  (Conjugate transpose)

    @param[in]
    n       INTEGER
            The number of linear equations, i.e., the order of the
            matrix A. N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B. NRHS >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (ldda,N)
            On entry, the N-by-N coefficient matrix A.
            On exit, if iterative refinement has been successfully used
            (info.EQ.0 and ITER.GE.0, see description below), A is unchanged.
            If double precision factorization has been used
            (info.EQ.0 and ITER.LT.0, see description below), then the
            array dA contains the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. ldda >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            The pivot indices that define the permutation matrix P;
            row i of the matrix was interchanged with row IPIV(i).
            Corresponds either to the single precision factorization
            (if info.EQ.0 and ITER.GE.0) or the double precision
            factorization (if info.EQ.0 and ITER.LT.0).

    @param[out]
    dipiv   INTEGER array on the GPU, dimension (N)
            The pivot indices; for 1 <= i <= N, after permuting, row i of the
            matrix was moved to row dIPIV(i).
            Note this is different than IPIV, where interchanges
            are applied one-after-another.

    @param[in]
    dB      COMPLEX_16 array on the GPU, dimension (lddb,NRHS)
            The N-by-NRHS right hand side matrix B.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. lddb >= max(1,N).

    @param[out]
    dX      COMPLEX_16 array on the GPU, dimension (lddx,NRHS)
            If info = 0, the N-by-NRHS solution matrix X.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. lddx >= max(1,N).

    @param
    dworkd  (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS)
            This array is used to hold the residual vectors.

    @param
    dworks  (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS))
            This array is used to store the complex single precision matrix
            and the right-hand sides or solutions in single precision.

    @param[out]
    iter    INTEGER
      -     < 0: iterative refinement has failed, double precision
                 factorization has been performed
        +        -1 : the routine fell back to full precision for
                      implementation- or machine-specific reasons
        +        -2 : narrowing the precision induced an overflow,
                      the routine fell back to full precision
        +        -3 : failure of CGETRF
        +        -31: stop the iterative refinement after the 30th iteration
      -     > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if info = -i, the i-th argument had an illegal value
      -     > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is
                 exactly zero. The factorization has been completed, but
                 the factor U is exactly singular, so the solution could not
                 be computed.

    @ingroup magma_zgesv_driver
********************************************************************/
extern "C" magma_int_t
magma_zcgesv_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
                 magmaDoubleComplex *dA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *dipiv,
                 magmaDoubleComplex *dB, magma_int_t lddb,
                 magmaDoubleComplex *dX, magma_int_t lddx,
                 magmaDoubleComplex *dworkd, magmaFloatComplex *dworks,
                 magma_int_t *iter, magma_int_t *info)
{
#define dB(i,j)  (dB + (i) + (j)*lddb)
#define dX(i,j)  (dX + (i) + (j)*lddx)
#define dR(i,j)  (dR + (i) + (j)*lddr)

    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magma_int_t ione = 1;
    magmaDoubleComplex *dR;
    magmaFloatComplex  *dSA, *dSX;
    magmaDoubleComplex Xnrmv, Rnrmv;
    double Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t i, j, iiter, lddsa, lddr;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( n < 0 )
        *info = -1;
    else if ( nrhs < 0 )
        *info = -2;
    else if ( ldda < max(1,n))
        *info = -4;
    else if ( lddb < max(1,n))
        *info = -8;
    else if ( lddx < max(1,n))
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( n == 0 || nrhs == 0 )
        return *info;

    lddsa = n;
    lddr  = n;

    dSA = dworks;
    dSX = dSA + lddsa*n;
    dR  = dworkd;

    eps  = lapackf77_dlamch("Epsilon");
    Anrm = magmablas_zlange(MagmaInfNorm, n, n, dA, ldda, (double*)dworkd );
    cte  = Anrm * eps * pow((double)n, 0.5) * BWDMAX;

    /*
     * Convert to single precision
     */
    //magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info );  // done inside zcgetrs with pivots
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    magmablas_zlag2c( n, n, dA, ldda, dSA, lddsa, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    // factor dSA in single precision
    magma_cgetrf_gpu( n, n, dSA, lddsa, ipiv, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // Generate parallel pivots
    {
        magma_int_t *newipiv;
        magma_imalloc_cpu( &newipiv, n );
        if ( newipiv == NULL ) {
            *iter = -3;
            goto FALLBACK;
        }
        swp2pswp( trans, n, ipiv, newipiv );
        magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 );
        magma_free_cpu( newipiv );
    }

    // solve dSA*dSX = dB in single precision
    // converts dB to dSX and applies pivots, solves, then converts result back to dX
    magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info );

    // residual dR = dB - dA*dX in double precision
    magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr );
    if ( nrhs == 1 ) {
        magma_zgemv( trans, n, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1 );
    }
    else {
        magma_zgemm( trans, MagmaNoTrans, n, nrhs, n,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr );
    }

    // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_izamax( n, dX(0,j), 1) - 1;
        magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
        Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_izamax ( n, dR(0,j), 1 ) - 1;
        magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
        Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        if ( Rnrm > Xnrm*cte ) {
            goto REFINEMENT;
        }
    }

    *iter = 0;
    return *info;

REFINEMENT:
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // convert residual dR to single precision dSX
        // solve dSA*dSX = R in single precision
        // convert result back to double precision dR
        // it's okay that dR is used for both dB input and dX output.
        magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info );
        if (*info != 0) {
            *iter = -3;
            goto FALLBACK;
        }

        // Add correction and setup residual
        // dX += dR  --and--
        // dR = dB
        // This saves going through dR a second time (if done with one more kernel).
        // -- not really: first time is read, second time is write.
        for( j=0; j < nrhs; j++ ) {
            magmablas_zaxpycp( n, dR(0,j), dX(0,j), dB(0,j) );
        }

        // residual dR = dB - dA*dX in double precision
        if ( nrhs == 1 ) {
            magma_zgemv( trans, n, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1 );
        }
        else {
            magma_zgemm( trans, MagmaNoTrans, n, nrhs, n,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr );
        }

        /* Check whether the nrhs normwise backward errors satisfy the
         * stopping criterion. If yes, set ITER=IITER > 0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_izamax( n, dX(0,j), 1) - 1;
            magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
            Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_izamax ( n, dR(0,j), 1 ) - 1;
            magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
            Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm > Xnrm*cte ) {
                goto L20;
            }
        }

        /* If we are here, the nrhs normwise backward errors satisfy
         * the stopping criterion, we are good to exit. */
        *iter = iiter;
        return *info;

      L20:
        iiter++;
    }

    /* If we are at this place of the code, this is because we have performed
     * ITER=ITERMAX iterations and never satisfied the stopping criterion.
     * Set up the ITER flag accordingly and follow up with the double
     * precision routine. */
    *iter = -ITERMAX - 1;

FALLBACK:
    /* Single-precision iterative refinement failed to converge to a
     * satisfactory solution, so we resort to double precision. */
    magma_zgetrf_gpu( n, n, dA, ldda, ipiv, info );
    if (*info == 0) {
        magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx );
        magma_zgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info );
    }

    return *info;
}
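// ------------------------------------------------------------
// The complex solver is called the same way (a hedged sketch); after the
// call, `iter` distinguishes a successful mixed-precision solve from the
// double precision fallback documented above.
magmaDoubleComplex *dworkd;  magmaFloatComplex *dworks;
magma_int_t iter, info;
magma_zmalloc( &dworkd, n*nrhs );         // residual storage
magma_cmalloc( &dworks, n*(n+nrhs) );     // single precision A + RHS/solution

magma_zcgesv_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dipiv,
                  dB, lddb, dX, lddx, dworkd, dworks, &iter, &info );
if ( info == 0 && iter > 0 )
    printf( "converged after %lld refinement iterations\n", (long long) iter );

magma_free( dworkd );
magma_free( dworks );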
/* ////////////////////////////////////////////////////////////////////////////
   -- testing sparse matrix vector product
*/
int main( int argc, char** argv )
{
    magma_int_t info = 0;
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    magma_queue_t queue=NULL;
    magma_queue_create( 0, &queue );

    magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR},
                   dA={Magma_CSR}, dA_SELLP={Magma_CSR};
    magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR},
                   dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR};

    hA_SELLP.blocksize = 8;
    hA_SELLP.alignment = 8;
    real_Double_t start, end, res;
#ifdef MAGMA_WITH_MKL
    magma_int_t *pntre=NULL;
#endif
    cusparseHandle_t cusparseHandle = NULL;
    cusparseMatDescr_t descr = NULL;

    float c_one  = MAGMA_S_MAKE(1.0, 0.0);
    float c_zero = MAGMA_S_MAKE(0.0, 0.0);

    float accuracy = 1e-10;

#define PRECISION_s
#if defined(PRECISION_c)
    accuracy = 1e-4;
#endif
#if defined(PRECISION_s)
    accuracy = 1e-4;
#endif

    magma_int_t i, j;
    for( i = 1; i < argc; ++i ) {
        if ( strcmp("--blocksize", argv[i]) == 0 ) {
            hA_SELLP.blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 ) {
            hA_SELLP.alignment = atoi( argv[++i] );
        } else
            break;
    }
    printf("\n# usage: ./run_sspmm"
           " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n",
           (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment );

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue ));
        } else {                                                    // file-matrix test
            TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue ));
        }

        printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n",
               (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz );

        real_Double_t FLOPS = 2.0*hA.nnz/1e9;

        // m - number of rows for the sparse matrix
        // n - number of vectors to be multiplied in the SpMM product
        magma_int_t m, n;
        m = hA.num_rows;
        n = 48;

        // init CPU vectors
        TESTING_CHECK( magma_svinit( &hx, Magma_CPU, m, n, c_one, queue ));
        TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue ));

        // init DEV vectors
        TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue ));
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue ));

        // calling MKL with CSR
#ifdef MAGMA_WITH_MKL
        TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) );
        pntre[0] = 0;
        for (j=0; j < m; j++ ) {
            pntre[j] = hA.row[j+1];
        }

        MKL_INT num_rows = hA.num_rows;
        MKL_INT num_cols = hA.num_cols;
        MKL_INT nnz = hA.nnz;
        MKL_INT num_vecs = n;

        MKL_INT *col;
        TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) ));
        for( magma_int_t t=0; t < hA.nnz; ++t ) {
            col[ t ] = hA.col[ t ];
        }
        MKL_INT *row;
        TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) ));
        for( magma_int_t t=0; t < hA.num_rows; ++t ) {
            row[ t ] = hA.row[ t ];
        }

        // === Call MKL with consecutive SpMVs, using mkl_scsrmv ===
        // warm up
        mkl_scsrmv( "N", &num_rows, &num_cols,
                    MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre,
                    MKL_ADDR(hx.val),
                    MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );

        start = magma_wtime();
        for (j=0; j < 10; j++ ) {
            mkl_scsrmv( "N", &num_rows, &num_cols,
                        MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre,
                        MKL_ADDR(hx.val),
                        MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );
        }
        end = magma_wtime();
        printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s    (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        // === Call MKL with blocked SpMVs, using mkl_scsrmm ===
        char transa = 'n';
        MKL_INT ldb = n, ldc = n;
        char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'};

        // warm up
        mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra,
                    MKL_ADDR(hA.val), col, row, pntre,
                    MKL_ADDR(hx.val), &ldb,
                    MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc );

        start = magma_wtime();
        for (j=0; j < 10; j++ ) {
            mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra,
                        MKL_ADDR(hA.val), col, row, pntre,
                        MKL_ADDR(hx.val), &ldb,
                        MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc );
        }
        end = magma_wtime();
        printf( "\n > MKL SpMM  : %.2e seconds %.2e GFLOP/s    (CSR).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );

        magma_free_cpu( row );
        magma_free_cpu( col );
        row = NULL;
        col = NULL;
#endif // MAGMA_WITH_MKL

        // copy matrix to GPU
        TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ));
        // SpMV on GPU (CSR)
        start = magma_sync_wtime( queue );
        for (j=0; j < 10; j++) {
            TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue ));
        }
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (standard CSR).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );
        TESTING_CHECK( magma_smtransfer( dy, &hrefvec, Magma_DEV, Magma_CPU, queue ));
        magma_smfree(&dA, queue );

        // convert to SELLP and copy to GPU
        TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ));
        TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ));
        magma_smfree(&hA_SELLP, queue );
        magma_smfree( &dy, queue );
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue ));
        // SpMV on GPU (SELLP)
        start = magma_sync_wtime( queue );
        for (j=0; j < 10; j++) {
            TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ));
        }
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (SELLP).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );

        TESTING_CHECK( magma_smtransfer( dy, &hcheck, Magma_DEV, Magma_CPU, queue ));
        res = 0.0;
        for(magma_int_t k=0; k < hA.num_rows; k++ ) {
            res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]);
        }
        printf("%% |x-y|_F = %8.2e\n", res);
        if ( res < accuracy )
            printf("%% tester spmm SELL-P:  ok\n");
        else
            printf("%% tester spmm SELL-P:  failed\n");
        magma_smfree( &hcheck, queue );
        magma_smfree(&dA_SELLP, queue );

        // SpMV on GPU (CUSPARSE - CSR)
        // CUSPARSE context //
        magma_smfree( &dy, queue );
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue ));
        //#ifdef PRECISION_d
        start = magma_sync_wtime( queue );
        TESTING_CHECK( cusparseCreate( &cusparseHandle ));
        TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) ));
        TESTING_CHECK( cusparseCreateMatDescr( &descr ));
        TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL ));
        TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO ));
        float alpha = c_one;
        float beta  = c_zero;

        // copy matrix to GPU
        TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) );

        for (j=0; j < 10; j++) {
            cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                           dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr,
                           dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols);
        }
        end = magma_sync_wtime( queue );
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s    (CSR).\n",
                (end-start)/10, FLOPS*10*n/(end-start) );

        TESTING_CHECK( magma_smtransfer( dy, &hcheck, Magma_DEV, Magma_CPU, queue ));
        res = 0.0;
        for(magma_int_t k=0; k < hA.num_rows; k++ ) {
            res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]);
        }
        printf("%% |x-y|_F = %8.2e\n", res);
        if ( res < accuracy )
            printf("%% tester spmm cuSPARSE:  ok\n");
        else
            printf("%% tester spmm cuSPARSE:  failed\n");
        magma_smfree( &hcheck, queue );

        cusparseDestroyMatDescr( descr );
        cusparseDestroy( cusparseHandle );
        descr = NULL;
        cusparseHandle = NULL;
        //#endif

        printf("\n\n");

        // free CPU memory
        magma_smfree( &hA, queue );
        magma_smfree( &hx, queue );
        magma_smfree( &hy, queue );
        magma_smfree( &hrefvec, queue );
        // free GPU memory
        magma_smfree( &dx, queue );
        magma_smfree( &dy, queue );
        magma_smfree( &dA, queue);

#ifdef MAGMA_WITH_MKL
        magma_free_cpu( pntre );
#endif

        i++;
    }

    magma_queue_destroy( queue );
    TESTING_CHECK( magma_finalize() );
    return info;
}
extern "C" magma_int_t magma_dgesv_rbt_batched( magma_int_t n, magma_int_t nrhs, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, magma_int_t *dinfo_array, magma_int_t batchCount, magma_queue_t queue) { /* Local variables */ magma_int_t i, info; info = 0; if (n < 0) { info = -1; } else if (nrhs < 0) { info = -2; } else if (ldda < max(1,n)) { info = -4; } else if (lddb < max(1,n)) { info = -6; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return info; } double *hu, *hv; if (MAGMA_SUCCESS != magma_dmalloc_cpu( &hu, 2*n )) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (MAGMA_SUCCESS != magma_dmalloc_cpu( &hv, 2*n )) { info = MAGMA_ERR_HOST_ALLOC; return info; } info = magma_dgerbt_batched(MagmaTrue, n, nrhs, dA_array, n, dB_array, n, hu, hv, &info, batchCount, queue); if (info != MAGMA_SUCCESS) { return info; } info = magma_dgetrf_nopiv_batched( n, n, dA_array, ldda, dinfo_array, batchCount, queue); if ( info != MAGMA_SUCCESS ) { return info; } #ifdef CHECK_INFO // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = NULL; magma_imalloc_cpu( &cpu_info, batchCount ); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for (i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_dgetrf_batched matrix %d returned error %d\n",i, (int)cpu_info[i] ); info = cpu_info[i]; magma_free_cpu (cpu_info); return info; } } magma_free_cpu (cpu_info); #endif info = magma_dgetrs_nopiv_batched( MagmaNoTrans, n, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue ); /* The solution of A.x = b is Vy computed on the GPU */ double *dv; if (MAGMA_SUCCESS != magma_dmalloc( &dv, 2*n )) { info = MAGMA_ERR_DEVICE_ALLOC; return info; } magma_dsetvector( 2*n, hv, 1, dv, 1, queue ); for (i = 0; i < nrhs; i++) magmablas_dprbt_mv_batched(n, dv, dB_array+(i), batchCount, queue); // magma_dgetmatrix( n, nrhs, db, nn, B, ldb, queue ); return info; }
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                              PyGpuArrayObject **S,
                              PyGpuArrayObject **U,   // may be NULL
                              PyGpuArrayObject **VT,  // may be NULL
                              PARAMS_TYPE* params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    M_U  = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;

    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);

  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER,
                         params->context) != 0) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (compute_uv) {
    u_dims[0] = N;
    u_dims[1] = N_VT;
    if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER,
                           params->context) != 0) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and
    // vt_data -> U to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);

    vt_dims[0] = M_U;
    vt_dims[1] = M;
    if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER,
                           params->context) != 0) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and
    // vt_data -> U to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;

fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(params->context->ctx);
  return res;
}
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V,  // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1], v_dims[2];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }

  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    return -1;
  }

  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);

  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  // query for workspace size
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N,
                   &lwork, -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (magma_int_t)lwork, iwork_data,
                   liwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an intermediate "
        "tridiagonal form did not converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER,
                         params->context) != 0) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (params->compute_v) {
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;

fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
extern "C" int calc_bounding_box(magmaFloatComplex *M, magma_int_t M_lead_dim, float *wReEig, float *wImEig) { magma_int_t rslt = 0; //magmaFloatComplex *AT = nullptr; magmaFloatComplex *dA = nullptr, *dAT = nullptr, *dreA = nullptr, *dimA = nullptr; float *dreEig = nullptr; float *dimEig = nullptr; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * (M_lead_dim + M_lead_dim*nb); // MagmaNoVec //lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2*M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = M_lead_dim; // MagmaNoVec //lrwork = 1 + 5 * M_lead_dim + 2*M_lead_dim*M_lead_dim; // MagmaVec liwork = 1; // MagmaNoVec //liwork = 3 + 5*M_lead_dim; // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); //magma_cmalloc_cpu(&A, lda*M_lead_dim); //magma_cmalloc_cpu(&AT, lda*M_lead_dim); //magma_smalloc_cpu(&reEig, M_lead_dim); //magma_smalloc_cpu(&imEig, M_lead_dim); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAT, lda*M_lead_dim); magma_cmalloc_pinned(&dreA, lda*M_lead_dim); magma_cmalloc_pinned(&dimA, lda*M_lead_dim); //magma_cmalloc_pinned(&VL, lda*M_lead_dim); //magma_cmalloc_pinned(&VR, lda*M_lead_dim); magma_cmalloc_pinned(&work, lwork); magma_smalloc_pinned(&dreEig, M_lead_dim); magma_smalloc_pinned(&dimEig, M_lead_dim); //matrix_fillzero(AT, M_lead_dim); //vector_fillzero(reEig, M_lead_dim); //vector_fillzero(imEig, M_lead_dim); //prepare_matrix_2(M); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); //magma_csetmatrix(M_lead_dim, M_lead_dim, AT, lda, dAT, M_lead_dim, queue); //magma_ssetvector(M_lead_dim, wReEig, 1, dreEig, 1, queue); //magma_ssetvector(M_lead_dim, wImEig, 1, dimEig, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dA, lda); // reA = ( (A + A')/2.0 ) // A' magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT = A + A' magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT=AT*0.5 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAT, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // reA = AT magma_ccopy(lda*M_lead_dim, dAT, 1, dreA, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda); magma_sync_wtime(queue); // imA = ( -1im*(A - A')/2.0 ) // A' magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT = A + A' magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(-1.0f, 0.0f), dAT, M_lead_dim, dA, M_lead_dim, queue); // A=A*-1j*0.5 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.0f, -0.5f), dA, 1, queue); // imA = A magma_ccopy(lda*M_lead_dim, dA, 1, dimA, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dimA, lda); // reEig::Vector=eigvals(reA) rslt = magma_cheevd(MagmaNoVec, MagmaLower, M_lead_dim, dreA, lda, dreEig, work, lwork, rwork, lrwork, iwork, liwork, &info); // imEig::Vector=eigvals(imA) rslt = magma_cheevd(MagmaNoVec, MagmaLower, M_lead_dim, dimA, lda, dimEig, 
work, lwork, rwork, lrwork, iwork, liwork, &info); //magma_sprint_gpu(M_lead_dim, 1, dreEig, M_lead_dim); //magma_sprint_gpu(M_lead_dim, 1, dimEig, M_lead_dim); magma_sgetvector(M_lead_dim, dreEig, 1, wReEig, 1, queue); //magma_sync_wtime(queue); magma_sgetvector(M_lead_dim, dimEig, 1, wImEig, 1, queue); //magma_sync_wtime(queue); /* maxReIdx = magma_isamax(M_lead_dim, dreEig, 1, queue) - 1; minReIdx = magma_isamin(M_lead_dim, dreEig, 1, queue) - 1; maxImIdx = magma_isamax(M_lead_dim, dimEig, 1, queue) - 1; minImIdx = magma_isamin(M_lead_dim, dimEig, 1, queue) - 1; printf("max re idx = %d\nmin re idx = %d\n", maxReIdx, minReIdx); printf("%f %f\n", wReEig[maxReIdx], wReEig[minReIdx]); printf("max im idx = %d\nmin im idx = %d\n", maxImIdx, minImIdx); printf("%f %f\n", wImEig[maxImIdx], wImEig[minImIdx]); */ //printf("test wReEig: %f %f\n", wReEig[0], wReEig[1]); //printf("test wImEig: %f %f\n", wImEig[0], wImEig[1]); magma_free_cpu(iwork); magma_free_cpu(rwork); //magma_free_cpu(AT); magma_free_pinned(dA); magma_free_pinned(dAT); magma_free_pinned(dreA); magma_free_pinned(dimA); magma_free_pinned(work); magma_free_pinned(dreEig); magma_free_pinned(dimEig); return rslt; }
extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts) { magma_int_t idx = 0, rslt = 0; magmaFloatComplex p, scalar; std::complex<float> vtmp; float j; magmaFloatComplex *dA = nullptr; magmaFloatComplex *dAth = NULL, *dAthT = NULL, *dX = NULL, *dY = NULL; float *dE = NULL; //float *hE = NULL; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec liwork = (3 + 5 * M_lead_dim); // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); magma_cmalloc_pinned(&work, lwork); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAth, lda*M_lead_dim); magma_cmalloc_pinned(&dAthT, lda*M_lead_dim); magma_smalloc_pinned(&dE, M_lead_dim); //magma_smalloc_cpu(&hE, M_lead_dim); magma_cmalloc_pinned(&dX, M_lead_dim); magma_cmalloc_pinned(&dY, M_lead_dim); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); // th=[0:resolution:2*pi] j = _from; for (idx = 0; idx < _steps; idx++) { //scalar = exp( 1im * -j); vtmp.real( 0.0f ); vtmp.imag( -j ); //vtmp = _FCbuild(0.0f, -j); //printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]); vtmp = exp(vtmp); scalar.x = vtmp.real(); scalar.y = vtmp.imag(); //printf("scalar = %f + i%f\n", scalar.x, scalar.y); magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue); // Ath = exp(1im * -j) * As magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue); //magma_cprint_gpu(N, N, dA, lda); //magma_cprint_gpu(N, N, dAth, lda); // AthT = (Ath + Ath') magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue); magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue); // AthT = AthT / 2 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); // e, r = eig(AthT) rslt = magma_cheevd(MagmaVec, MagmaLower, M_lead_dim, dAthT, lda, dE, work, lwork, rwork, lrwork, iwork, liwork, &info); magma_sync_wtime(queue); //printf("magma_cheevd info=%d\n", info); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); //magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim); //magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue); //printf("%f %f\n", hE[0], hE[1]); // p = r[:,s]' * A * r[:,s] // r = r[:,s] magma_ccopy( M_lead_dim, dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset dX, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim); // pp = A * r[:,s] magma_cgemv(MagmaNoTrans, M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, lda, dX, 1, MAGMA_C_MAKE(0.0f, 0.0f), dY, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim); // p = r' * pp p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue); magma_sync_wtime(queue); pts[idx] = p; //printf("p = %f %fi\n", p.x, p.y); j += _step; } // end of for (idx = 0; idx < _steps; idx++) magma_free_pinned(dY); magma_free_pinned(dX); //magma_free_cpu(hE); magma_free_pinned(dE); 
magma_free_pinned(dAthT); magma_free_pinned(dAth); magma_free_pinned(dA); magma_free_pinned(work); magma_free_cpu(rwork); magma_free_cpu(iwork); //magma_free_cpu(w); //magma_free_cpu(A); return rslt; }