/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyr2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_C, *h_Ccublas; magmaDouble_ptr d_A, d_B, d_C; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); #ifdef COMPLEX if (opts.transA == MagmaTrans) { opts.transA = MagmaConjTrans; printf("%% WARNING: transA = MagmaTrans changed to MagmaConjTrans\n"); } #endif printf("%% If running lapack (option --lapack), CUBLAS error is computed\n" "%% relative to CPU BLAS result.\n\n"); printf("%% uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf("%% N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("%%=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_DSYR2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*Bk ); TESTING_MALLOC_CPU( h_C, double, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, double, ldc*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*Bk ); TESTING_MALLOC_DEV( d_C, double, lddc*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_dlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_dsetmatrix( N, N, h_C, ldc, d_C, lddc ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasDsyr2k( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); #else magma_dsyr2k( opts.uplo, opts.transA, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dsyr2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &N, &N, h_C, &ldc, work ); blasf77_daxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_dlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; double *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgeqrf_nb( M ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, h_A, &M, tau, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lhwork ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_DEVALLOC( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &M, tau, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_DEVFREE( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zheevx_gpu(char jobz, char range, char uplo, magma_int_t n, magmaDoubleComplex *da, magma_int_t ldda, double vl, double vu, magma_int_t il, magma_int_t iu, double abstol, magma_int_t *m, double *w, magmaDoubleComplex *dz, magma_int_t lddz, magmaDoubleComplex *wa, magma_int_t ldwa, magmaDoubleComplex *wz, magma_int_t ldwz, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHEEVX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. DA (device input/output) COMPLEX_16 array, dimension (LDDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDDA (input) INTEGER The leading dimension of the array DA. LDDA >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. ABSTOL (input) DOUBLE PRECISION The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ) , where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*DLAMCH('S'), not zero. If this routine returns with INFO>0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*DLAMCH('S'). See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) On normal exit, the first M elements contain the selected eigenvalues in ascending order. DZ (device output) COMPLEX_16 array, dimension (LDDZ, max(1,M)) If JOBZ = 'V', then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. If JOBZ = 'N', then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = 'V', the exact value of M is not known in advance and an upper bound must be used. ********* (workspace) If FAST_HEMV is defined DZ should be (LDDZ, max(1,N)) in both cases. LDDZ (input) INTEGER The leading dimension of the array DZ. LDDZ >= 1, and if JOBZ = 'V', LDDZ >= max(1,N). WA (workspace) COMPLEX_16 array, dimension (LDWA, N) LDWA (input) INTEGER The leading dimension of the array WA. LDWA >= max(1,N). WZ (workspace) COMPLEX_16 array, dimension (LDWZ, max(1,M)) LDWZ (input) INTEGER The leading dimension of the array DZ. LDWZ >= 1, and if JOBZ = 'V', LDWZ >= max(1,N). WORK (workspace/output) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= (NB+1)*N, where NB is the max of the blocksize for ZHETRD. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (7*N) IWORK (workspace) INTEGER array, dimension (5*N) IFAIL (output) INTEGER array, dimension (N) If JOBZ = 'V', then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = 'N', then IFAIL is not referenced. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, then i eigenvectors failed to converge. Their indices are stored in array IFAIL. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magma_int_t ione = 1; char order[1]; magma_int_t indd, inde; magma_int_t imax; magma_int_t lopt, itmp1, indee; magma_int_t lower, wantz; magma_int_t i, j, jj, i__1; magma_int_t alleig, valeig, indeig; magma_int_t iscale, indibl; magma_int_t indiwk, indisp, indtau; magma_int_t indrwk, indwrk; magma_int_t llwork, nsplit; magma_int_t lquery; magma_int_t iinfo; double safmin; double bignum; double smlnum; double eps, tmp1; double anrm; double sigma, d__1; double rmin, rmax; double *dwork; /* Function Body */ lower = lapackf77_lsame(uplo_, MagmaLowerStr); wantz = lapackf77_lsame(jobz_, MagmaVecStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (lddz < 1 || (wantz && lddz < n)) { *info = -15; } else if (ldwa < max(1,n)) { *info = -17; } else if (ldwz < 1 || (wantz && ldwz < n)) { *info = -19; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_zhetrd_nb(n); lopt = n * (nb + 1); work[0] = MAGMA_Z_MAKE( lopt, 0 ); if (lwork < lopt && ! lquery) { *info = -21; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } *m = 0; /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif magmaDoubleComplex *a = (magmaDoubleComplex *) malloc( n * n * sizeof(magmaDoubleComplex) ); magma_zgetmatrix(n, n, da, ldda, a, n); lapackf77_zheevx(jobz_, range_, uplo_, &n, a, &n, &vl, &vu, &il, &iu, &abstol, m, w, wz, &ldwz, work, &lwork, rwork, iwork, ifail, info); magma_zsetmatrix( n, n, a, n, da, ldda); magma_zsetmatrix( n, *m, wz, ldwz, dz, lddz); free(a); return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n )) { fprintf (stderr, "!!!! device memory allocation error (magma_zheevx_gpu)\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } --w; --work; --rwork; --iwork; --ifail; /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_zlanhe('M', uplo, n, da, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { d__1 = 1.; magmablas_zlascl(uplo, 0, 0, 1., sigma, n, n, da, ldda, info); if (abstol > 0.) { abstol *= sigma; } if (valeig) { vl *= sigma; vu *= sigma; } } /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */ indd = 1; inde = indd + n; indrwk = inde + n; indtau = 1; indwrk = indtau + n; llwork = lwork - indwrk + 1; #ifdef FAST_HEMV magma_zhetrd2_gpu(uplo, n, da, ldda, &rwork[indd], &rwork[inde], &work[indtau], wa, ldwa, &work[indwrk], llwork, dz, lddz*n, &iinfo); #else magma_zhetrd_gpu (uplo, n, da, ldda, &rwork[indd], &rwork[inde], &work[indtau], wa, ldwa, &work[indwrk], llwork, &iinfo); #endif lopt = n + (magma_int_t)MAGMA_Z_REAL(work[indwrk]); /* If all eigenvalues are desired and ABSTOL is less than or equal to zero, then call DSTERF or ZUNGTR and ZSTEQR. If this fails for some eigenvalue, then try DSTEBZ. */ if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) { blasf77_dcopy(&n, &rwork[indd], &ione, &w[1], &ione); indee = indrwk + 2*n; if (! wantz) { i__1 = n - 1; blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_dsterf(&n, &w[1], &rwork[indee], info); } else { lapackf77_zlacpy("A", &n, &n, wa, &ldwa, wz, &ldwz); lapackf77_zungtr(uplo_, &n, wz, &ldwz, &work[indtau], &work[indwrk], &llwork, &iinfo); i__1 = n - 1; blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_zsteqr(jobz_, &n, &w[1], &rwork[indee], wz, &ldwz, &rwork[indrwk], info); if (*info == 0) { for (i = 1; i <= n; ++i) { ifail[i] = 0; } magma_zsetmatrix( n, n, wz, ldwz, dz, lddz ); } } if (*info == 0) { *m = n; } } /* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. */ if (*m == 0) { *info = 0; if (wantz) { *(unsigned char *)order = 'B'; } else { *(unsigned char *)order = 'E'; } indibl = 1; indisp = indibl + n; indiwk = indisp + n; lapackf77_dstebz(range_, order, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m, &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info); if (wantz) { lapackf77_zstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp], wz, &ldwz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info); magma_zsetmatrix( n, *m, wz, ldwz, dz, lddz ); /* Apply unitary matrix used in reduction to tridiagonal form to eigenvectors returned by ZSTEIN. */ magma_zunmtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, da, ldda, &work[indtau], dz, lddz, wa, ldwa, &iinfo); } } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = *m; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal(&imax, &d__1, &w[1], &ione); } /* If eigenvalues are not in order, then sort them, along with eigenvectors. */ if (wantz) { for (j = 1; j <= *m-1; ++j) { i = 0; tmp1 = w[j]; for (jj = j + 1; jj <= *m; ++jj) { if (w[jj] < tmp1) { i = jj; tmp1 = w[jj]; } } if (i != 0) { itmp1 = iwork[indibl + i - 1]; w[i] = w[j]; iwork[indibl + i - 1] = iwork[indibl + j - 1]; w[j] = tmp1; iwork[indibl + j - 1] = itmp1; magma_zswap(n, dz + (i-1)*lddz, ione, dz + (j-1)*lddz, ione); if (*info != 0) { itmp1 = ifail[i]; ifail[i] = ifail[j]; ifail[j] = itmp1; } } } } /* Set WORK(1) to optimal complex workspace size. */ work[1] = MAGMA_Z_MAKE( lopt, 0 ); return *info; } /* magma_zheevx_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, dwork[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; double *C, *R, *A, *work, *tau, *tauq, *taup; double *d, *e; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K vect side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_dgebrd_nb( m, n ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_DORMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*nn ); TESTING_MALLOC_CPU( work, double, lwork_max ); TESTING_MALLOC_CPU( d, double, min(mm,nn) ); TESTING_MALLOC_CPU( e, double, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, double, min(mm,nn) ); TESTING_MALLOC_CPU( taup, double, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); lapackf77_dlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_dgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_dgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) { printf("magma_dgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) { printf("magma_dormbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, dwork ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, dwork ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrs */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, matnorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *d_B; magma_int_t M, N, n2, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===============================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; if ( M < N ) { printf( "skipping M=%d, N=%d because M < N is not yet supported.\n", (int) M, (int) N ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; n2 = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; // query for workspace size lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; lhwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = -1; lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_work, magmaDoubleComplex, lhwork ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS n2 = M*nrhs; lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //n2 = N*nrhs; //lapackf77_zlarnv( &ione, ISEED, &n2, h_X ); //blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*matnorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*matnorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error ); printf("%s\n", (gpu_error < tol ? "" : " failed")); status |= ! (gpu_error < tol); TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaDoubleComplex *A, *B; double diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaDoubleComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively double x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); double total_diff = 0.; double total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_zmalloc_pinned( &A, size ); assert( A != NULL ); magma_zmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test DZASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dzasum( m, A(0,j), incx ); x_c = cblas_dzasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dzasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dzasum", diff, error ); total_diff += diff; total_error += error; } // ----- test DZNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dznrm2( m, A(0,j), incx ); x_c = cblas_dznrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dznrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dznrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test ZDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "zdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test ZDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "zdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_zdotc and cblas_zdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_zdotc and blasf77_zdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgels */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, error, Anorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf("%% M N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) CPU GPU \n"); printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default lddb = magma_roundup( max_mn, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_A2, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ size = lda*N; lapackf77_dlarnv( &ione, ISEED, &size, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &size, h_B ); lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_dlarnv( &ione, ISEED, &size, h_X ); //blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgels_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // compute the residual magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb ); Anorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgels returned error %d: %s.\n", (int) info, magma_strerror( info )); } blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_dlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_daxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); bool okay; if ( M == N ) { okay = (gpu_error < tol && error < tol); } else { okay = (error < tol); } status += ! okay; printf( " %s\n", (okay ? "ok" : "failed")); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; magmaDouble_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dgetrf_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dsyevd( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double *a, magma_int_t lda, double *w, double *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_queue_t queue, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DSYEVD computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= 2*N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; magma_int_t inde; double anrm; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; double smlnum; magma_int_t lquery; magmaDouble_ptr dwork; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. double one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((liwork < liwmin) && ! lquery) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = a[0]; if (wantz) { a[0] = 1.; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_dsyevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_dlansy("M", uplo_, &n, a, &lda, work ); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_dlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } /* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */ // dsytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // dstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time; timer_start( time ); magma_dsytrd(uplo, n, a, lda, w, &work[inde], &work[indtau], &work[indwrk], llwork, queue, &iinfo); timer_stop( time ); timer_printf( "time dsytrd = %6.2f\n", time ); /* For eigenvalues only, call DSTERF. For eigenvectors, first call DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call DORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf(&n, w, &work[inde], info); } else { timer_start( time ); if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // TTT Possible bug for n < 128 magma_dstedx(MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, queue, info); magma_free( dwork ); timer_stop( time ); timer_printf( "time dstedx = %6.2f\n", time ); timer_start( time ); magma_dormtr(MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, queue, &iinfo); lapackf77_dlacpy("A", &n, &n, &work[indwrk], &n, a, &lda); timer_stop( time ); timer_printf( "time dormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_dscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; return *info; } /* magma_dsyevd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaDoubleComplex *d_A, *d_B, *d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_ZHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_zsetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasZher2k( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *geev_w_array, magmaDoubleComplex *vl, magma_int_t ldvl, magmaDoubleComplex *vr, magma_int_t ldvr, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) COMPLEX*16 array, dimension (N) W contains the computed eigenvalues. VL (output) COMPLEX*16 array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) COMPLEX*16 array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ magma_int_t c__1 = 1; magma_int_t c__0 = 0; magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; double d__1, d__2; magmaDoubleComplex z__1, z__2; magma_int_t i__, k, ihi; double scl; magma_int_t ilo; double dum[1], eps; magmaDoubleComplex tmp; magma_int_t ibal; double anrm; magma_int_t ierr, itau, iwrk, nout; magma_int_t scalea; double cscale; magma_int_t select[1]; double bignum; magma_int_t minwrk; magma_int_t wantvl; double smlnum; magma_int_t irwork; magma_int_t lquery, wantvr; magma_int_t nb = 0; magmaDoubleComplex_ptr dT; //magma_timestr_t start, end; char side[2] = {0, 0}; magma_vec_t jobvl_ = jobvl; magma_vec_t jobvr_ = jobvr; *info = 0; lquery = lwork == -1; wantvl = lapackf77_lsame(lapack_const(jobvl_), "V"); wantvr = lapackf77_lsame(lapack_const(jobvr_), "V"); if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) { *info = -1; } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ if (*info == 0) { nb = magma_get_zgehrd_nb(n); minwrk = (1+nb)*n; work[0] = MAGMA_Z_MAKE((double) minwrk, 0.); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } // if eigenvectors are needed #if defined(VERSION3) if (MAGMA_SUCCESS != magma_malloc(&dT, nb*n*sizeof(magmaDoubleComplex) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; vl_dim1 = ldvl; vl_offset = 1 + vl_dim1; vl -= vl_offset; vr_dim1 = ldvr; vr_offset = 1 + vr_dim1; vr -= vr_offset; --work; --rwork; /* Get machine constants */ eps = lapackf77_dlamch("P"); smlnum = lapackf77_dlamch("S"); bignum = 1. / smlnum; lapackf77_dlabad(&smlnum, &bignum); smlnum = magma_dsqrt(smlnum) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_zlange("M", &n, &n, &a[a_offset], &lda, dum); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_zlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, & ierr); } /* Balance the matrix (CWorkspace: none) (RWorkspace: need N) */ ibal = 1; lapackf77_zgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &rwork[ibal], &ierr); /* Reduce to upper Hessenberg form (CWorkspace: need 2*N, prefer N+N*NB) (RWorkspace: none) */ itau = 1; iwrk = itau + n; i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) /* * Version 1 - LAPACK */ lapackf77_zgehrd(&n, &ilo, &ihi, &a[a_offset], &lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION2) /* * Version 2 - LAPACK consistent HRD */ magma_zgehrd2(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, */ magma_zgehrd(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.); if (wantvl) { /* Want left eigenvectors Copy Householder vectors to VL */ side[0] = 'L'; lapackf77_zlacpy(MagmaLowerStr, &n, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl); /* Generate unitary matrix in VL (CWorkspace: need 2*N-1, prefer N+(N-1)*NB) (RWorkspace: none) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_zunghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_zunghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* Perform QR iteration, accumulating Schur vectors in VL (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info); if (wantvr) { /* Want left and right eigenvectors Copy Schur vectors to VR */ side[0] = 'B'; lapackf77_zlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr); } } else if (wantvr) { /* Want right eigenvectors Copy Householder vectors to VR */ side[0] = 'R'; lapackf77_zlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr); /* Generate unitary matrix in VR (CWorkspace: need 2*N-1, prefer N+(N-1)*NB) (RWorkspace: none) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_zunghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_zunghr(n, ilo, ihi, &vr[vr_offset], ldvr, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* Perform QR iteration, accumulating Schur vectors in VR (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } else { /* Compute eigenvalues only (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } /* If INFO > 0 from ZHSEQR, then quit */ if (*info > 0) { goto L50; } if (wantvl || wantvr) { /* Compute left and/or right eigenvectors (CWorkspace: need 2*N) (RWorkspace: need 2*N) */ irwork = ibal + n; lapackf77_ztrevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr); } if (wantvl) { /* Undo balancing of left eigenvectors (CWorkspace: none) (RWorkspace: need N) */ lapackf77_zgebak("B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, &vl[vl_offset], &ldvl, &ierr); /* Normalize left eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { scl = 1. / cblas_dznrm2(n, &vl[i__ * vl_dim1 + 1], 1); cblas_zdscal(n, scl, &vl[i__ * vl_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { i__3 = k + i__ * vl_dim1; /* Computing 2nd power */ d__1 = MAGMA_Z_REAL(vl[i__3]); /* Computing 2nd power */ d__2 = MAGMA_Z_IMAG(vl[k + i__ * vl_dim1]); rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = cblas_idamax(n, &rwork[irwork], 1)+1; z__2 = MAGMA_Z_CNJG(vl[k + i__ * vl_dim1]); d__1 = magma_dsqrt(rwork[irwork + k - 1]); MAGMA_Z_DSCALE(z__1, z__2, d__1); tmp = z__1; cblas_zscal(n, CBLAS_SADDR(tmp), &vl[i__ * vl_dim1 + 1], 1); i__2 = k + i__ * vl_dim1; i__3 = k + i__ * vl_dim1; d__1 = MAGMA_Z_REAL(vl[i__3]); MAGMA_Z_SET2REAL(z__1, d__1); vl[i__2] = z__1; } } if (wantvr) { /* Undo balancing of right eigenvectors (CWorkspace: none) (RWorkspace: need N) */ lapackf77_zgebak("B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, &vr[vr_offset], &ldvr, &ierr); /* Normalize right eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { scl = 1. / cblas_dznrm2(n, &vr[i__ * vr_dim1 + 1], 1); cblas_zdscal(n, scl, &vr[i__ * vr_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { i__3 = k + i__ * vr_dim1; /* Computing 2nd power */ d__1 = MAGMA_Z_REAL(vr[i__3]); /* Computing 2nd power */ d__2 = MAGMA_Z_IMAG(vr[k + i__ * vr_dim1]); rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = cblas_idamax(n, &rwork[irwork], 1)+1; z__2 = MAGMA_Z_CNJG(vr[k + i__ * vr_dim1]); d__1 = magma_dsqrt(rwork[irwork + k - 1]); MAGMA_Z_DSCALE(z__1, z__2, d__1); tmp = z__1; cblas_zscal(n, CBLAS_SADDR(tmp), &vr[i__ * vr_dim1 + 1], 1); i__2 = k + i__ * vr_dim1; i__3 = k + i__ * vr_dim1; d__1 = MAGMA_Z_REAL(vr[i__3]); MAGMA_Z_SET2REAL(z__1, d__1); vr[i__2] = z__1; } } /* Undo scaling if necessary */ L50: if (scalea) { i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, geev_w_array + *info, &i__2, &ierr); if (*info > 0) { i__1 = ilo - 1; lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, geev_w_array, &n, &ierr); } } #if defined(VERSION3) magma_free( dT ); #endif return *info; } /* magma_zgeev */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlansy */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A; double *h_work; double *d_A; double *d_work; magma_int_t N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; bool mkl_warning = false; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_dlansy( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL ); if ( norm_magma != -1 ) { printf( "expected magmablas_dlansy to return -1 error, but got %f\n", norm_magma ); status = 1; } } } printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif printf(" N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error \n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = roundup( N, opts.pad ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(double) / 1e9; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_dlarnv( &idist, ISEED, &n2, h_A ); //magma_dmake_symmetric( N, h_A, lda ); // make diagonal real -- according to docs, should NOT be necesary //for( int i=0; i < N; ++i ) { // h_A[i + i*lda] = MAGMA_D_MAKE( MAGMA_D_REAL( h_A[i + i*lda] ), 0 ); //} magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because it isn't supported on this GPU\n", (int) N, lapacke_norm_const( norm[inorm] )); continue; } if (norm_magma < 0) printf("magmablas_dlansy returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); norm_lapack = lapackf77_dlansy( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) printf("lapackf77_dlansy returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); /* ===================================================================== Check the result compared to LAPACK Note: MKL (11.1.0) has bug for uplo=Lower with multiple threads. Try with $MKL_NUM_THREADS = 1. =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; double tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in fabs. #if defined(PRECISION_s) || defined(PRECISION_d) tol2 = 0; #endif } if ( error > tol2 && norm[inorm] == MagmaInfNorm && uplo[iuplo] == MagmaLower ) { mkl_warning = true; } printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error <= tol2 ? "ok" : "failed") ); status += ! (error <= tol2); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } } // end iuplo, inorm, iter printf( "\n" ); } if ( mkl_warning ) { printf("* Some versions of MKL (e.g., 11.1.0) have a bug in dlansy with uplo=L\n" " and multiple threads. Try again with MKL_NUM_THREADS=1.\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zhemm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 ); magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double error=0., errorbis=0., work[1]; magmaDoubleComplex *hA, *hX, *hB, *hR; magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1]; magmaDoubleComplex_ptr dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M ); TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz ); TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M ); } lapackf77_zlarnv( &ione, iseed, &size, hA ); magma_zmake_hermitian( M, hA, lda ); size = lda*N; lapackf77_zlarnv( &ione, iseed, &size, hX ); lapackf77_zlarnv( &ione, iseed, &size, hB ); lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_zsetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex)); trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_zhemm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_zsetmatrix( M, M, hA, lda, dA2, ldda ); magma_zsetmatrix( M, N, hX, lda, dX[0], ldda ); magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_zhemm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_zprint( M, M, hA, lda ); //printf( "X =" ); magma_zprint( M, N, hX, lda ); //printf( "B =" ); magma_zprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_zhemm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_zprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 2. * opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaDoubleComplex *A, *B; double error_cblas, error_fblas, error_inline; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t i, j, k, m, n, size, maxn, ld; // complex x for magma, cblas, fortran, inline blas respectively magmaDoubleComplex x2_m, x2_c, x2_f, x2_i; // real x for magma, cblas, fortran, inline blas respectively double x_m, x_c, x_f, x_i; MAGMA_UNUSED( x_c ); MAGMA_UNUSED( x_f ); MAGMA_UNUSED( x2_c ); MAGMA_UNUSED( x2_f ); MAGMA_UNUSED( x2_m ); magma_opts opts; opts.parse_opts( argc, argv ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; magma_int_t inc[] = { -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; magma_int_t ninc = sizeof(inc)/sizeof(*inc); magma_int_t maxinc = 0; for( i=0; i < ninc; ++i ) { maxinc = max( maxinc, abs(inc[i]) ); } printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfaults), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "!! If MAGMA_WITH_MKL or __APPLE__ are defined, known failures are skipped. !!\n" "\n" ); // tell user about disabled functions #ifndef HAVE_CBLAS printf( "n/a: HAVE_CBLAS not defined, so no cblas functions tested.\n\n" ); #endif #if defined(MAGMA_WITH_MKL) printf( "n/a: cblas_zdotc, cblas_zdotu, blasf77_zdotc, and blasf77_zdotu are disabled with MKL, due to segfaults.\n\n" ); #endif #if defined(__APPLE__) printf( "n/a: blasf77_zdotc and blasf77_zdotu are disabled on MacOS, due to segfaults.\n\n" ); #endif printf( "%% Error w.r.t. Error w.r.t. Error w.r.t.\n" "%% M N K incx incy Function CBLAS Fortran BLAS inline\n" "%%====================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { if ( itest > 0 ) { printf( "%%----------------------------------------------------------------------\n" ); } m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * maxinc; ld = max( 1, maxn ); size = ld*maxn; TESTING_MALLOC_CPU( A, magmaDoubleComplex, size ); TESTING_MALLOC_CPU( B, magmaDoubleComplex, size ); // initialize matrices lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); // ----- test DZASUM for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dzasum( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dzasum( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dzasum( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation x_i = 0; for( i=0; i < m; ++i ) { x_i += MAGMA_Z_ABS1( *A(i*incx,j) ); // |real(Aij)| + |imag(Aij)| } error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dzasum xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dzasum", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test DZNRM2 // get two-norm of column j of A for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dznrm2( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dznrm2( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dznrm2( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation (poor -- doesn't scale) x_i = 0; for( i=0; i < m; ++i ) { x_i += real( *A(i*incx,j) ) * real( *A(i*incx,j) ) + imag( *A(i*incx,j) ) * imag( *A(i*incx,j) ); // same: real( conj( *A(i*incx,j) ) * *A(i*incx,j) ); } x_i = sqrt( x_i ); error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dznrm2 xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dznrm2", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test ZDOTC // dot columns, Aj^H Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += conj( *A(A_offset + i*incx,j) ) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotc xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotc", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } printf( "\n" ); // ----- test ZDOTU // dot columns, Aj^T * Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += *A(A_offset + i*incx,j) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotu xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotu", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } // cleanup TESTING_FREE_CPU( A ); TESTING_FREE_CPU( B ); fflush( stdout ); } // itest, incx, incy opts.cleanup(); TESTING_FINALIZE(); return gStatus; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, N ); magma_zmake_hermitian( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_zhegvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); double maxw=0, diff=0; for(int j=0; j<m2; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m2*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( " | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
magma_int_t magma_ztrevc3_mt( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in Fortran magma_int_t n, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *VL, magma_int_t ldvl, magmaDoubleComplex *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, magmaDoubleComplex *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, #endif magma_int_t *info ) { #define T(i,j) ( T + (i) + (j)*ldt ) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define work(i,j) (work + (i) + (j)*n) // .. Parameters .. const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magma_int_t nbmin = 16, nbmax = 128; const magma_int_t ione = 1; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, rightv, somev; magma_int_t i, ii, is, j, k, ki, iv, n2, nb, nb2, version; double ovfl, remax, unfl; //smlnum, smin, ulp // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); // Set mout to the number of columns required to store the selected // eigenvectors. if ( somev ) { *mout = 0; for( j=0; j < n; ++j ) { if ( select[j] ) { *mout += 1; } } } else { *mout = n; } *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( mm < *mout ) *info = -11; else if ( lwork < max( 1, 2*n ) ) *info = -14; if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x. // (Compared to dtrevc3, rwork stores 1-norms.) // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_zlaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Set the constants to control overflow. unfl = lapackf77_dlamch( "Safe minimum" ); ovfl = 1. / unfl; lapackf77_dlabad( &unfl, &ovfl ); //ulp = lapackf77_dlamch( "Precision" ); //smlnum = unfl*( n / ulp ); // Store the diagonal elements of T in working array work. for( i=0; i < n; ++i ) { *work(i,0) = *T(i,i); } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. rwork[0] = 0.; for( j=1; j < n; ++j ) { rwork[j] = magma_cblas_dzasum( j, T(0,j), ione ); } // launch threads -- each single-threaded MKL magma_int_t nthread = magma_get_parallel_numthreads(); magma_int_t lapack_nthread = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); magma_thread_queue queue; queue.launch( nthread ); //printf( "nthread %d, %d\n", nthread, lapack_nthread ); // gemm_nb = N/thread, rounded up to multiple of 16, // but avoid multiples of page size, e.g., 512*8 bytes = 4096. magma_int_t gemm_nb = magma_int_t( ceil( ceil( ((double)n) / nthread ) / 16. ) * 16. ); if ( gemm_nb % 512 == 0 ) { gemm_nb += 32; } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=nb, goes down to 1. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } //smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex right eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k=0; k < ki; ++k ) { *work(k,iv) = -(*T(k,ki)); } // Solve upper triangular system: // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work. if ( ki > 0 ) { queue.push_task( new magma_zlatrsd_task( MagmaUpper, MagmaNoTrans, MagmaNonUnit, MagmaTrue, ki, T, ldt, *T(ki,ki), work(0,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize queue.sync(); n2 = ki+1; blasf77_zcopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_izamax( &n2, VR(0,is), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,is) ); blasf77_zdscal( &n2, &remax, VR(0,is), &ione ); for( k=ki+1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { blasf77_zgemv( "n", &n, &ki, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_izamax( &n, VR(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,ki) ); blasf77_zdscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki+1; k < n; ++k ) { *work(k,iv) = c_zero; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == 1) || (ki == 0) ) { queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki+nb-iv+1; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += gemm_nb ) { magma_int_t ib = min( gemm_nb, n-i ); queue.push_task( new zgemm_task( MagmaNoTrans, MagmaNoTrans, ib, nb2, n2, c_one, VR(i,0), ldvr, work(0,iv ), n, c_zero, work(i,nb+iv), n )); } queue.sync(); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k = iv; k <= nb; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // blocked back-transform is -= 1; } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; is = 0; for( ki=0; ki < n; ++ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } //smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex left eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k = ki + 1; k < n; ++k ) { *work(k,iv) = -MAGMA_Z_CONJ( *T(ki,k) ); } // Solve conjugate-transposed triangular system: // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work. // TODO what happens with T(k,k) - lambda is small? Used to have < smin test. if ( ki < n-1 ) { n2 = n-ki-1; queue.push_task( new magma_zlatrsd_task( MagmaUpper, MagmaConjTrans, MagmaNonUnit, MagmaTrue, n2, T(ki+1,ki+1), ldt, *T(ki,ki), work(ki+1,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize queue.sync(); n2 = n-ki; blasf77_zcopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_izamax( &n2, VL(ki,is), &ione ) + ki - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,is) ); blasf77_zdscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); if ( ki < n-1 ) { n2 = n-ki-1; blasf77_zgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_izamax( &n, VL(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,ki) ); blasf77_zdscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == nb) || (ki == n-1) ) { queue.sync(); n2 = n-(ki+1)+iv; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += gemm_nb ) { magma_int_t ib = min( gemm_nb, n-i ); queue.push_task( new zgemm_task( MagmaNoTrans, MagmaNoTrans, ib, iv, n2, c_one, VL(i,ki-iv+1), ldvl, work(ki-iv+1,1), n, c_zero, work(i,nb+1), n )); } queue.sync(); // normalize vectors for( k=1; k <= iv; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform is += 1; } } // close down threads queue.quit(); magma_set_lapack_numthreads( lapack_nthread ); return *info; } // End of ZTREVC
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_X; magmaDouble_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default lddb = ldda; gflops = ( FLOPS_DGETRF( N, N ) + FLOPS_DGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_dsetmatrix( N, nrhs, h_B, ldb, d_B, lddb, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb, opts.queue ); Anorm = lapackf77_dlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_dlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_dlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetri_batched */ int main( int argc, char** argv) { TESTING_INIT(); // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work; magmaDoubleComplex_ptr d_A, d_invA; magmaDoubleComplex_ptr *dA_array; magmaDoubleComplex_ptr *dinvA_array; magma_int_t **dipiv_array; magma_int_t *dinfo_array; magma_int_t *ipiv, *cpu_info; magma_int_t *d_ipiv, *d_info; magma_int_t N, n2, lda, ldda, info, info1, info2, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t columns; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); magma_int_t batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N * batchCount; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default // This is the correct flops but since this getri_batched is based on // 2 trsm = getrs and to know the real flops I am using the getrs one //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount; gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); columns = N * batchCount; lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda ); magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue); info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue ); for (magma_int_t i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] ); } } if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 )); if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (int i=0; i < batchCount; i++) { magma_int_t locinfo; lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo); if (locinfo != 0) { printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo ); if (locinfo != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue ); magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue ); error = 0; for (magma_int_t i=0; i < batchCount; i++) { for (magma_int_t k=0; k < N; k++) { if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) { printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]); error = -1; } } if (error == -1) { break; } // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond, err; normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork ); normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; err = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda ); blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda ); err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork ); err = err * rcond / N; } if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf("\n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_invA ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_info ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dinvA_array ); TESTING_FREE_DEV( dinfo_array ); TESTING_FREE_DEV( dipiv_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork, lda, aux_iwork[1]; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double result[3], eps, aux_work[1]; eps = lapackf77_dlamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; // query for workspace sizes magma_dsyevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, aux_work, -1, aux_iwork, -1, opts.queue, &info ); lwork = (magma_int_t) aux_work[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, N*lda ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* warm up run */ if ( opts.warmup ) { magma_dsyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, opts.queue, &info ); if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, opts.queue, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ double temp1, temp2; // tau=NULL is unused since itype=1 lapackf77_dsyt21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, h_work, h_R, &lda, h_R, &lda, NULL, h_work, &result[0] ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dsyevd( MagmaNoVec, opts.uplo, N, h_R, lda, w2, h_work, lwork, iwork, liwork, opts.queue, &info ); if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsyevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e %s\n", result[0]*eps, (result[0]*eps < tol ? "ok" : "failed") ); printf("(2) | I - U'U | / N = %8.2e %s\n", result[1]*eps, (result[1]*eps < tol ? "ok" : "failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2] , (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0]*eps < tol && result[1]*eps < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R, *work; magmaDoubleComplex_ptr d_A, dwork; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_zgetri_nb( N ); gflops = FLOPS_ZGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_zgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DSYEVD computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_dsyev_driver ********************************************************************/ extern "C" magma_int_t magma_dsyevd_m(magma_int_t nrgpu, magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *w, double *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; magma_int_t inde; double anrm; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; double smlnum; magma_int_t lquery; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((liwork < liwmin) && ! lquery) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = A[0]; if (wantz) { A[0] = 1.; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_dsyevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_dlansy("M", uplo_, &n, A, &lda, work); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_dlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } /* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */ // dsytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // dstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); magma_dsytrd_mgpu(nrgpu, 1, uplo, n, A, lda, w, &work[inde], &work[indtau], &work[indwrk], llwork, &iinfo); timer_stop( time ); timer_printf( "time dsytrd = %6.2f\n", time ); /* For eigenvalues only, call DSTERF. For eigenvectors, first call DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call DORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf(&n, w, &work[inde], info); } else { timer_start( time ); #ifdef USE_SINGLE_GPU if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dstedx(MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); magma_free( dwork ); #else magma_dstedx_m(nrgpu, MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, info); #endif timer_stop( time ); timer_printf( "time dstedc = %6.2f\n", time ); timer_start( time ); magma_dormtr_m(nrgpu, MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, &iinfo); lapackf77_dlacpy("A", &n, &n, &work[indwrk], &n, A, &lda); timer_stop( time ); timer_printf( "time dormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_dscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; return *info; } /* magma_dsyevd_m */
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, ldda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf("%% M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf("%% M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Xm, X, incx, dX, incx, opts.queue ); magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_zgetvector( Ym, dY, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work ); double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work ); blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); bool okay = (magma_error < tol) && (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (okay ? "ok" : "failed")); #else bool okay = (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (okay ? "ok" : "failed")); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zcgeqrsv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs; double error, gpu_error, cpu_error, Anorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R; magmaDoubleComplex_ptr d_A, d_B, d_X, d_T; magmaFloatComplex *d_SA, *d_SB; magmaDoubleComplex *h_workd, *tau, tmp[1]; magmaFloatComplex *h_works; magma_int_t lda, ldb, lhwork, lworkgpu; magma_int_t ldda, lddb, lddx; magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS double double single mixed Iter CPU GPU \n"); printf("=============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32) * 32; lddb = ((max_mn+31)/32)*32; lddx = ((N+31)/32) * 32; nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) ); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork ); h_works = (magmaFloatComplex*)h_workd; TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs ); TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb ); /* Initialize the matrices */ size = lda*N; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcgeqrsv_gpu( M, N, nrhs, d_A, ldda, d_B, lddb, d_X, lddx, &qrsv_iters, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zcgeqrsv returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb ); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_workd, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfd = gflops / gpu_time; //===================================================================== // Single Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); /* The allocation of d_SA and d_SB is done here to avoid * to double the memory used on GPU with zcgeqrsv */ TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs ); magmablas_zlag2c( M, N, d_A, ldda, d_SA, ldda, &info ); magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info ); gpu_time = magma_wtime(); magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda, d_SB, lddb, h_works, lhwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfs = gflops / gpu_time; TESTING_FREE_DEV( d_SA ); TESTING_FREE_DEV( d_SB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f %7.2f %7.2f %7.2f %4d %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, (int) nrhs, cpu_perf, gpu_perfd, gpu_perfs, gpu_perf, (int) qrsv_iters, cpu_error, gpu_error, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_T ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_T, *ddA, *dtau; double *d_A2, *d_T2, *ddA2, *dtau2; double *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 10. * opts.tolerance * lapackf77_dlamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because dgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because dgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); TESTING_MALLOC_DEV( d_T2, double, N*N ); TESTING_MALLOC_DEV( ddA2, double, N*N ); TESTING_MALLOC_DEV( dtau2, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_dlaset cudaMemset(ddA, 0, N*N*sizeof(double)); cudaMemset(d_T, 0, N*N*sizeof(double)); cudaMemset(ddA2, 0, N*N*sizeof(double)); cudaMemset(d_T2, 0, N*N*sizeof(double)); lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_dgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_dgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_dgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_dgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_dgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); magma_dgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_dgetmatrix( N, N, d_T, N, h_T, N ); double terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; //*h_R1 is used for warm-up magmaDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *h_R1; magmaDoubleComplex_ptr dT; double *rwork; double result[2] = {0., 0.}; double eps; int checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; /* Matrix size */ int N=0, n2, lda, nb, lwork, ltwork, once = 0; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,9900}; #endif int i, info; int ione = 1; int ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if ( N > 0 ){ printf(" testing_zgehrd -N %d\n\n", N); once = 1; } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC_HOST( h_A , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( tau , magmaDoubleComplex, N ); TESTING_MALLOC_HOST( h_R , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R1 , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV ( dT , magmaDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_MALLOC_HOST( h_Q, magmaDoubleComplex, lda*N ); TESTING_MALLOC_HOST( twork, magmaDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_HOST( rwork, double, N ); #endif } printf("\n\n"); printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; gflops = FLOPS( (double)N ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R1, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zgehrd ( N, ione, N, h_R1, lda, tau, h_work, lwork, dT, 0, &info, queue); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); clFinish(queue); gpu_time = get_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, 0, &info, queue); gpu_time = get_time() - gpu_time; if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, 0, nb, &info, queue); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgehrd(&N, &ione, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = get_time() - cpu_time; if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_FREE_HOST( h_work); TESTING_FREE_HOST( h_R ); TESTING_FREE_HOST( h_R1 ); TESTING_FREE_DEV ( dT ); if ( checkres ) { TESTING_FREE_HOST( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZPOTRF( N ) / 1e9; TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hpd( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; double error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaDoubleComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_Z_REAL(temp); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_zhesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }