/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; cuFloatComplex *h_A, *h_R; cuFloatComplex *d_A; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info; const char *uplo = MagmaUpperStr; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotri_gpu -N %d\n\n", 1024); } /* Allocate host memory for the matrix */ n2 = size[9] * size[9]; ldda = ((size[9]+31)/32) * 32; TESTING_MALLOC( h_A, cuFloatComplex, n2); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2); TESTING_DEVALLOC( d_A, cuFloatComplex, ldda*size[9] ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS_CPOTRI( (float)N ) / 1000000; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*lda+i], ( MAGMA_C_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConjf(h_A[j*lda+i]); } } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //cublasSetMatrix( N, N, sizeof(cuFloatComplex), h_A, lda, d_A, ldda); //magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); /* factorize matrix */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); // check for exact singularity //magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); //magma_csetmatrix( N, N, h_R, lda, d_A, ldda ); start = get_current_time(); magma_cpotri_gpu(uplo[0], N, d_A, ldda, &info); end = get_current_time(); if (info != 0) printf("magma_cpotri_gpu returned error %d\n", (int) info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_cpotrf(uplo, &N, h_A, &lda, &info); start = get_current_time(); lapackf77_cpotri(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info != 0) printf("lapackf77_cpotri returned error %d\n", (int) info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_clange("f", &N, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgelqf_gpu */ int main( int argc, char** argv) { TESTING_INIT(); const float d_neg_one = MAGMA_D_NEG_ONE; const float d_one = MAGMA_D_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float Anorm, error=0, error2=0; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_A; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |L - A*Q^H| |I - Q*Q^H|\n"); printf("%%==============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default n2 = lda*N; nb = magma_get_cgeqrf_nb( M, N ); gflops = FLOPS_CGELQF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgelqf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max( lwork, M*nb ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue ); gpu_time = magma_wtime(); magma_cgelqf_gpu( M, N, d_A, ldda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cgelqf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the result, following zlqt01 except using the reduced Q. This works for any M,N (square, tall, wide). =================================================================== */ if ( opts.check ) { magma_cgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue ); magma_int_t ldq = min_mn; magma_int_t ldl = M; magmaFloatComplex *Q, *L; float *work; TESTING_MALLOC_CPU( Q, magmaFloatComplex, ldq*N ); // K by N TESTING_MALLOC_CPU( L, magmaFloatComplex, ldl*min_mn ); // M by K TESTING_MALLOC_CPU( work, float, min_mn ); // generate K by N matrix Q, where K = min(M,N) lapackf77_clacpy( "Upper", &min_mn, &N, h_R, &lda, Q, &ldq ); lapackf77_cunglq( &min_mn, &N, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy N by K matrix L lapackf77_claset( "Upper", &M, &min_mn, &c_zero, &c_zero, L, &ldl ); lapackf77_clacpy( "Lower", &M, &min_mn, h_R, &lda, L, &ldl ); // error = || L - A*Q^H || / (N * ||A||) blasf77_cgemm( "NoTrans", "Conj", &M, &min_mn, &N, &c_neg_one, h_A, &lda, Q, &ldq, &c_one, L, &ldl ); Anorm = lapackf77_clange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_clange( "1", &M, &min_mn, L, &ldl, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set L = I (K by K), then L = I - Q*Q^H // error = || I - Q*Q^H || / N lapackf77_claset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, L, &ldl ); blasf77_cherk( "Upper", "NoTrans", &min_mn, &N, &d_neg_one, Q, &ldq, &d_one, L, &ldl ); error2 = safe_lapackf77_clanhe( "1", "Upper", &min_mn, L, &ldl, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( L ); L = NULL; TESTING_FREE_CPU( work ); work = NULL; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgelqf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapack_cgelqf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { bool okay = (error < tol && error2 < tol); printf( "error %.4g, error2 %.4g, tol %.4g, okay %d\n", error, error2, tol, okay ); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cheevx(char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *z, magma_int_t ldz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CHEEVX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. ABSTOL (input) REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ) , where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*SLAMCH('S'), not zero. If this routine returns with INFO>0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*SLAMCH('S'). See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) REAL array, dimension (N) On normal exit, the first M elements contain the selected eigenvalues in ascending order. Z (output) COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = 'V', then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. If JOBZ = 'N', then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = 'V', the exact value of M is not known in advance and an upper bound must be used. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = 'V', LDZ >= max(1,N). WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,2*N-1). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the max of the blocksize for CHETRD and for CUNMTR as returned by ILAENV. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) REAL array, dimension (7*N) IWORK (workspace) INTEGER array, dimension (5*N) IFAIL (output) INTEGER array, dimension (N) If JOBZ = 'V', then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = 'N', then IFAIL is not referenced. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, then i eigenvectors failed to converge. Their indices are stored in array IFAIL. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magma_int_t izero = 0; magma_int_t ione = 1; char order[1]; magma_int_t indd, inde; magma_int_t imax; magma_int_t lopt, itmp1, indee; magma_int_t lower, wantz; magma_int_t i, j, jj, i__1; magma_int_t alleig, valeig, indeig; magma_int_t iscale, indibl; magma_int_t indiwk, indisp, indtau; magma_int_t indrwk, indwrk; magma_int_t llwork, nsplit; magma_int_t lquery; magma_int_t iinfo; float safmin; float bignum; float smlnum; float eps, tmp1; float anrm; float sigma, d__1; float rmin, rmax; /* Function Body */ lower = lapackf77_lsame(uplo_, MagmaLowerStr); wantz = lapackf77_lsame(jobz_, MagmaVecStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -15; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb(n); lopt = n * (nb + 1); work[0] = MAGMA_C_MAKE( lopt, 0 ); if (lwork < lopt && ! lquery) { *info = -17; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } *m = 0; /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevx(jobz_, range_, uplo_, &n, a, &lda, &vl, &vu, &il, &iu, &abstol, m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, info); return *info; } --w; --work; --rwork; --iwork; --ifail; /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, a, &lda, &rwork[1]); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { d__1 = 1.; lapackf77_clascl(uplo_, &izero, &izero, &d__1, &sigma, &n, &n, a, &lda, info); if (abstol > 0.) { abstol *= sigma; } if (valeig) { vl *= sigma; vu *= sigma; } } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ indd = 1; inde = indd + n; indrwk = inde + n; indtau = 1; indwrk = indtau + n; llwork = lwork - indwrk + 1; magma_chetrd(uplo, n, a, lda, &rwork[indd], &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); lopt = n + (magma_int_t)MAGMA_C_REAL(work[indwrk]); /* If all eigenvalues are desired and ABSTOL is less than or equal to zero, then call SSTERF or CUNGTR and CSTEQR. If this fails for some eigenvalue, then try SSTEBZ. */ if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) { blasf77_scopy(&n, &rwork[indd], &ione, &w[1], &ione); indee = indrwk + 2*n; if (! wantz) { i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_ssterf(&n, &w[1], &rwork[indee], info); } else { lapackf77_clacpy("A", &n, &n, a, &lda, z, &ldz); lapackf77_cungtr(uplo_, &n, z, &ldz, &work[indtau], &work[indwrk], &llwork, &iinfo); i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_csteqr(jobz_, &n, &w[1], &rwork[indee], z, &ldz, &rwork[indrwk], info); if (*info == 0) { for (i = 1; i <= n; ++i) { ifail[i] = 0; } } } if (*info == 0) { *m = n; } } /* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. */ if (*m == 0) { *info = 0; if (wantz) { *(unsigned char *)order = 'B'; } else { *(unsigned char *)order = 'E'; } indibl = 1; indisp = indibl + n; indiwk = indisp + n; lapackf77_sstebz(range_, order, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m, &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info); if (wantz) { lapackf77_cstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp], z, &ldz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info); /* Apply unitary matrix used in reduction to tridiagonal form to eigenvectors returned by CSTEIN. */ magma_cunmtr(MagmaLeft, uplo, MagmaNoTrans, n, *m, a, lda, &work[indtau], z, ldz, &work[indwrk], llwork, &iinfo); } } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = *m; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, &w[1], &ione); } /* If eigenvalues are not in order, then sort them, along with eigenvectors. */ if (wantz) { for (j = 1; j <= *m-1; ++j) { i = 0; tmp1 = w[j]; for (jj = j + 1; jj <= *m; ++jj) { if (w[jj] < tmp1) { i = jj; tmp1 = w[jj]; } } if (i != 0) { itmp1 = iwork[indibl + i - 1]; w[i] = w[j]; iwork[indibl + i - 1] = iwork[indibl + j - 1]; w[j] = tmp1; iwork[indibl + j - 1] = itmp1; blasf77_cswap(&n, z + (i-1)*ldz, &ione, z + (j-1)*ldz, &ione); if (*info != 0) { itmp1 = ifail[i]; ifail[i] = ifail[j]; ifail[j] = itmp1; } } } } /* Set WORK(1) to optimal complex workspace size. */ work[1] = MAGMA_C_MAKE( lopt, 0 ); return *info; } /* magma_cheevx */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); cuFloatComplex *h_A, *h_R, *h_work; float *rwork, *w1, *w2; magma_int_t *iwork; float gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, info; magma_int_t ione = 1, izero = 0; magma_int_t ISEED[4] = {0,0,0,1}; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVectorsStr; magma_int_t checkres; float result[3], eps = lapackf77_slamch( "E" ); if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); } else if ( strcmp("-JV", argv[i]) == 0 ) { jobz = MagmaVectorsStr; } else if ( strcmp("-JN", argv[i]) == 0 ) { jobz = MagmaNoVectorsStr; } } if (N>0) printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; if ( checkres && jobz[0] == MagmaNoVectors ) { printf( "Cannot check results when vectors are not computed (jobz='N')\n" ); checkres = false; } /* Query for workspace sizes */ cuFloatComplex aux_work[1]; float aux_rwork[1]; magma_int_t aux_iwork[1]; magma_cheevd( jobz[0], uplo[0], N, h_R, N, w1, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); magma_int_t lwork, lrwork, liwork; lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuFloatComplex, N*N ); TESTING_MALLOC( w1, float , N ); TESTING_MALLOC( w2, float , N ); TESTING_HOSTALLOC( h_R, cuFloatComplex, N*N ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); TESTING_MALLOC( rwork, float, lrwork ); TESTING_MALLOC( iwork, magma_int_t, liwork ); printf(" N CPU Time(s) GPU Time(s) \n"); printf("===================================\n"); for(i=0; i<8; i++){ if (argc==1){ N = size[i]; } n2 = N*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for( int i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* warm up run */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* query for optimal workspace sizes */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, -1, rwork, -1, iwork, -1, &info); int lwork_save = lwork; int lrwork_save = lrwork; int liwork_save = liwork; lwork = min( lwork, (magma_int_t) MAGMA_C_REAL( h_work[0] )); lrwork = min( lrwork, (magma_int_t) rwork[0] ); liwork = min( liwork, iwork[0] ); //printf( "lwork %d, query %d, used %d; liwork %d, query %d, used %d\n", // lwork_save, (magma_int_t) h_work[0], lwork, // liwork_save, iwork[0], liwork ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; lwork = lwork_save; lrwork = lrwork_save; liwork = liwork_save; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; cuFloatComplex *tau; lapackf77_chet21(&ione, uplo, &N, &izero, h_A, &N, w1, w1, h_R, &N, h_R, &N, tau, h_work, rwork, &result[0]); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); magma_cheevd('N', uplo[0], N, h_R, N, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cheevd(jobz, uplo, &N, h_A, &N, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of cheevd had an illegal value.\n", (int) -info); cpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); if ( checkres ){ printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %e\n", result[0]*eps); printf("(2) | I - U'U | / N = %e\n", result[1]*eps); printf("(3) | S(w/ U)-S(w/o U)|/ |S| = %e\n\n", result[2]); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_FREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- CHEEVX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array, dimension (LDDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[in] abstol REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ), \n where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. \n Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*SLAMCH('S'), not zero. If this routine returns with INFO > 0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*SLAMCH('S'). \n See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) On normal exit, the first M elements contain the selected eigenvalues in ascending order. @param[out] dZ COMPLEX array, dimension (LDDZ, max(1,M)) If JOBZ = MagmaVec, then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. If JOBZ = MagmaNoVec, then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = MagmaRangeV, the exact value of M is not known in advance and an upper bound must be used. ********* (workspace) If FAST_HEMV is defined DZ should be (LDDZ, max(1,N)) in both cases. @param[in] lddz INTEGER The leading dimension of the array DZ. LDDZ >= 1, and if JOBZ = MagmaVec, LDDZ >= max(1,N). @param wA (workspace) COMPLEX array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param wZ (workspace) COMPLEX array, dimension (LDWZ, max(1,M)) @param[in] ldwz INTEGER The leading dimension of the array wZ. LDWZ >= 1, and if JOBZ = MagmaVec, LDWZ >= max(1,N). @param[out] work (workspace) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (NB+1)*N, where NB is the max of the blocksize for CHETRD. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (7*N) @param iwork (workspace) INTEGER array, dimension (5*N) @param[out] ifail INTEGER array, dimension (N) If JOBZ = MagmaVec, then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = MagmaNoVec, then IFAIL is not referenced. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, then i eigenvectors failed to converge. Their indices are stored in array IFAIL. @ingroup magma_cheev_driver ********************************************************************/ extern "C" magma_int_t magma_cheevx_gpu( magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex_ptr dZ, magma_int_t lddz, magmaFloatComplex *wA, magma_int_t ldwa, magmaFloatComplex *wZ, magma_int_t ldwz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); const char* range_ = lapack_range_const( range ); magma_int_t ione = 1; const char* order_; magma_int_t indd, inde; magma_int_t imax; magma_int_t lopt, itmp1, indee; magma_int_t lower, wantz; magma_int_t i, j, jj, i__1; magma_int_t alleig, valeig, indeig; magma_int_t iscale, indibl; magma_int_t indiwk, indisp, indtau; magma_int_t indrwk, indwrk; magma_int_t llwork, nsplit; magma_int_t lquery; magma_int_t iinfo; float safmin; float bignum; float smlnum; float eps, tmp1; float anrm; float sigma, d__1; float rmin, rmax; magmaFloat_ptr dwork; /* Function Body */ lower = (uplo == MagmaLower); wantz = (jobz == MagmaVec); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (lddz < 1 || (wantz && lddz < n)) { *info = -15; } else if (ldwa < max(1,n)) { *info = -17; } else if (ldwz < 1 || (wantz && ldwz < n)) { *info = -19; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb(n); lopt = n * (nb + 1); work[0] = MAGMA_C_MAKE( lopt, 0 ); if (lwork < lopt && ! lquery) { *info = -21; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } *m = 0; /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif magmaFloatComplex *a; magma_cmalloc_cpu( &a, n*n ); magma_cgetmatrix(n, n, dA, ldda, a, n); lapackf77_cheevx(jobz_, range_, uplo_, &n, a, &n, &vl, &vu, &il, &iu, &abstol, m, w, wZ, &ldwz, work, &lwork, rwork, iwork, ifail, info); magma_csetmatrix( n, n, a, n, dA, ldda); magma_csetmatrix( n, *m, wZ, ldwz, dZ, lddz); magma_free_cpu(a); return *info; } if (MAGMA_SUCCESS != magma_smalloc( &dwork, n )) { fprintf (stderr, "!!!! device memory allocation error (magma_cheevx_gpu)\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } --w; --work; --rwork; --iwork; --ifail; /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_clanhe(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { d__1 = 1.; magmablas_clascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); if (abstol > 0.) { abstol *= sigma; } if (valeig) { vl *= sigma; vu *= sigma; } } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ indd = 1; inde = indd + n; indrwk = inde + n; indtau = 1; indwrk = indtau + n; llwork = lwork - indwrk + 1; #ifdef FAST_HEMV magma_chetrd2_gpu(uplo, n, dA, ldda, &rwork[indd], &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dZ, lddz*n, &iinfo); #else magma_chetrd_gpu (uplo, n, dA, ldda, &rwork[indd], &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif lopt = n + (magma_int_t)MAGMA_C_REAL(work[indwrk]); /* If all eigenvalues are desired and ABSTOL is less than or equal to zero, then call SSTERF or CUNGTR and CSTEQR. If this fails for some eigenvalue, then try SSTEBZ. */ if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) { blasf77_scopy(&n, &rwork[indd], &ione, &w[1], &ione); indee = indrwk + 2*n; if (! wantz) { i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_ssterf(&n, &w[1], &rwork[indee], info); } else { lapackf77_clacpy("A", &n, &n, wA, &ldwa, wZ, &ldwz); lapackf77_cungtr(uplo_, &n, wZ, &ldwz, &work[indtau], &work[indwrk], &llwork, &iinfo); i__1 = n - 1; blasf77_scopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione); lapackf77_csteqr(jobz_, &n, &w[1], &rwork[indee], wZ, &ldwz, &rwork[indrwk], info); if (*info == 0) { for (i = 1; i <= n; ++i) { ifail[i] = 0; } magma_csetmatrix( n, n, wZ, ldwz, dZ, lddz ); } } if (*info == 0) { *m = n; } } /* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. */ if (*m == 0) { *info = 0; if (wantz) { order_ = "B"; } else { order_ = "E"; } indibl = 1; indisp = indibl + n; indiwk = indisp + n; lapackf77_sstebz(range_, order_, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m, &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info); if (wantz) { lapackf77_cstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp], wZ, &ldwz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info); magma_csetmatrix( n, *m, wZ, ldwz, dZ, lddz ); /* Apply unitary matrix used in reduction to tridiagonal form to eigenvectors returned by CSTEIN. */ magma_cunmtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dZ, lddz, wA, ldwa, &iinfo); } } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = *m; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, &w[1], &ione); } /* If eigenvalues are not in order, then sort them, along with eigenvectors. */ if (wantz) { for (j = 1; j <= *m-1; ++j) { i = 0; tmp1 = w[j]; for (jj = j + 1; jj <= *m; ++jj) { if (w[jj] < tmp1) { i = jj; tmp1 = w[jj]; } } if (i != 0) { itmp1 = iwork[indibl + i - 1]; w[i] = w[j]; iwork[indibl + i - 1] = iwork[indibl + j - 1]; w[j] = tmp1; iwork[indibl + j - 1] = itmp1; magma_cswap(n, dZ + (i-1)*lddz, ione, dZ + (j-1)*lddz, ione); if (*info != 0) { itmp1 = ifail[i]; ifail[i] = ifail[j]; ifail[j] = itmp1; } } } } /* Set WORK[0] to optimal complex workspace size. */ work[1] = MAGMA_C_MAKE( lopt, 0 ); return *info; } /* magma_cheevx_gpu */
/** Purpose ------- CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. @param[in] T COMPLEX array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_cgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in CGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t d, i, ib, j, jb, ki, kk; magmaFloatComplex *work=NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = magma_roundup( m, 32 ); magma_int_t lddwork = magma_roundup( n, 32 ); magma_int_t min_lblocks = (n / nb) / ngpu; // min. blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t queues[ MagmaMaxGPUs ] = { NULL }; for( d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( d, &queues[d] ); } trace_init( 1, ngpu, 1, queues ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for larfb work // m*nb for V // nb*nb for T lwork = (n + m + nb) * nb; magma_cmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } magmaFloatComplex *work_T, *work_V; work_T = work + n*nb; work_V = work + n*nb + nb*nb; // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; // cungqr requires less workspace (n*nb), but is slow if k < cungqr's block size. // replacing it with the 4 routines below is much faster (e.g., 60x). //magma_int_t iinfo; //lapackf77_cungqr( &m_kk, &n_kk, &k_kk, // A(kk, kk), &lda, // &tau[kk], work, &lwork, &iinfo ); lapackf77_clacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk); lapackf77_claset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, work_V, &m_kk, &tau[kk], work_T, &k_kk); lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk ); if (kk > 0) { for( j=kk; j < n; j += nb ) { jb = min( n-j, nb ); d = (j / nb) % ngpu; di = ((j / nb) / ngpu) * nb; magma_setdevice( d ); magma_csetmatrix( m_kk, jb, A(kk, j), lda, dA(d, kk, di), ldda, queues[d] ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] ); } } trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] ); trace_gpu_end( d, 0 ); } // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to dV on the GPUs lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, queues[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_claset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] ); magmablas_claset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda, queues[dpanel] ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork, queues[d] ); trace_gpu_end( d, 0 ); } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues ); trace_cpu_end( 0 ); } #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif cleanup: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( queues[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); return *info; } /* magma_cungqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing claswp */ int main( int argc, char** argv) { /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magmaFloatComplex *h_A1, *h_A2, *h_A3, *h_AT; magmaFloatComplex_ptr d_A1; real_Double_t gpu_time, cpu_time1, cpu_time2; /* Matrix size */ int M=0, N=0, n2, lda, ldat; int size[7] = {1000,2000,3000,4000,5000,6000,7000}; int i, j; int ione = 1; int ISEED[4] = {0,0,0,1}; int *ipiv; int k1, k2, r, c, incx; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_claswp -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); M = N = size[6]; } lda = M; n2 = M*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A1, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A3, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_AT, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, n2 ); ipiv = (int*)malloc(M * sizeof(int)); if (ipiv == 0) { fprintf (stderr, "!!!! host memory allocation error (ipiv)\n"); } printf("\n\n"); printf(" M N CPU_BLAS (sec) CPU_LAPACK (sec) GPU (sec) \n"); printf("=============================================================================\n"); for(i=0; i<7; i++) { if(argc == 1){ M = N = size[i]; } lda = M; ldat = N; n2 = M*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A1 ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A1, &lda, h_A2, &lda ); for(r=0;r<M;r++){ for(c=0;c<N;c++){ h_AT[c+r*ldat] = h_A1[r+c*lda]; } } magma_csetmatrix( N, M, h_AT, 0, ldat, d_A1, 0, ldat, queue); for(j=0; j<M; j++) { ipiv[j] = (int)((rand()*1.*M) / (RAND_MAX * 1.)) + 1; } /* * BLAS swap */ /* Column Major */ cpu_time1 = magma_wtime(); for ( j=0; j<M; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A1+(ipiv[j]-1), &lda); } } cpu_time1 = magma_wtime() - cpu_time1; /* * LAPACK laswp */ cpu_time2 = magma_wtime(); k1 = 1; k2 = M; incx = 1; lapackf77_claswp(&N, h_A2, &lda, &k1, &k2, ipiv, &incx); cpu_time2 = magma_wtime() - cpu_time2; /* * GPU swap */ /* Col swap on transpose matrix*/ gpu_time = magma_wtime(); magma_cpermute_long2(N, d_A1, 0, ldat, ipiv, M, 0, queue); gpu_time = magma_wtime() - gpu_time; /* Check Result */ magma_cgetmatrix( N, M, d_A1, 0, ldat, h_AT, 0, ldat, queue); for(r=0;r<N;r++){ for(c=0;c<M;c++){ h_A3[c+r*lda] = h_AT[r+c*ldat]; } } int check_bl, check_bg, check_lg; check_bl = diffMatrix( h_A1, h_A2, M, N, lda ); check_bg = diffMatrix( h_A1, h_A3, M, N, lda ); check_lg = diffMatrix( h_A2, h_A3, M, N, lda ); printf("%5d %5d %6.2f %6.2f %6.2f %s %s %s\n", M, N, cpu_time1, cpu_time2, gpu_time, (check_bl == 0) ? "SUCCESS" : "FAILED", (check_bg == 0) ? "SUCCESS" : "FAILED", (check_lg == 0) ? "SUCCESS" : "FAILED"); if(check_lg !=0){ printf("lapack swap results:\n"); magma_cprint(M, N, h_A1, lda); printf("gpu swap transpose matrix result:\n"); magma_cprint(M, N, h_A3, lda); } if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A1 ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_A3 ); TESTING_FREE_CPU( h_AT ); TESTING_FREE_DEV( d_A1 ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clacpy_batched Code is very similar to testing_cgeadd_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B; magmaFloatComplex_ptr d_A, d_B; magmaFloatComplex **hAarray, **hBarray, **dAarray, **dBarray; magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("%% mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf("%% M N ntile CPU Gflop/s (ms) GPU Gflop/s (ms) check\n"); printf("%%================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gbytes = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); TESTING_MALLOC_CPU( hAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_CPU( hBarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dBarray, magmaFloatComplex*, ntile ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda, opts.queue ); // setup pointers for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(magmaFloatComplex*), hAarray, 1, dAarray, 1, opts.queue ); magma_setvector( ntile, sizeof(magmaFloatComplex*), hBarray, 1, dBarray, 1, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_clacpy_batched( MagmaFull, mb, nb, dAarray, ldda, dBarray, ldda, ntile, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*lda; lapackf77_clacpy( MagmaFullStr, &mb, &nb, &h_A[offset], &lda, &h_B[offset], &lda ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda, opts.queue ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_clange("f", &M, &N, h_B, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; magmaFloatComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif float *w1, result=0; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M ngpu MGPU Time (sec)\n"); printf("====================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_cbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_Binit, magmaFloatComplex, n2 ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if ( opts.warmup ) { // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if ( opts.check ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_clange("1", &N , &m1, h_A, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %4d %7.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf("\n"); status += ! (result < tol); } TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgesv */ int main(int argc, char **argv) { real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_LU, *h_B, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( device[ opts.device ], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[ opts.device ], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; ldb = lda; gflops = ( FLOPS_CGETRF( N, N ) + FLOPS_CGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_LU, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_clacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_clacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info, queue ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_clange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/***************************************************************************//** Purpose ------- CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. This version recomputes the T matrices on the CPU and sends them to the GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_ungqr *******************************************************************************/ extern "C" magma_int_t magma_cungqr2( magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t nb = magma_get_cgeqrf_nb( m, n ); magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; magmaFloatComplex *dA, *dV, *dW, *dT, *T; magmaFloatComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = magma_roundup( m, 32 ); lddwork = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; dT = dA + ldda*n + ldda*nb + lddwork*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_cmalloc_cpu( &work, lwork ); T = work; if (work == NULL) { magma_free( dA ); magma_free_cpu( work ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmaFloatComplex *V = work + (n+nb)*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_cungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_clacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_claset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_csetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda, queue ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue ); } } if (kk > 0) { // Use blocked code // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, queue ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_csetmatrix_async( ib, ib, T, nb, dT, nb, queue ); // set panel to identity magmablas_claset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda, queue ); magmablas_claset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda, queue ); magma_queue_sync( queue ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork, queue ); } } // copy result back to CPU magma_cgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda, queue ); } magma_queue_destroy( queue ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_cungqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_opts opts; parse_opts( argc, argv, &opts ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; nb = magma_get_cgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } lwork_max = max( m*nb, n*nb ); TESTING_MALLOC( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC( A, magmaFloatComplex, lda*k ); TESTING_MALLOC( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); //magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_wtime(); magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); //magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/***************************************************************************//** Purpose ------- CHEEVD computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. - If N <= 1, LWORK >= 1. - If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB. - If JOBZ = MagmaVec and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_chetrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. - If N <= 1, LRWORK >= 1. - If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. - If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. - If N <= 1, LIWORK >= 1. - If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. - If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_heevdx *******************************************************************************/ extern "C" magma_int_t magma_cheevdx_m( magma_int_t ngpu, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; magma_int_t izero = 0; float d_one = 1.; float d__1; float eps; magma_int_t inde; float anrm; magma_int_t imax; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } work[0] = magma_cmake_lwork( lwmin ); rwork[0] = magma_smake_lwork( lrwmin ); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_C_REAL(A[0]); if (wantz) { A[0] = MAGMA_C_ONE; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%lld NB=%lld, calling lapack on CPU\n", (long long) n, (long long) nb ); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, A, &lda, rwork); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_clascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ inde = 0; indtau = 0; indwrk = indtau + n; indrwk = inde + n; indwk2 = indwrk + n * n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; llrwk = lrwork - indrwk; magma_timer_t time=0; timer_start( time ); magma_chetrd_mgpu(ngpu, 1, uplo, n, A, lda, w, &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); timer_stop( time ); timer_printf( "time chetrd = %6.2f\n", time ); /* For eigenvalues only, call SSTERF. For eigenvectors, first call CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call CUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &rwork[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); } else { timer_start( time ); magma_cstedx_m(ngpu, range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, info); timer_stop( time ); timer_printf( "time cstedc = %6.2f\n", time ); timer_start( time ); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); magma_cunmtr_m(ngpu, MagmaLeft, uplo, MagmaNoTrans, n, *m, A, lda, &work[indtau], &work[indwrk + n * (il-1)], n, &work[indwk2], llwrk2, &iinfo); lapackf77_clacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, A, &lda); timer_stop( time ); timer_printf( "time cunmtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = magma_cmake_lwork( lwmin ); rwork[0] = magma_smake_lwork( lrwmin ); iwork[0] = liwmin; return *info; } /* magma_cheevd_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, e3, e4, e5, *work; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; magmaFloatComplex *d_A, *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", opts.version ); return -1; } float tol, eps = lapackf77_slamch("E"); tol = 10* opts.tolerance * eps; printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9 + FLOPS_CUNGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn; } TESTING_MALLOC_PIN( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN(h_rwork, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_cgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_cgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_cprint(N, N, h_work, N); blasf77_ctrmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_clange("i", &M, &N, h_R, &M, work) / lapackf77_clange("i", &M, &N, h_A, &lda, work); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_cungqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e1 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e2 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e1 < tol ? "ok" : "failed")); status += ! (e1 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *h_B, *h_X; magma_int_t N, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, float, N ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); // copy A to R and B to X; save A and B for residual lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv( opts.uplo, N, opts.nrhs, h_R, lda, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; if ( N > 512 ) { fprintf( stderr, "cgetf2 does not support N > 512; skipping N=%d.\n", (int) N ); continue; } TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_cgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e\n", error ); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_cheevdx(char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHEEVDX computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_chetrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magma_int_t ione = 1; magma_int_t izero = 0; float d_one = 1.; float d__1; float eps; magma_int_t inde; float anrm; magma_int_t imax; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float* dwork; wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame( range_, "A" ); valeig = lapackf77_lsame( range_, "V" ); indeig = lapackf77_lsame( range_, "I" ); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = MAGMA_C_MAKE( lwmin * (1. + lapackf77_slamch("Epsilon")), 0.); rwork[0] = lrwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_C_REAL(a[0]); if (wantz) { a[0] = MAGMA_C_ONE; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, a, &lda, rwork); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_clascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ // chetrd rwork: e (n) // cstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2) ==> 1 + 5n + 2n^2 inde = 0; indrwk = inde + n; llrwk = lrwork - indrwk; // chetrd work: tau (n) + llwork (n*nb) ==> n + n*nb // cstedx work: tau (n) + z (n^2) // cunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb) ==> 2n + n^2, or n + n*nb + n^2 indtau = 0; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; // #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_chetrd(uplo_[0], n, a, lda, w, &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); #ifdef ENABLE_TIMER end = get_current_time(); printf("time chetrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call CUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &rwork[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cstedx(range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time cstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); magma_cunmtr(MagmaLeft, uplo, MagmaNoTrans, n, *m, a, lda, &work[indtau], &work[indwrk + n * (il-1) ], n, &work[indwk2], llwrk2, &iinfo); lapackf77_clacpy("A", &n, m, &work[indwrk + n * (il-1)] , &n, a, &lda); #ifdef ENABLE_TIMER end = get_current_time(); printf("time cunmtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_C_MAKE( lwmin * (1. + lapackf77_slamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_cheevdx */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgels */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float gpu_error, cpu_error, error, Anorm, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; size = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_cgeqrf_nb(M); gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = -1; lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, NULL, &lda, NULL, NULL, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = M*nrhs; lapackf77_clarnv( &ione, ISEED, &size, h_B ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_clarnv( &ione, ISEED, &size, h_X ); //blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_cgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgels3_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_cgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_cgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_clange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_R, *h_work, aux_work[1]; float *rwork, *w1, *w2, result[3], eps, aux_rwork[1]; magma_int_t *iwork, aux_iwork[1]; magma_int_t N, n2, info, lwork, lrwork, liwork, lda; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; eps = lapackf77_slamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; // query for workspace sizes magma_cheevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, N*lda ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( rwork, float, lrwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, N*lda ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hermitian( N, h_A, N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* warm up run */ if ( opts.warmup ) { magma_cheevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cheevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; // tau=NULL is unused since itype=1 lapackf77_chet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, NULL, h_work, rwork, &result[0] ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_cheevd( MagmaNoVec, opts.uplo, N, h_R, lda, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[2] = temp2 / (((float)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cheevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e %s\n", result[0]*eps, (result[0]*eps < tol ? "ok" : "failed") ); printf("(2) | I - U'U | / N = %8.2e %s\n", result[1]*eps, (result[1]*eps < tol ? "ok" : "failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2] , (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0]*eps < tol && result[1]*eps < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( rwork ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_CUDA_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; /* Matrix size */ magma_int_t m, n, k; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t nsize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t ksize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t size; magma_int_t info; magma_int_t iseed[4] = {0,0,0,1}; printf( "Usage: %s -N m,n,k -c\n" " -N can be repeated %d times. m > 0, n > 0, k > 0 is required.\n" " If only m,n is given, then n=k. If only m is given, then m=n=k.\n" " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n", argv[0], MAXTESTS ); int checkres = (getenv("MAGMA_TESTINGS_CHECK") != NULL); int ntest = 0; magma_int_t nmax = 0; magma_int_t mmax = 0; magma_int_t kmax = 0; for( int i = 1; i < argc; i++ ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); info = sscanf( argv[++i], "%d,%d,%d", &m, &n, &k ); if ( info == 3 && m > 0 && n > 0 && k > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = k; } else if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = n; // implicitly } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly ksize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0, k > 0.\n", argv[i] ); exit(1); } mmax = max( mmax, msize[ntest] ); nmax = max( nmax, nsize[ntest] ); kmax = max( kmax, ksize[ntest] ); ntest++; } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; nmax = nsize[ntest-1]; mmax = msize[ntest-1]; kmax = ksize[ntest-1]; } m = mmax; n = nmax; k = kmax; assert( n > 0 && m > 0 && k > 0 ); magma_int_t nb = magma_get_cgeqrf_nb( m ); magma_int_t ldc = m; magma_int_t lda = max(m,n); ldc = ((ldc+31)/32)*32; lda = ((lda+31)/32)*32; // Allocate memory for matrices cuFloatComplex *C, *R, *A, *W, *tau; magma_int_t lwork = max( m*nb, n*nb ); magma_int_t lwork_max = lwork; TESTING_MALLOC( C, cuFloatComplex, ldc*n ); TESTING_MALLOC( R, cuFloatComplex, ldc*n ); TESTING_MALLOC( A, cuFloatComplex, lda*k ); TESTING_MALLOC( W, cuFloatComplex, lwork_max ); TESTING_MALLOC( tau, cuFloatComplex, k ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = msize[i]; n = nsize[i]; k = ksize[i]; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, iseed, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); //magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, iseed, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info ); if ( info != 0 ) printf("magma_cgeqrf returned error %d\n", info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d.\n", (int) info); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr returned error %d (lwork query).\n", (int) info); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", lwork, lwork_max ); gpu_time = magma_wtime(); magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d.\n", (int) info); //magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); }} // end iside, itran printf( "\n" ); } // end i // Memory clean up TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); // Shutdown TESTING_CUDA_FINALIZE(); return 0; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; float error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaFloatComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_chesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_C_REAL(temp); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_chesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_chesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_chesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_chesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
magma_int_t magma_ctrevc3_mt( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in Fortran magma_int_t n, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *VL, magma_int_t ldvl, magmaFloatComplex *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *info ) { #define T(i,j) ( T + (i) + (j)*ldt ) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define work(i,j) (work + (i) + (j)*n) // .. Parameters .. const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magma_int_t nbmin = 16, nbmax = 128; const magma_int_t ione = 1; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, rightv, somev; magma_int_t i, ii, is, j, k, ki, iv, n2, nb, nb2, version; float ovfl, remax, smin, smlnum, ulp, unfl; // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); // Set mout to the number of columns required to store the selected // eigenvectors. if ( somev ) { *mout = 0; for( j=0; j < n; ++j ) { if ( select[j] ) { *mout += 1; } } } else { *mout = n; } *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( mm < *mout ) *info = -11; else if ( lwork < max( 1, 2*n ) ) *info = -14; if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x. // (Compared to dtrevc3, rwork stores 1-norms.) // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_claset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Set the constants to control overflow. unfl = lapackf77_slamch( "Safe minimum" ); ovfl = 1. / unfl; lapackf77_slabad( &unfl, &ovfl ); ulp = lapackf77_slamch( "Precision" ); smlnum = unfl*( n / ulp ); // Store the diagonal elements of T in working array work. for( i=0; i < n; ++i ) { *work(i,0) = *T(i,i); } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. rwork[0] = 0.; for( j=1; j < n; ++j ) { rwork[j] = cblas_scasum( j, T(0,j), ione ); } // launch threads -- each single-threaded MKL magma_int_t nthread = magma_get_parallel_numthreads(); magma_int_t lapack_nthread = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); magma_queue queue; queue.launch( nthread ); //printf( "nthread %d, %d\n", nthread, lapack_nthread ); // NB = N/thread, rounded up to multiple of 16, // but avoid multiples of page size, e.g., 512*8 bytes = 4096. magma_int_t NB = magma_int_t( ceil( ceil( ((float)n) / nthread ) / 16. ) * 16. ); if ( NB % 512 == 0 ) { NB += 32; } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=nb, goes down to 1. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*( MAGMA_C_ABS1( *T(ki,ki) ) ), smlnum ); // -------------------------------------------------------- // Complex right eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k=0; k < ki; ++k ) { *work(k,iv) = -(*T(k,ki)); } // Solve upper triangular system: // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work. if ( ki > 0 ) { queue.push_task( new magma_clatrsd_task( MagmaUpper, MagmaNoTrans, MagmaNonUnit, MagmaTrue, ki, T, ldt, *T(ki,ki), work(0,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize queue.sync(); n2 = ki+1; blasf77_ccopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_icamax( &n2, VR(0,is), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VR(ii,is) ); blasf77_csscal( &n2, &remax, VR(0,is), &ione ); for( k=ki+1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { blasf77_cgemv( "n", &n, &ki, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_icamax( &n, VR(0,ki), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VR(ii,ki) ); blasf77_csscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki+1; k < n; ++k ) { *work(k,iv) = c_zero; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == 1) || (ki == 0) ) { queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki+nb-iv+1; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += NB ) { magma_int_t ib = min( NB, n-i ); queue.push_task( new cgemm_task( MagmaNoTrans, MagmaNoTrans, ib, nb2, n2, c_one, VR(i,0), ldvr, work(0,iv ), n, c_zero, work(i,nb+iv), n )); } queue.sync(); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k = iv; k <= nb; ++k ) { ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) ); blasf77_csscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_clacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // blocked back-transform is -= 1; } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; is = 0; for( ki=0; ki < n; ++ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*MAGMA_C_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex left eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k = ki + 1; k < n; ++k ) { *work(k,iv) = -MAGMA_C_CNJG( *T(ki,k) ); } // Solve conjugate-transposed triangular system: // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work. // TODO what happens with T(k,k) - lambda is small? Used to have < smin test. if ( ki < n-1 ) { n2 = n-ki-1; queue.push_task( new magma_clatrsd_task( MagmaUpper, MagmaConjTrans, MagmaNonUnit, MagmaTrue, n2, T(ki+1,ki+1), ldt, *T(ki,ki), work(ki+1,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize queue.sync(); n2 = n-ki; blasf77_ccopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_icamax( &n2, VL(ki,is), &ione ) + ki - 1; remax = 1. / MAGMA_C_ABS1( *VL(ii,is) ); blasf77_csscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); if ( ki < n-1 ) { n2 = n-ki-1; blasf77_cgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_icamax( &n, VL(0,ki), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VL(ii,ki) ); blasf77_csscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == nb) || (ki == n-1) ) { queue.sync(); n2 = n-(ki+1)+iv; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += NB ) { magma_int_t ib = min( NB, n-i ); queue.push_task( new cgemm_task( MagmaNoTrans, MagmaNoTrans, ib, iv, n2, c_one, VL(i,ki-iv+1), ldvl, work(ki-iv+1,1), n, c_zero, work(i,nb+1), n )); } queue.sync(); // normalize vectors for( k=1; k <= iv; ++k ) { ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) ); blasf77_csscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_clacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform is += 1; } } // close down threads queue.quit(); magma_set_lapack_numthreads( lapack_nthread ); return *info; } // End of CTREVC
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( "NOTE: version %d requires -c2 check due to the special structure of the\n" "MAGMA cgeqrf results; using -c2.\n\n", (int) opts.version ); opts.check = 2; } printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_slamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_cgeqrf2_gpu( M, N, d_A, 0, ldda, tau, opts.queues2, &info ); } else { nb = magma_get_cgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, magmaFloatComplex, size ); if ( opts.version == 1 ) { magma_cgeqrf_gpu( M, N, d_A, 0, ldda, tau, dT, 0, opts.queue, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { magma_cgeqrf3_gpu( M, N, d_A, 0, ldda, tau, dT, opts.queue, &info ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ magmaFloatComplex *tau2; TESTING_MALLOC_CPU( tau2, magmaFloatComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- only version 1, cqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; magmaFloatComplex *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); TESTING_MALLOC_CPU( h_W1, magmaFloatComplex, n2 ); // Q TESTING_MALLOC_CPU( h_W2, magmaFloatComplex, n2 ); // R TESTING_MALLOC_CPU( h_W3, magmaFloatComplex, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); lapackf77_cqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 && opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK -- only version 2 =================================================================== */ magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); error = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check == 2 && M >= N ) { /* ===================================================================== Check the result by solving linear system -- only versions 1 & 3, M >= N =================================================================== */ magma_int_t lwork; magmaFloatComplex *x, *b, *hwork; magmaFloatComplex_ptr d_B; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( b, magmaFloatComplex, M ); lapackf77_clarnv( &ione, ISEED, &N, x ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, magmaFloatComplex, M ); magma_csetvector( M, b, 1, d_B, 0, 1, opts.queue ); if ( opts.version == 1 ) { // allocate hwork magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } magma_cgetvector( N, d_B, 0, 1, x, 1, opts.queue ); // compute r = Ax - b, saved in b lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_clange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_clange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_clange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cheevdx_2stage_m(magma_int_t nrgpu, char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 1). If JOBZ = 'V' and N > 1, LWORK >= LQ2 + 2*N + N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by MAGMA_BULGE_GET_LQ2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t ione = 1; magma_int_t izero = 0; float d_one = 1.; float d__1; float eps; float anrm; magma_int_t imax; float rmin, rmax; float sigma; //magma_int_t iinfo; magma_int_t lwmin, lrwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; float safmin; float bignum; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame( range_, "A" ); valeig = lapackf77_lsame( range_, "V" ); indeig = lapackf77_lsame( range_, "I" ); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_cbulge_nb(n, threads); magma_int_t Vblksiz = magma_cbulge_get_Vblksiz(n, nb, threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_cbulge_get_lq2(n, threads); if (wantz) { lwmin = lq2 + 2 * n + n * n; lrwmin = 1 + 5 * n + 2 * n * n; liwmin = 5 * n + 3; } else { lwmin = lq2 + n * (nb + 1); lrwmin = n; liwmin = 1; } work[0] = MAGMA_C_MAKE( lwmin * (1. + lapackf77_slamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_C_REAL(a[0]); if (wantz) { a[0] = MAGMA_C_ONE; } return *info; } #ifdef ENABLE_DEBUG printf("using %d threads\n", threads); #endif /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if( ( ntiles < 2 ) || ( n <= 128 ) ){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, a, &lda, rwork); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_clascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } magma_int_t indT2 = 0; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; magma_int_t indwk2 = indwrk + n * n; magma_int_t llwork = lwork - indwrk; magma_int_t llwrk2 = lwork - indwk2; magma_int_t inde = 0; magma_int_t indrwk = inde + n; magma_int_t llrwk = lrwork - indrwk; #ifdef ENABLE_TIMER magma_timestr_t start, st1, st2, end; start = get_current_time(); #endif #ifdef HE2HB_SINGLEGPU magmaFloatComplex *dT1; if (MAGMA_SUCCESS != magma_cmalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #ifdef ENABLE_TIMER tband1 = get_current_time(); #endif magma_chetrd_he2hb(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, dT1, threads, info); #ifdef ENABLE_TIMER tband2 = get_current_time(); printf(" 1 GPU seq code time chetrd_he2hb only = %7.4f\n" , GetTimerValue(tband1,tband2)/1000.); #endif magma_free(dT1); #else magma_int_t nstream = max(3,nrgpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaFloatComplex *da[MagmaMaxGPUs],*dT1[MagmaMaxGPUs]; magma_int_t ldda = ((n+31)/32)*32; magma_int_t ver = 0; magma_int_t distblk = max(256, 4*nb); #ifdef ENABLE_DEBUG printf("voici ngpu %d distblk %d NB %d nstream %d version %d \n ",nrgpu,distblk,nb,nstream,ver); #endif #ifdef ENABLE_TIMER magma_timestr_t tband1, tband2, t1, t2, ta1, ta2; t1 = get_current_time(); #endif for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_int_t mlocal = ((n / distblk) / nrgpu + 1) * distblk; magma_setdevice( dev ); magma_cmalloc(&da[dev], ldda*mlocal ); magma_cmalloc(&dT1[dev], (n*nb) ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } #ifdef ENABLE_TIMER t2 = get_current_time(); #endif magma_csetmatrix_1D_col_bcyclic( n, n, a, lda, da, ldda, nrgpu, distblk); magma_setdevice(0); #ifdef ENABLE_TIMER tband1 = get_current_time(); #endif if(ver==30){ magma_chetrd_he2hb_mgpu_spec(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, threads, info); }else{ magma_chetrd_he2hb_mgpu(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, threads, info); } #ifdef ENABLE_TIMER tband2 = get_current_time(); printf(" time alloc %7.4f, ditribution %7.4f, chetrd_he2hb only = %7.4f\n" , GetTimerValue(t1,t2)/1000., GetTimerValue(t2,tband1)/1000., GetTimerValue(tband1,tband2)/1000.); #endif for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_setdevice( dev ); magma_free( da[dev] ); magma_free( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } #endif #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time chetrd_he2hb_mgpu = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); magmaFloatComplex* A2 = &work[indwrk]; memset(A2 , 0, n*lda2*sizeof(magmaFloatComplex)); for (magma_int_t j = 0; j < n-nb; j++) { cblas_ccopy(nb+1, &a[j*(lda+1)], 1, &A2[j*lda2], 1); memset(&a[j*(lda+1)], 0, (nb+1)*sizeof(magmaFloatComplex)); a[nb + j*(lda+1)] = c_one; } for (magma_int_t j = 0; j < nb; j++) { cblas_ccopy(nb-j, &a[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1); memset(&a[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(magmaFloatComplex)); } #ifdef ENABLE_TIMER st2 = get_current_time(); printf(" time chetrd_convert = %6.2f\n" , GetTimerValue(st1,st2)/1000.); #endif magma_chetrd_hb2st(threads, uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time chetrd_hb2st = %6.2f\n" , GetTimerValue(st2,end)/1000.); printf(" time chetrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call CUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { #ifdef ENABLE_TIMER start = get_current_time(); #endif lapackf77_ssterf(&n, w, &rwork[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time dstedc = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_cstedx_m(nrgpu, range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time cstedx_m = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); /* magmaFloatComplex *dZ; magma_int_t lddz = n; if (MAGMA_SUCCESS != magma_cmalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cbulge_back(threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); magma_cgetmatrix( n, *m, dZ, lddz, &work[indwrk], n); magma_free(dZ); */ magma_cbulge_back_m(nrgpu, threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time cbulge_back_m = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif magma_cunmqr_m(nrgpu, MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, a+nb, lda, &work[indtau1], &work[indwrk + n * (il-1) + nb], n, &work[indwk2], llwrk2, info); lapackf77_clacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, a, &lda); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time cunmqr_m + copy = %6.2f\n", GetTimerValue(st1,end)/1000.); printf(" time eigenvectors backtransf. = %6.2f\n" , GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_C_MAKE((float) lwmin, 0.); rwork[0] = (float) lrwmin; iwork[0] = liwmin; return *info; } /* magma_cheevdx_2stage_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time; magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif float *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lda, lwork, liwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); if (opts.itype == 1) { printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n"); } else if (opts.itype == 2) { printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n"); } else if (opts.itype == 3) { printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n"); } printf("%%======================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magma_cheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, max( lwork, N*N )); // check needs N*N #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, lda ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); // =================================================================== // Performs operation using MAGMA // =================================================================== gpu_time = magma_wtime(); if (opts.ngpu == 1) { magma_chegvdx_2stage( opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } else { magma_chegvdx_2stage_m( abs_ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegvdx_2stage returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A| |Z| N ) (itype = 1) | A B Z - Z D | / ( |A| |Z| N ) (itype = 2) | B A Z - Z D | / ( |A| |Z| N ) (itype = 3) (2) | D(with V, magma) - D(w/o V, lapack) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif if ( opts.jobz != MagmaNoVec ) { result[0] = 1.; result[0] /= safe_lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } } lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); lapackf77_chegvd( &opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &lda, h_S, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); if (info != 0) { printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float maxw=0, diff=0; for (int j=0; j < m1; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m1*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %9.4f ", (int) N, (int) m1, gpu_time); if ( opts.check ) { bool okay = (result[1] < tolulp); if ( opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol); printf(" %8.2e", result[0] ); } else { printf(" --- "); } printf(" %8.2e %s\n", result[1], (okay ? "ok" : "failed")); status += ! okay; } else { printf(" ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t gpu_time, gpu_perf, gflops; magmaFloatComplex *h_A, *h_R, *h_work, *dT1; magmaFloatComplex *tau; float *D, *E; /* Matrix size */ magma_int_t N, n2, lda, lwork, ldt, lwork0; magma_int_t info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) magma_int_t WANTZ=0; magma_int_t THREADS=1; #endif #endif magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t ngpu = 1; magma_opts opts; parse_opts( argc, argv, &opts ); NB = opts.nb; if (NB < 1) NB = 64; //64; //magma_get_chetrd_he2hb_nb(N); // what is NE ? if (NE < 1) NE = 64; //N; //magma_get_chetrd_he2hb_nb(N); // N not yet initialized printf(" N GPU GFlop/s \n"); printf("=====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; n2 = N*lda; gflops = FLOPS_CHETRD( N ) / 1e9; /* We suppose the magma NB is bigger than lapack NB */ lwork0 = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, N-1 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork0 ); TESTING_MALLOC_PIN( D, float, N ); TESTING_MALLOC_PIN( E, float, N ); //TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (2*min(N,N)+(N+31)/32*32)*NB ); TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (N*NB) ); // if (WANTZ) gflops = 2.0*gflops; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_device_t cdev; magma_getdevice( &cdev ); gpu_time = magma_wtime(); /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ magma_range_t range = MagmaRangeAll; magma_int_t fraction_ev = 100; magma_int_t il, iu, m1; float vl=0., vu=0.; if (fraction_ev == 0) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int)(fraction_ev*N); if (iu < 1) iu = 1; } magmaFloatComplex *hh_work; magma_int_t *iwork; magma_int_t nb, /*lwork,*/ liwork; magma_int_t threads = magma_get_parallel_numthreads(); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; TESTING_MALLOC_PIN( rwork, float, lrwork ); #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; nb = magma_get_chetrd_nb(N); TESTING_MALLOC_PIN( hh_work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); if (ngpu == 1) { printf("calling cheevdx_2stage 1 GPU\n"); magma_cheevdx_2stage( opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } else { printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu); magma_cheevdx_2stage_m(ngpu, opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } magma_setdevice( cdev ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.check ) { FILE *fp ; printf("Writing input matrix in matlab_i_mat.txt ...\n"); fp = fopen ("matlab_i_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_A[k+j*lda].x, h_A[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]); #endif } } fclose( fp ) ; printf("Writing output matrix in matlab_o_mat.txt ...\n"); fp = fopen ("matlab_o_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_R[k+j*lda].x, h_R[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]); #endif } } fclose( fp ) ; } */ /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if (WANTZ == 0) JOBZ = 'N'; else JOBZ = 'V'; float nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaFloatComplex *work2, *AINIT; float *rwork2, *D2; // TODO free this memory ! magma_cmalloc_cpu( &work2, lwork2 ); magma_smalloc_cpu( &rwork2, N ); magma_smalloc_cpu( &D2, N ); magma_cmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaFloatComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ cpu_time = magma_wtime(); i= min(12, THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif lapackf77_cheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #if defined(PRECISION_z) || defined (PRECISION_c) rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////ssterf_( &N, D, E, &info); //// cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, i); /* for (i=0; i < 10; i++) printf(" voici lpk D[%d] %8.2e\n", i, D2[i]); */ //magmaFloatComplex mydz=0.0, mydo=1.0; //magmaFloatComplex *Z; // magma_cmalloc_cpu( &Z, N*lda ); // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaFloatComplex *WORKAJETER; float *RWORKAJETER, *RESU; // TODO free this memory ! magma_cmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_smalloc_cpu( &RWORKAJETER, N ); magma_smalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(float)); MATYPE=3; float NOTHING=0.0; cpu_time = magma_wtime(); // check results ccheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); printf("============================================================================\n\n\n"); /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); TESTING_FREE_DEV( dT1 ); /* TODO - not all memory has been freed inside loop */ fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }
extern "C" magma_int_t magma_cungqr(magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. DT (input) COMPLEX array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_cgeqrf_gpu. NB (input) INTEGER This is the block size used in CGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; magmaFloatComplex *dA, *dV, *dW; magmaFloatComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_cmalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmaFloatComplex *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but cungqr is slow for // k smaller than the cungqr's blocking size (new version can be up to 60x faster) lapackf77_cungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_clacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_claset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_csetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_claset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_claset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_cgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_cungqr */
/***************************************************************************//** Purpose ------- CGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w COMPLEX array, dimension (N) W contains the computed eigenvalues. @param[out] VL COMPLEX array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR COMPLEX array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (1 + nb + nb*ngpu)*N. For optimal performance, LWORK >= (1 + 2*nb + nb*ngpu)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. @ingroup magma_geev *******************************************************************************/ extern "C" magma_int_t magma_cgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, #ifdef COMPLEX magmaFloatComplex *w, #else float *wr, float *wi, #endif magmaFloatComplex *VL, magma_int_t ldvl, magmaFloatComplex *VR, magma_int_t ldvr, magmaFloatComplex *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; float d__1, d__2; magmaFloatComplex tmp; float scl; float dum[1], eps; float anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, optwrk, irwork, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; magma_int_t ngpu = magma_num_gpus(); irwork = 0; *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ nb = magma_get_cgehrd_nb( n ); if (*info == 0) { minwrk = (1 + nb + nb*ngpu)*n; optwrk = (1 + 2*nb + nb*ngpu)*n; work[0] = magma_cmake_lwork( optwrk ); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) magmaFloatComplex *dT; if (MAGMA_SUCCESS != magma_cmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version5) magmaFloatComplex *T; if (MAGMA_SUCCESS != magma_cmalloc_cpu( &T, nb*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_slamch( "P" ); smlnum = lapackf77_slamch( "S" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum = magma_ssqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_clange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_clascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (CWorkspace: none) * (RWorkspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_cgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr ); /* Reduce to upper Hessenberg form * (CWorkspace: need 2*N, prefer N + N*NB + NB*NGPU) * (RWorkspace: N) * - added NB*NGPU needed for multi-GPU magma_cgehrd_m * - including N reserved for gebal/gebak, unused by cgehrd */ itau = 0; iwrk = itau + n; liwrk = lwork - iwrk; #if defined(Version1) // Version 1 - LAPACK lapackf77_cgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_cgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_cgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version5) // Version 4 - Multi-GPU, T on host magma_cgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); #endif if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_clacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate unitary matrix in VL * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VL, ldvl, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VL * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w, VL, &ldvl, &work[iwrk], &liwrk, info ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_clacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_clacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate unitary matrix in VR * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VR * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } else { /* Compute eigenvalues only * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "E", "N", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } /* If INFO > 0 from CHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (CWorkspace: need 2*N) * (RWorkspace: need 2*N) * - including N reserved for gebal/gebak, unused by ctrevc */ irwork = ibal + n; #if TREVC_VERSION == 1 lapackf77_ctrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr ); #elif TREVC_VERSION == 2 liwrk = lwork - iwrk; lapackf77_ctrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 3 magma_ctrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 4 magma_ctrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 5 magma_ctrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #else #error Unknown TREVC_VERSION #endif } if (wantvl) { /* Undo balancing of left eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / magma_cblas_scnrm2( n, VL(0,i), 1 ); blasf77_csscal( &n, &scl, VL(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VL(k,i) ); d__2 = MAGMA_C_IMAG( *VL(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CONJ( *VL(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VL(0,i), &ione ); *VL(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VL(k,i) ), 0 ); } } if (wantvr) { /* Undo balancing of right eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / magma_cblas_scnrm2( n, VR(0,i), 1 ); blasf77_csscal( &n, &scl, VR(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VR(k,i) ); d__2 = MAGMA_C_IMAG( *VR(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CONJ( *VR(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VR(0,i), &ione ); *VR(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VR(k,i) ), 0 ); } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in WR[i+1:n] and WI[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. nval = ilo - 1; lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w, &n, &ierr ); } } #if defined(Version3) magma_free( dT ); #endif #if defined(Version5) magma_free_cpu( T ); #endif work[0] = magma_cmake_lwork( minwrk ); // TODO use optwrk as in dgeev return *info; } /* magma_cgeev */