inline static void magma_clarfxsym_v2( magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ dtmp = magma_cblas_cdotc(n, work, ione, V, ione); /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
extern "C" void magma_clarfxsym( magma_int_t N, magmaFloatComplex *A, magma_int_t LDA, magmaFloatComplex *V, magmaFloatComplex *TAU) { magma_int_t IONE=1; magmaFloatComplex dtmp; magmaFloatComplex Z_ZERO = MAGMA_C_ZERO; //magmaFloatComplex Z_ONE = MAGMA_C_ONE; magmaFloatComplex Z_MONE = MAGMA_C_NEG_ONE; magmaFloatComplex Z_HALF = MAGMA_C_HALF; //magmaFloatComplex WORK[N]; magmaFloatComplex *WORK; magma_cmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_clarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_chemv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_cdotc(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_caxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
inline static void magma_clarfxsym_v2(magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = c_zero; for (magma_int_t j = 0; j < n; j++) dtmp = dtmp + MAGMA_C_CNJG(work[j]) * V[j]; //cblas_cdotc_sub(n, work, ione, V, ione, &dtmp); #else dtmp = cblas_cdotc(n, work, ione, V, ione); #endif /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
/** Purpose ------- CLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, CLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, CLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by CHETRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA TODO: dimension (ldda, n)? @param ldda TODO: ldda >= n? @param dW TODO: dimension (lddw, ??) @param lddw TODO: lddw >= n ?? @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_aux ********************************************************************/ extern "C" magma_int_t magma_clatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, float *e, magmaFloatComplex *tau, magmaFloatComplex *W, magma_int_t ldw, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dW, magma_int_t lddw, magma_queue_t queue ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) /* Constants */ const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magma_int_t ione = 1; /* Local variables */ magmaFloatComplex alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #ifdef COMPLEX lapackf77_clacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_cgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_clacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_cgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_clarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_C_REAL( alpha ); *A(i-1,i) = MAGMA_C_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_csetvector( i, A(0, i), 1, dA(0, i), 1, queue ); magma_chemv( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, queue ); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, queue ); if (i < n-1) { blasf77_cgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( queue ); if (i < n-1) { blasf77_cgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_cgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_cgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_cscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_cdotc( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_caxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #ifdef COMPLEX lapackf77_clacgv( &i, W(i, 0), &ldw ); #endif blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i, W(i, 0), &ldw ); lapackf77_clacgv( &i, A(i, 0), &lda ); #endif blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_clacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_clarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = MAGMA_C_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1, queue ); magma_chemv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, queue ); // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, queue ); blasf77_cgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, work, &ione ); blasf77_cgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here is where we need it magma_queue_sync( queue ); if (i != 0) blasf77_caxpy( &i_n, &c_one, work, &ione, W(i+1, i), &ione ); blasf77_cgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_cscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_cdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_caxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } return info; } /* magma_clatrd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaFloatComplex *d_A, *d_B, *d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_csetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCher2k( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &N, &N, h_C, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_clange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- CLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, CLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, CLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by CHETRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_aux ********************************************************************/ extern "C" magma_int_t magma_clatrd2(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, float *e, magmaFloatComplex *tau, magmaFloatComplex *W, magma_int_t ldw, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dW, magma_int_t lddw, magmaFloatComplex *dwork, magma_int_t ldwork) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex value = MAGMA_C_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaFloatComplex alpha; magmaFloatComplex *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_cmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside clatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, W(i, iw+1), &ldw); lapackf77_clacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_clarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_C_REAL( alpha ); *A(i-1,i) = MAGMA_C_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_csetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_chemv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_chemv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_cscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_cdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_caxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, W(i, 0), &ldw); lapackf77_clacgv(&i, A(i, 0), &lda); #endif blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_clarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = MAGMA_C_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_chemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_chemv_work(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_cgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_caxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_cscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_cdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_caxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* magma_clatrd */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; magmaFloatComplex *d_A; float *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC( h_A, magmaFloatComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaFloatComplex, n2 ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*N ); TESTING_DEVALLOC( dtau, magmaFloatComplex, min_mn ); TESTING_DEVALLOC(dwork, float, min_mn ); TESTING_MALLOC( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_cgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_cgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( dtau ); TESTING_DEVFREE( dwork ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_b, *h_x, *h_xcublas; magmaFloatComplex *d_A, *d_x; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_b, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaFloatComplex, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_x, magmaFloatComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &N, h_b ); blasf77_ccopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_clange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_clange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ctrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_caxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_clange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_b ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_CHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaFloatComplex, sizeA ); TESTING_MALLOC( X, magmaFloatComplex, sizeX ); TESTING_MALLOC( Y, magmaFloatComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaFloatComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaFloatComplex, sizeY ); TESTING_DEVALLOC( dA, magmaFloatComplex, sizeA ); TESTING_DEVALLOC( dX, magmaFloatComplex, sizeX ); TESTING_DEVALLOC( dY, magmaFloatComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &sizeA, A ); magma_cmake_hermitian( N, A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeX, X ); lapackf77_clarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( N, N, A, lda, dA, lda ); magma_csetvector( N, X, incx, dX, incx ); magma_csetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasChemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_csetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_chemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_chemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_chemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_caxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_clange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_caxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_clange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float error, work[1]; magma_int_t status = 0; // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans [] = { MagmaConjTrans, MagmaNoTrans }; magma_direct_t direct[] = { MagmaForward, MagmaBackward }; magma_storev_t storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; if ( M < K || N < K || K <= 0 ) { printf( "%5d %5d %5d skipping because clarfb requires M >= K, N >= K, K >= 0\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices magmaFloatComplex *C, *R, *V, *T, *W; TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( V, magmaFloatComplex, ldv*K ); TESTING_MALLOC_CPU( T, magmaFloatComplex, ldt*K ); TESTING_MALLOC_CPU( W, magmaFloatComplex, ldw*K ); magmaFloatComplex_ptr dC, dV, dT, dW; TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( dV, magmaFloatComplex, ldv*K ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, ldt*K ); TESTING_MALLOC_DEV( dW, magmaFloatComplex, ldw*K ); // C is M x N. size = ldc*N; lapackf77_clarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_cprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. // if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_clarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_cprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_clarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_claset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_cprint( K, K, T, ldt ); magma_csetmatrix( M, N, C, ldc, dC, ldc ); magma_csetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_csetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_clarfb( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), lapack_direct_const( direct[idir] ), lapack_storev_const( storev[istor] ), &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_cprint( M, N, C, ldc ); magma_clarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_cgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_cprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_clange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e %s\n", (int) M, (int) N, (int) K, lapacke_storev_const(storev[istor]), lapacke_side_const(side[iside]), lapacke_direct_const(direct[idir]), lapacke_trans_const(trans[itran]), error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( V ); TESTING_FREE_CPU( T ); TESTING_FREE_CPU( W ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dV ); TESTING_FREE_DEV( dT ); TESTING_FREE_DEV( dW ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}}} printf( "\n" ); } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; float magma_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaFloatComplex alpha = MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y, *Ymagma; magmaFloatComplex_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_CSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaFloatComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); magmablas_claset( MagmaFull, ldwork, 1, MAGMA_C_NAN, MAGMA_C_NAN, dwork, ldwork ); magmablas_claset( MagmaFull, ldda, N, MAGMA_C_NAN, MAGMA_C_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &sizeA, A ); magma_cmake_hermitian( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_claset( "Lower", &N1, &N1, &MAGMA_C_NAN, &MAGMA_C_NAN, &A[1], &lda ); } else { lapackf77_claset( "Upper", &N1, &N1, &MAGMA_C_NAN, &MAGMA_C_NAN, &A[lda], &lda ); } lapackf77_clarnv( &ione, ISEED, &sizeX, X ); lapackf77_clarnv( &ione, ISEED, &sizeY, Y ); /* Note: CUBLAS does not implement csymv */ /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( N, N, A, lda, dA, ldda ); magma_csetvector( N, X, incx, dX, incx ); magma_csetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); if ( opts.version == 1 ) { magmablas_csymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_csymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); lapackf77_csymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_caxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_clange( "M", &N, &ione, Ymagma, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magmaFloatComplex *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); #ifdef HAVE_CUBLAS // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK printf("%% If running lapack (option --lapack), MAGMA and %s error are both computed\n" "%% relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n", g_platform_str, g_platform_str ); printf("%% transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf("%% M N K MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else // for others, we need LAPACK for check opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("%% transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf("%% M N K %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*An ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cdev, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*An ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); magma_csetmatrix( Am, An, h_A, lda, d_A, ldda, opts.queue ); magma_csetmatrix( Bm, Bn, h_B, ldb, d_B, lddb, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_csetmatrix( M, N, h_C, ldc, d_C, lddc, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_cgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc, opts.queue ); #endif /* ===================================================================== Performs operation using CUBLAS / clBLAS / Xeon Phi MKL =================================================================== */ magma_csetmatrix( M, N, h_C, ldc, d_C, lddc, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS // opts.handle also uses opts.queue cublasCgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); #else magma_cgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_cgetmatrix( M, N, d_C, lddc, h_Cdev, ldc, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & dev, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "F", &M, &N, h_C, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione ); dev_error = lapackf77_clange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm; #ifdef HAVE_CUBLAS blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_clange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif } else { #ifdef HAVE_CUBLAS // compute relative error for magma, relative to dev (currently only with CUDA) Cnorm = lapackf77_clange( "F", &M, &N, h_Cdev, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione ); magma_error = lapackf77_clange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time ); #endif } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Cdev ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlag2c and clag2z */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaFloatComplex s_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magmaFloatComplex *SA, *SR; magmaDoubleComplex *A, *R; magmaFloatComplex *dSA; magmaDoubleComplex_ptr dA; magma_opts opts; parse_opts( argc, argv, &opts ); printf("func M N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("=====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; lda = m; ldda = ((m+31)/32)*32; // m*n double-complex loads and m*n single-complex stores (and vice-versa for clag2z) gbytes = (real_Double_t) m*n * (sizeof(magmaDoubleComplex) + sizeof(magmaFloatComplex)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, magmaFloatComplex, size ); TESTING_MALLOC_CPU( A, magmaDoubleComplex, size ); TESTING_MALLOC_CPU( SR, magmaFloatComplex, size ); TESTING_MALLOC_CPU( R, magmaDoubleComplex, size ); TESTING_MALLOC_DEV( dSA, magmaFloatComplex, size ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, size ); lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, SA ); magma_zsetmatrix( m, n, A, lda, dA, ldda ); magma_csetmatrix( m, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK zlag2c =================================================================== */ cpu_time = magma_wtime(); lapackf77_zlag2c( &m, &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_zlag2c returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA zlag2c =================================================================== */ gpu_time = magma_sync_wtime(0); magmablas_zlag2c( m, n, dA, ldda, dSA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_zlag2c returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( m, n, dSA, ldda, SR, lda ); /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_caxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_clange( "Fro", &m, &n, SR, &lda, swork ); printf( "zlag2c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, SA ); magma_zsetmatrix( m, n, A, lda, dA, ldda ); magma_csetmatrix( m, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK clag2z =================================================================== */ cpu_time = magma_wtime(); lapackf77_clag2z( &m, &n, SA, &lda, A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_clag2z returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA clag2z =================================================================== */ magma_csetmatrix( m, n, SA, lda, dSA, ldda ); gpu_time = magma_sync_wtime(0); magmablas_clag2z( m, n, dSA, ldda, dA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_clag2z returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( m, n, dA, ldda, R, lda ); /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_zaxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &m, &n, R, &lda, work ); printf( "clag2z %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! (error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; magmaFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; magmaFloatComplex *C_work; magmaFloatComplex *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, lwork; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } ////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC_CPU( A, magmaFloatComplex, matsize ); TESTING_MALLOC_CPU( X, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC_CPU( Y[i], magmaFloatComplex, vecsize ); } magma_setdevice(0); TESTING_MALLOC_DEV( dA, magmaFloatComplex, matsize ); TESTING_MALLOC_DEV( dYcublas, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, LDA*n_local[i] );// potentially bugged TESTING_MALLOC_DEV( dX[i], magmaFloatComplex, vecsize ); TESTING_MALLOC_DEV( dY[i], magmaFloatComplex, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &matsize, A ); magma_cmake_hermitian( N, A, LDA ); blocks = N / nb + (N % nb != 0); lwork = LDA * (blocks + 1); TESTING_MALLOC_CPU( C_work, magmaFloatComplex, lwork ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_MALLOC_DEV( dC_work[i], magmaFloatComplex, lwork ); //fillZero(dC_work[i], lwork); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// fp = fopen ("results_chemv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("CHEMV magmaFloatComplex precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_clarnv( &ione, ISEED, &vecsize, X ); lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_csetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_csetmatrix( m, m, A, LDA, dA, lda ); magma_csetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_csetvector( m, X, incx, dX[i], incx ); magma_csetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_cgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } else // nb = 64 { magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_cgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_ccopy( m, Y, incx, Ycublas, incx ); cblas_chemv( CblasColMajor, CblasLower, m, CBLAS_SADDR(alpha), A, LDA, X, incx, CBLAS_SADDR(beta), Ycublas, incx ); blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( C_work ); magma_setdevice(0); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE_CPU( Y[i] ); magma_setdevice(i); TESTING_FREE_DEV( d_lA[i] ); TESTING_FREE_DEV( dX[i] ); TESTING_FREE_DEV( dY[i] ); TESTING_FREE_DEV( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); return 0; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX, *dY; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; lda = ((M+31)/32)*32; gflops = FLOPS_CGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, sizeA ); TESTING_MALLOC_DEV( dX, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaFloatComplex, sizeY ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &sizeA, A ); lapackf77_clarnv( &ione, ISEED, &sizeX, X ); lapackf77_clarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, A, lda, dA, lda ); magma_csetvector( Xm, X, incx, dX, incx ); magma_csetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasCgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_cgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( Ym, dY, incx, Ymagma, incx ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemv( &opts.transA, &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_caxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_clange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_caxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_clange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeadd_batched Code is very similar to testing_clacpy_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B; magmaFloatComplex *d_A, *d_B; magmaFloatComplex **hAarray, **hBarray, **dAarray, **dBarray; magmaFloatComplex alpha = MAGMA_C_MAKE( 3.1415, 2.718 ); magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf(" M N ntile CPU GFlop/s (ms) GPU GFlop/s (ms) error \n"); printf("====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gflops = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); TESTING_MALLOC_CPU( hAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_CPU( hBarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dBarray, magmaFloatComplex*, ntile ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda ); // setup pointers for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(magmaFloatComplex*), hAarray, 1, dAarray, 1 ); magma_setvector( ntile, sizeof(magmaFloatComplex*), hBarray, 1, dBarray, 1 ); gpu_time = magma_sync_wtime( 0 ); magmablas_cgeadd_batched( mb, nb, alpha, dAarray, ldda, dBarray, ldda, ntile ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { blasf77_caxpy( &mb, &alpha, &h_A[offset + j*lda], &ione, &h_B[offset + j*lda], &ione ); } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_clange( "F", &M, &N, h_B, &lda, work ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_clange("f", &M, &N, h_B, &lda, work) / error; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegst */ int main( int argc, char** argv) { TESTING_INIT(); // Constants const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // Local variables real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_B, *h_R; magmaFloatComplex_ptr d_A, d_B; float Anorm, error, work[1]; magma_int_t N, n2, lda, ldda, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% itype N CPU time (sec) GPU time (sec) |R| \n"); printf("%%=======================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( lda, opts.align ); n2 = N*lda; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hermitian( N, h_A, lda ); magma_cmake_hpd( N, h_B, lda ); magma_cpotrf( opts.uplo, N, h_B, lda, &info ); if (info != 0) { printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetmatrix( N, N, h_B, lda, d_B, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_chegst_gpu( opts.itype, opts.uplo, N, d_A, ldda, d_B, ldda, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegst_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_chegst( &opts.itype, lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_chegst returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); Anorm = safe_lapackf77_clanhe("f", lapack_uplo_const(opts.uplo), &N, h_A, &lda, work ); error = safe_lapackf77_clanhe("f", lapack_uplo_const(opts.uplo), &N, h_R, &lda, work ) / Anorm; bool okay = (error < tol); status += ! okay; printf("%3d %5d %7.2f %7.2f %8.2e %s\n", (int) opts.itype, (int) N, cpu_time, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%3d %5d --- %7.2f\n", (int) opts.itype, (int) N, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; magmaFloatComplex *C, *R, *A, *W, *tau; magmaFloatComplex_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = 2. * opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + ((max(m,n) + 31)/32)*32 )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + ((max(m,n) + 31)/32)*32 )*nb; } TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*n ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, lda*k ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, dt_size ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (side[iside] == MagmaLeft ? m : n); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_csetmatrix( lda, k, A, lda, dA, lda ); magma_cgeqrf_gpu( lda, k, dA, lda, tau, dT, &info ); magma_cgetmatrix( lda, k, dA, lda, A, lda ); if (info != 0) printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); if (info != 0) printf("magma_cunmqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_sync_wtime( 0 ); // sync needed for L,N and R,T cases magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( "NOTE: version %d requires -c2 check due to the special structure of the\n" "MAGMA cgeqrf results; using -c2.\n\n", (int) opts.version ); opts.check = 2; } printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_slamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_cgeqrf2_gpu( M, N, d_A, 0, ldda, tau, opts.queues2, &info ); } else { nb = magma_get_cgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, magmaFloatComplex, size ); if ( opts.version == 1 ) { magma_cgeqrf_gpu( M, N, d_A, 0, ldda, tau, dT, 0, opts.queue, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { magma_cgeqrf3_gpu( M, N, d_A, 0, ldda, tau, dT, opts.queue, &info ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ magmaFloatComplex *tau2; TESTING_MALLOC_CPU( tau2, magmaFloatComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- only version 1, cqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; magmaFloatComplex *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); TESTING_MALLOC_CPU( h_W1, magmaFloatComplex, n2 ); // Q TESTING_MALLOC_CPU( h_W2, magmaFloatComplex, n2 ); // R TESTING_MALLOC_CPU( h_W3, magmaFloatComplex, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); lapackf77_cqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 && opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK -- only version 2 =================================================================== */ magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); error = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check == 2 && M >= N ) { /* ===================================================================== Check the result by solving linear system -- only versions 1 & 3, M >= N =================================================================== */ magma_int_t lwork; magmaFloatComplex *x, *b, *hwork; magmaFloatComplex_ptr d_B; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( b, magmaFloatComplex, M ); lapackf77_clarnv( &ione, ISEED, &N, x ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, magmaFloatComplex, M ); magma_csetvector( M, b, 1, d_B, 0, 1, opts.queue ); if ( opts.version == 1 ) { // allocate hwork magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } magma_cgetvector( N, d_B, 0, 1, x, 1, opts.queue ); // compute r = Ax - b, saved in b lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_clange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_clange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_clange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, e3, e4, e5, *work; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; magmaFloatComplex *d_A, *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", opts.version ); return -1; } float tol, eps = lapackf77_slamch("E"); tol = 10* opts.tolerance * eps; printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9 + FLOPS_CUNGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn; } TESTING_MALLOC_PIN( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN(h_rwork, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_cgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_cgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_cprint(N, N, h_work, N); blasf77_ctrmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_clange("i", &M, &N, h_R, &M, work) / lapackf77_clange("i", &M, &N, h_A, &lda, work); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_cungqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e1 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e2 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e1 < tol ? "ok" : "failed")); status += ! (e1 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cherk */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An; magma_int_t sizeA, sizeC; magma_int_t lda, ldc, ldda, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_C, *h_Ccublas; magmaFloatComplex *d_A, *d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float alpha = MAGMA_D_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "uplo = %c, transA = %c\n", opts.uplo, opts.transA ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; K = opts.ksize[i]; gflops = FLOPS_CHERK(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; } else { lda = An = K; Ak = N; } ldc = N; ldda = ((lda+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeC = ldc*N; TESTING_MALLOC( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC( h_Ccublas, magmaFloatComplex, ldc*N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*Ak ); TESTING_DEVALLOC( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCherk( opts.uplo, opts.transA, N, K, alpha, d_A, ldda, beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cherk( &opts.uplo, &opts.transA, &N, &K, &alpha, h_A, &lda, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clanhe("fro", &opts.uplo, &N, h_C, &ldc, work); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_clanhe( "fro", &opts.uplo, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE( h_A ); TESTING_FREE( h_C ); TESTING_FREE( h_Ccublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_C ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claset Code is very similar to testing_clacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magmaFloatComplex offdiag = MAGMA_C_MAKE( 1.2000, 6.7000 ); magmaFloatComplex diag = MAGMA_C_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldb, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("==================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldb = lda; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(magmaFloatComplex) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(magmaFloatComplex) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, magmaFloatComplex, size ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, size ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); //magmablas_claset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_claset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, lda ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work); printf("%4c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, dwork[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *work, *tau, *tauq, *taup; float *d, *e; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K vect side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgebrd_nb( m ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*nn ); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( d, float, min(mm,nn) ); TESTING_MALLOC_CPU( e, float, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, magmaFloatComplex, min(mm,nn) ); TESTING_MALLOC_CPU( taup, magmaFloatComplex, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_cgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_cgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) printf("magma_cgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) printf("magma_cunmbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, dwork ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, dwork ) / error; printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgels */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float gpu_error, cpu_error, error, Anorm, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; size = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_cgeqrf_nb(M); gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = -1; lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, NULL, &lda, NULL, NULL, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = M*nrhs; lapackf77_clarnv( &ione, ISEED, &size, h_B ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_clarnv( &ione, ISEED, &size, h_X ); //blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_cgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgels3_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_cgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_cgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_clange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; float magma_error, magma_err, Ynorm, work[1]; magma_int_t M, N, Xm, Ym, lda, ldda; magma_int_t sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magmaFloatComplex *h_A, *h_X, *h_Y, *h_Ymagma; magmaFloatComplex *d_A, *d_X, *d_Y; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magmaFloatComplex **A_array = NULL; magmaFloatComplex **X_array = NULL; magmaFloatComplex **Y_array = NULL; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; opts.lapack |= opts.check; //float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_CGEMV( M, N ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N*batchCount; sizeX = incx*Xm*batchCount; sizeY = incy*Ym*batchCount; ldda = ((lda+31)/32)*32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( h_Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( h_Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( d_Y, magmaFloatComplex, sizeY ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&X_array, batchCount * sizeof(*X_array)); magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array)); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeX, h_X ); lapackf77_clarnv( &ione, ISEED, &sizeY, h_Y ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( M, N*batchCount, h_A, lda, d_A, ldda ); magma_csetvector( Xm*batchCount, h_X, incx, d_X, incx ); magma_csetvector( Ym*batchCount, h_Y, incy, d_Y, incy ); cset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream); cset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream); cset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ); magmablas_cgemv_batched(opts.transA, M, N, alpha, A_array, ldda, X_array, incx, beta, Y_array, incy, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { blasf77_cgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, h_A + i*lda*N, &lda, h_X + i*Xm, &incx, &beta, h_Y + i*Ym, &incy ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma relative to lapack, // |C_magma - C_lapack| / |C_lapack| magma_error = 0.0; for(int s=0; s<batchCount; s++) { Ynorm = lapackf77_clange( "M", &M, &ione, h_Y + s*Ym, &incy, work ); blasf77_caxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione ); magma_err = lapackf77_clange( "M", &M, &ione, h_Ymagma + s*Ym, &incy, work ) / Ynorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); } printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e \n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error); } else { printf("%10d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Y ); TESTING_FREE_CPU( h_Ymagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_Y ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( X_array ); TESTING_FREE_DEV( Y_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmlq */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgelqf_nb( m, n ); ldc = m; // A is k x m (left) or k x n (right) mm = (side[iside] == MagmaLeft ? m : n); lda = k; gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for gelqf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); // this rounds it up slightly if needed to agree with lwork query lwork_max = int( real( magma_cmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*mm ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*mm; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute LQ factorization to get Householder vectors in A, tau magma_cgelqf( k, mm, A, lda, tau, W, lwork_max, &info ); if (info != 0) { printf("magma_cgelqf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmlq( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) { printf("magma_cunmlq (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_CUDA_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; /* Matrix size */ magma_int_t m, n, k; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t nsize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t ksize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t size; magma_int_t info; magma_int_t iseed[4] = {0,0,0,1}; printf( "Usage: %s -N m,n,k -c\n" " -N can be repeated %d times. m > 0, n > 0, k > 0 is required.\n" " If only m,n is given, then n=k. If only m is given, then m=n=k.\n" " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n", argv[0], MAXTESTS ); int checkres = (getenv("MAGMA_TESTINGS_CHECK") != NULL); int ntest = 0; magma_int_t nmax = 0; magma_int_t mmax = 0; magma_int_t kmax = 0; for( int i = 1; i < argc; i++ ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); info = sscanf( argv[++i], "%d,%d,%d", &m, &n, &k ); if ( info == 3 && m > 0 && n > 0 && k > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = k; } else if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = n; // implicitly } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly ksize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0, k > 0.\n", argv[i] ); exit(1); } mmax = max( mmax, msize[ntest] ); nmax = max( nmax, nsize[ntest] ); kmax = max( kmax, ksize[ntest] ); ntest++; } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; nmax = nsize[ntest-1]; mmax = msize[ntest-1]; kmax = ksize[ntest-1]; } m = mmax; n = nmax; k = kmax; assert( n > 0 && m > 0 && k > 0 ); magma_int_t nb = magma_get_cgeqrf_nb( m ); magma_int_t ldc = m; magma_int_t lda = max(m,n); ldc = ((ldc+31)/32)*32; lda = ((lda+31)/32)*32; // Allocate memory for matrices cuFloatComplex *C, *R, *A, *W, *tau; magma_int_t lwork = max( m*nb, n*nb ); magma_int_t lwork_max = lwork; TESTING_MALLOC( C, cuFloatComplex, ldc*n ); TESTING_MALLOC( R, cuFloatComplex, ldc*n ); TESTING_MALLOC( A, cuFloatComplex, lda*k ); TESTING_MALLOC( W, cuFloatComplex, lwork_max ); TESTING_MALLOC( tau, cuFloatComplex, k ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = msize[i]; n = nsize[i]; k = ksize[i]; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, iseed, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); //magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, iseed, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info ); if ( info != 0 ) printf("magma_cgeqrf returned error %d\n", info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d.\n", (int) info); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr returned error %d (lwork query).\n", (int) info); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", lwork, lwork_max ); gpu_time = magma_wtime(); magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d.\n", (int) info); //magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); }} // end iside, itran printf( "\n" ); } // end i // Memory clean up TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); // Shutdown TESTING_CUDA_FINALIZE(); return 0; }