// On input, LU and ipiv is LU factorization of A. On output, LU is overwritten. // Works for any m, n. // Uses init_matrix() to re-generate original A as needed. // Returns error in factorization, |PA - LU| / (n |A|) // This allocates 3 more matrices to store A, L, and U. double get_LU_error(magma_int_t M, magma_int_t N, magmaDoubleComplex *LU, magma_int_t lda, magma_int_t *ipiv) { magma_int_t min_mn = min(M,N); magma_int_t ione = 1; magma_int_t i, j; magmaDoubleComplex alpha = MAGMA_Z_ONE; magmaDoubleComplex beta = MAGMA_Z_ZERO; magmaDoubleComplex *A, *L, *U; double work[1], matnorm, residual; TESTING_MALLOC_CPU( A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( L, magmaDoubleComplex, M*min_mn ); TESTING_MALLOC_CPU( U, magmaDoubleComplex, min_mn*N ); memset( L, 0, M*min_mn*sizeof(magmaDoubleComplex) ); memset( U, 0, min_mn*N*sizeof(magmaDoubleComplex) ); // set to original A init_matrix( M, N, A, lda ); lapackf77_zlaswp( &N, A, &lda, &ione, &min_mn, ipiv, &ione); // copy LU to L and U, and set diagonal to 1 lapackf77_zlacpy( MagmaLowerStr, &M, &min_mn, LU, &lda, L, &M ); lapackf77_zlacpy( MagmaUpperStr, &min_mn, &N, LU, &lda, U, &min_mn ); for (j=0; j < min_mn; j++) L[j+j*M] = MAGMA_Z_MAKE( 1., 0. ); matnorm = lapackf77_zlange("f", &M, &N, A, &lda, work); blasf77_zgemm("N", "N", &M, &N, &min_mn, &alpha, L, &M, U, &min_mn, &beta, LU, &lda); for( j = 0; j < N; j++ ) { for( i = 0; i < M; i++ ) { LU[i+j*lda] = MAGMA_Z_SUB( LU[i+j*lda], A[i+j*lda] ); } } residual = lapackf77_zlange("f", &M, &N, LU, &lda, work); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( L ); TESTING_FREE_CPU( U ); return residual / (matnorm * N); }
/* Task execution code */ static void SCHED_zgemm(Quark* quark) { int M; int N; int K; cuDoubleComplex *A1; int LDA; cuDoubleComplex *A2; cuDoubleComplex *A3; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; cuDoubleComplex one = MAGMA_Z_ONE; quark_unpack_args_7(quark, M, N, K, A1, LDA, A2, A3); blasf77_zgemm("n", "n", &M, &N, &K, &mone, A1, &LDA, A2, &LDA, &one, A3, &LDA); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zposv_batched */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double err = 0.0, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex_ptr d_A, d_B; magma_int_t *dinfo_array; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount = 1; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dB_array = NULL; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; batchCount = opts.batchcount ; printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("BatchCount N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_ZPOTRF( N) + FLOPS_ZPOTRS( N, nrhs ) ) / 1e9 * batchCount; sizeA = lda*N*batchCount; sizeB = ldb*nrhs*batchCount; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( work, double, N); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs*batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array)); magma_malloc((void**)&dB_array, batchCount * sizeof(*dB_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); for(int i=0; i<batchCount; i++) { magma_zmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_zsetmatrix( N, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetmatrix( N, nrhs*batchCount, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ zset_pointer(dA_array, d_A, ldda, 0, 0, ldda*N, batchCount, queue); zset_pointer(dB_array, d_B, lddb, 0, 0, lddb*nrhs, batchCount, queue); gpu_time = magma_wtime(); info = magma_zposv_batched(opts.uplo, N, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_zposv_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_zposv_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_zgetmatrix( N, nrhs*batchCount, d_B, lddb, h_X, ldb ); for(magma_int_t s=0; s<batchCount; s++) { Anorm = lapackf77_zlange("I", &N, &N, h_A + s * lda * N, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X + s * ldb * nrhs, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A + s * lda * N, &lda, h_X + s * ldb * nrhs, &ldb, &c_neg_one, h_B + s * ldb * nrhs, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B + s * ldb * nrhs, &ldb, work); double error = Rnorm/(N*Anorm*Xnorm); if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(err, error); } status += ! (err < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(magma_int_t s=0; s<batchCount; s++) { lapackf77_zposv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A + s * lda * N, &lda, h_B + s * ldb * nrhs, &ldb, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zposv returned err %d: %s.\n", (int) info, magma_strerror( info )); printf( "%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } else { printf( "%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( dinfo_array ); magma_free(dA_array); magma_free(dB_array); free(cpu_info); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
magma_int_t magma_ztrevc3( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in Fortran magma_int_t n, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *VL, magma_int_t ldvl, magmaDoubleComplex *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info ) { #define T(i,j) ( T + (i) + (j)*ldt ) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define work(i,j) (work + (i) + (j)*n) // .. Parameters .. const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magma_int_t nbmin = 16, nbmax = 128; const magma_int_t ione = 1; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, rightv, somev; magma_int_t i, ii, is, j, k, ki, iv, n2, nb, nb2, version; double ovfl, remax, scale, smin, smlnum, ulp, unfl; // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); // Set mout to the number of columns required to store the selected // eigenvectors. if ( somev ) { *mout = 0; for( j=0; j < n; ++j ) { if ( select[j] ) { *mout += 1; } } } else { *mout = n; } *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( mm < *mout ) *info = -11; else if ( lwork < max( 1, 2*n ) ) *info = -14; if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x. // (Compared to dtrevc3, rwork stores 1-norms.) // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_zlaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Set the constants to control overflow. unfl = lapackf77_dlamch( "Safe minimum" ); ovfl = 1. / unfl; lapackf77_dlabad( &unfl, &ovfl ); ulp = lapackf77_dlamch( "Precision" ); smlnum = unfl*( n / ulp ); // Store the diagonal elements of T in working array work. for( i=0; i < n; ++i ) { *work(i,0) = *T(i,i); } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. rwork[0] = 0.; for( j=1; j < n; ++j ) { rwork[j] = cblas_dzasum( j, T(0,j), ione ); } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=nb, goes down to 1. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*( MAGMA_Z_ABS1( *T(ki,ki) ) ), smlnum ); // -------------------------------------------------------- // Complex right eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k=0; k < ki; ++k ) { *work(k,iv) = -(*T(k,ki)); } // Solve upper triangular system: // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work. for( k=0; k < ki; ++k ) { *T(k,k) -= *T(ki,ki); if ( MAGMA_Z_ABS1( *T(k,k) ) < smin ) { *T(k,k) = MAGMA_Z_MAKE( smin, 0. ); } } if ( ki > 0 ) { lapackf77_zlatrs( "Upper", "No transpose", "Non-unit", "Y", &ki, T, &ldt, work(0,iv), &scale, rwork, info ); *work(ki,iv) = MAGMA_Z_MAKE( scale, 0. ); } // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize n2 = ki+1; blasf77_zcopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_izamax( &n2, VR(0,is), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,is) ); blasf77_zdscal( &n2, &remax, VR(0,is), &ione ); for( k=ki+1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { blasf77_zgemv( "n", &n, &ki, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_izamax( &n, VR(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,ki) ); blasf77_zdscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki+1; k < n; ++k ) { *work(k,iv) = c_zero; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == 1) || (ki == 0) ) { time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki+nb-iv+1; blasf77_zgemm( "n", "n", &n, &nb2, &n2, &c_one, VR, &ldvr, work(0,iv ), &n, &c_zero, work(0,nb+iv), &n ); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k = iv; k <= nb; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // blocked back-transform // Restore the original diagonal elements of T. for( k=0; k <= ki - 1; ++k ) { *T(k,k) = *work(k,0); } is -= 1; } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; is = 0; for( ki=0; ki < n; ++ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex left eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k = ki + 1; k < n; ++k ) { *work(k,iv) = -MAGMA_Z_CNJG( *T(ki,k) ); } // Solve conjugate-transposed triangular system: // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work. for( k = ki + 1; k < n; ++k ) { *T(k,k) -= *T(ki,ki); if ( MAGMA_Z_ABS1( *T(k,k) ) < smin ) { *T(k,k) = MAGMA_Z_MAKE( smin, 0. ); } } if ( ki < n-1 ) { n2 = n-ki-1; lapackf77_zlatrs( "Upper", "Conjugate transpose", "Non-unit", "Y", &n2, T(ki+1,ki+1), &ldt, work(ki+1,iv), &scale, rwork, info ); *work(ki,iv) = MAGMA_Z_MAKE( scale, 0. ); } // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize n2 = n-ki; blasf77_zcopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_izamax( &n2, VL(ki,is), &ione ) + ki - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,is) ); blasf77_zdscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. if ( ki < n-1 ) { n2 = n-ki-1; blasf77_zgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_izamax( &n, VL(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,ki) ); blasf77_zdscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == nb) || (ki == n-1) ) { n2 = n-(ki+1)+iv; blasf77_zgemm( "n", "n", &n, &iv, &n2, &c_one, VL(0,ki-iv+1), &ldvl, work(ki-iv+1,1 ), &n, &c_zero, work(0, nb+1), &n ); // normalize vectors for( k=1; k <= iv; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform // Restore the original diagonal elements of T. for( k = ki + 1; k < n; ++k ) { *T(k,k) = *work(k,0); } is += 1; } } return *info; } // End of ZTREVC
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |L - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_zgeqlf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result, following zqlt01 except using the reduced Q. This works for any M,N (square, tall, wide). =================================================================== */ if ( opts.check ) { magma_int_t ldq = M; magma_int_t ldl = min_mn; magmaDoubleComplex *Q, *L; double *work; TESTING_MALLOC_CPU( Q, magmaDoubleComplex, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( L, magmaDoubleComplex, ldl*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // copy M by K matrix V to Q (copying diagonal, which isn't needed) and // copy K by N matrix L lapackf77_zlaset( "Full", &min_mn, &N, &c_zero, &c_zero, L, &ldl ); if ( M >= N ) { // for M=5, N=3: A = [ V V V ] <= V full block (M-N by K) // K=N [ V V V ] // [ ----- ] // [ L V V ] <= V triangle (N by K, copying diagonal too) // [ L L V ] <= L triangle (K by N) // [ L L L ] magma_int_t M_N = M - N; lapackf77_zlacpy( "Full", &M_N, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_zlacpy( "Upper", &N, &min_mn, &h_R[M_N], &lda, &Q[M_N], &ldq ); lapackf77_zlacpy( "Lower", &min_mn, &N, &h_R[M_N], &lda, L, &ldl ); } else { // for M=3, N=5: A = [ L L | L V V ] <= V triangle (K by K) // K=M [ L L | L L V ] <= L triangle (K by M) // [ L L | L L L ] // ^^^============= L full block (K by N-M) magma_int_t N_M = N - M; lapackf77_zlacpy( "Upper", &M, &min_mn, &h_R[N_M*lda], &lda, Q, &ldq ); lapackf77_zlacpy( "Full", &min_mn, &N_M, h_R, &lda, L, &ldl ); lapackf77_zlacpy( "Lower", &min_mn, &M, &h_R[N_M*lda], &lda, &L[N_M*ldl], &ldl ); } // generate M by K matrix Q, where K = min(M,N) lapackf77_zungql( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // error = || L - Q^H*A || / (N * ||A||) blasf77_zgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, L, &ldl ); Anorm = lapackf77_zlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_zlange( "1", &min_mn, &N, L, &ldl, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set L = I (K by K identity), then L = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_zlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, L, &ldl ); blasf77_zherk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, L, &ldl ); error2 = lapackf77_zlanhe( "1", "Upper", &min_mn, L, &ldl, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( L ); L = NULL; TESTING_FREE_CPU( work ); work = NULL; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgeqlf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zposv_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex *d_A, *d_B; magma_int_t N, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*opts.nrhs ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); magma_zmake_hpd( N, h_A, lda ); magma_zsetmatrix( N, N, h_A, N, d_A, ldda ); magma_zsetmatrix( N, opts.nrhs, h_B, N, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zposv_gpu( opts.uplo, N, opts.nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ magma_zgetmatrix( N, opts.nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_zlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_zlange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_zlange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default lddb = ldda; gflops = ( FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, lddb, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_zgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb, opts.queue ); Anorm = lapackf77_zlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetri_batched */ int main( int argc, char** argv) { TESTING_INIT(); // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work; magmaDoubleComplex_ptr d_A, d_invA; magmaDoubleComplex_ptr *dA_array; magmaDoubleComplex_ptr *dinvA_array; magma_int_t **dipiv_array; magma_int_t *dinfo_array; magma_int_t *ipiv, *cpu_info; magma_int_t *d_ipiv, *d_info; magma_int_t N, n2, lda, ldda, info, info1, info2, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t columns; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); magma_int_t batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N * batchCount; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default // This is the correct flops but since this getri_batched is based on // 2 trsm = getrs and to know the real flops I am using the getrs one //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount; gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); columns = N * batchCount; lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda ); magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue); info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue ); for (magma_int_t i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] ); } } if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 )); if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (int i=0; i < batchCount; i++) { magma_int_t locinfo; lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo); if (locinfo != 0) { printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo ); if (locinfo != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue ); magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue ); error = 0; for (magma_int_t i=0; i < batchCount; i++) { for (magma_int_t k=0; k < N; k++) { if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) { printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]); error = -1; } } if (error == -1) { break; } // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond, err; normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork ); normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; err = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda ); blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda ); err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork ); err = err * rcond / N; } if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf("\n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_invA ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_info ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dinvA_array ); TESTING_FREE_DEV( dinfo_array ); TESTING_FREE_DEV( dipiv_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv */ int main(int argc, char **argv) { real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_LU, *h_B, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( device[ opts.device ], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[ opts.device ], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; ldb = lda; gflops = ( FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_LU, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_zlacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_zlacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info, queue ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== Anorm = lapackf77_zlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; double *rwork, *w1, *w2; double result[4] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, nb, lwork, liwork, lda, lrwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.; double d_neg_one = -1.; //double d_ten = 10.; //magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time(sec)\n"); printf("======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = N*lda; nb = magma_get_zhetrd_nb(N); lwork = 2*N*nb + N*N; lrwork = 1 + 5*N +2*N*N; liwork = 3 + 5*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( rwork, double, lrwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); //lapackf77_zlatms( &N, &N, "U", ISEED, "P", w1, &five, &d_ten, // &d_one, &N, &N, lapack_uplo_const(opts.uplo), h_B, &lda, h_work, &info); //lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_B, &lda); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); /* warmup */ if ( opts.warmup ) { magma_zhegvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zhegvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | I - V V' B | / ( N ) (itype = 1,2) | B - V V' | / ( |B| N ) (itype = 3) (3) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; //magmaDoubleComplex *tau; if ( opts.itype == 1 || opts.itype == 2 ) { lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_S, &lda); blasf77_zgemm("N", "C", &N, &N, &N, &c_one, h_R, &lda, h_R, &lda, &c_zero, h_work, &N); blasf77_zhemm("R", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_work, &N, &c_one, h_S, &lda); result[1] = lapackf77_zlange("1", &N, &N, h_S, &lda, rwork) / N; } else if ( opts.itype == 3 ) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda); blasf77_zherk(lapack_uplo_const(opts.uplo), "N", &N, &N, &d_neg_one, h_R, &lda, &d_one, h_S, &lda); result[1] = lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_S, &lda, rwork) / N / lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_B, &lda, rwork); } result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork); if ( opts.itype == 1 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &N, h_work, &lda, rwork)/N; } else if ( opts.itype == 2 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork)/N; } else if ( opts.itype == 3 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork)/N; } /* lapackf77_zhet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, tau, h_work, rwork, &result[0] ); */ lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); magma_zhegvd( opts.itype, MagmaNoVec, opts.uplo, N, h_R, lda, h_S, lda, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zhegvd( &opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if ( opts.itype==1 ) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==2 ) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==3 ) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } if ( opts.itype==1 || opts.itype==2 ) { printf("(2) | I - Z Z' B | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } else { printf("(2) | B - Z Z' | / (|B| N) = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } printf( "(3) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( rwork ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Cmagma, *h_Ccublas; magmaDoubleComplex *d_A, *d_B, *d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "transA = %c, transB = %c\n", opts.transA, opts.transB ); printf(" M N K MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=========================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; K = opts.ksize[i]; gflops = FLOPS_ZGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC( h_A, magmaDoubleComplex, lda*An ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*Bn ); TESTING_MALLOC( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( h_Cmagma, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( h_Ccublas, magmaDoubleComplex, ldc*N ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*An ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*Bn ); TESTING_DEVALLOC( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_zsetmatrix( Am, An, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bm, Bn, h_B, ldb, d_B, lddb ); magma_zsetmatrix( M, N, h_C, ldc, d_C, lddc ); magma_time = magma_sync_wtime( NULL ); magmablas_zgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( M, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasZgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( M, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zgemm( &opts.transA, &opts.transB, &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &M, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_zlange( "M", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_zlange( "M", &M, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { // compute relative error for magma, relative to cublas Cnorm = lapackf77_zlange( "M", &M, &N, h_Ccublas, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_Ccublas, &ione, h_Cmagma, &ione ); magma_error = lapackf77_zlange( "M", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e ---\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error ); } TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_FREE( h_C ); TESTING_FREE( h_Cmagma ); TESTING_FREE( h_Ccublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); TESTING_DEVFREE( d_C ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv_gpu */ int main(int argc , char **argv) { real_Double_t gflops, gpu_perf, gpu_time; double Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex *hA, *hB, *hX; magmaDoubleComplex_ptr dA, dB; magma_int_t *ipiv; magma_int_t N = 0, n2, lda, ldb, ldda, lddb; magma_int_t size[7] = { 1024, 2048, 3072, 4032, 5184, 6048, 7000}; magma_int_t i, info, szeB; magmaDoubleComplex z_one = MAGMA_Z_ONE; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t NRHS = 100; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-R", argv[i])==0) NRHS = atoi(argv[++i]); } if (N>0) size[0] = size[6] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgesv_gpu -N <matrix size> -R <right hand sides>\n\n"); } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[6]; n2 = N * N; ldda = ((N+31)/32) * 32; // ldda = N; lddb = ldda; TESTING_MALLOC_PIN( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( hB, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( hX, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( work, double, N ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB, magmaDoubleComplex, lddb*NRHS ); printf("\n\n"); printf(" N NRHS GPU GFlop/s (sec) ||B - AX|| / ||A||*||X||\n"); printf("===========================================================\n"); for( i = 0; i < 7; i++ ) { N = size[i]; lda = N; ldb = lda; n2 = lda*N; szeB = ldb*NRHS; ldda = ((N+31)/32)*32; //ldda = N; lddb = ldda; gflops = ( FLOPS_GETRF( (double)N, (double)N ) + FLOPS_GETRS( (double)N, (double)NRHS ) ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlarnv( &ione, ISEED, &szeB, hB ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); //===================================================================== // Solve Ax = b through an LU factorization //===================================================================== magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); gpu_time = magma_wtime(); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zposv had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Residual =================================================================== */ magma_zgetmatrix( N, NRHS, dB, 0, lddb, hX, 0, ldb, queue ); Anorm = lapackf77_zlange("I", &N, &N, hA, &lda, work); Xnorm = lapackf77_zlange("I", &N, &NRHS, hX, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &NRHS, &N, &z_one, hA, &lda, hX, &ldb, &mz_one, hB, &ldb ); Rnorm = lapackf77_zlange("I", &N, &NRHS, hB, &ldb, work); printf( "%5d %5d %7.2f (%7.2f) %8.2e\n", N, NRHS, gpu_perf, gpu_time, Rnorm/(Anorm*Xnorm) ); if (argc != 1) break; } /* clean up */ TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( hB ); TESTING_FREE_PIN( hX ); TESTING_FREE_PIN( work ); TESTING_FREE_PIN( ipiv ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dB ); magma_queue_destroy( queue ); magma_finalize(); }
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX_16 array of pointers on the GPU, dimension (num_gpus) On entry, the Hermitian matrix dA distributed over GPUs (dl_A[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_mgpu_right(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (num_streams > 1 ? 1+((i)/nb)%(num_streams-1) : 0) magmaDoubleComplex z_one = MAGMA_Z_MAKE( 1.0, 0.0 ); magmaDoubleComplex mz_one = MAGMA_Z_MAKE( -1.0, 0.0 ); double one = 1.0; double m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows, num_streams = 5; magmaDoubleComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; magmaDoubleComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, ngpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t stream[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_zpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. magma_zgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_zpotrf( uplo_, &n, panel, &ldpanel, info); magma_zsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < num_gpus; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / num_gpus) * nb; if (d < (n / nb) % num_gpus) n_local[d] += nb; else if (d == (n / nb) % num_gpus) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_zmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < num_streams; j++ ) { magma_queue_create( &stream[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (num_gpus == 4) crosspoint = n; else if (num_gpus == 3) crosspoint = n; else if (num_gpus == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = get_time(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_zgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, stream[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = get_time()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % num_gpus; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( stream[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = get_time(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? tmppanel0 : tmppanel1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = get_time() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = get_time(); #endif lapackf77_zpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = get_time() - tcchol; ttot_cchol += tcchol; tctrsm = get_time(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_ztrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = get_time() - tctrsm; ttot_ctrsm += tctrsm; tctm = get_time(); #endif d = (id + 1) % num_gpus; // send current panel to gpus for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_zsetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, stream[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % num_gpus; for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } #if defined (ENABLE_TIMER) tctm = get_time() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % num_gpus; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * num_gpus) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = get_time(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream(stream[d][STREAM_ID(j_local2)]); #define ZHERK_ON_DIAG #ifdef ZHERK_ON_DIAG magma_zherk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = get_time() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( stream[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_zgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, stream[d][0] ); tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < num_gpus; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) therk[d] = get_time(); #endif //magmablasSetKernelStream(stream[d]); //magma_zherk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef ZHERK_ON_DIAG magma_zherk_mgpu #else magma_zherk_mgpu2 #endif (num_gpus, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, num_streams, stream ); #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = get_time() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; prevj = j; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % num_gpus, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( id=0; id < num_streams; id++ ) { magma_queue_sync( stream[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % num_gpus; // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = get_time(); #endif magma_zgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_zpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_zsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = get_time() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < num_streams; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( d_lP[d] ); } } // end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zpotrf_mgpu_right */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zcgeqrsv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs; double error, gpu_error, cpu_error, Anorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R; magmaDoubleComplex_ptr d_A, d_B, d_X, d_T; magmaFloatComplex *d_SA, *d_SB; magmaDoubleComplex *h_workd, *tau, tmp[1]; magmaFloatComplex *h_works; magma_int_t lda, ldb, lhwork, lworkgpu; magma_int_t ldda, lddb, lddx; magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS double double single mixed Iter CPU GPU \n"); printf("=============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32) * 32; lddb = ((max_mn+31)/32)*32; lddx = ((N+31)/32) * 32; nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) ); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork ); h_works = (magmaFloatComplex*)h_workd; TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs ); TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb ); /* Initialize the matrices */ size = lda*N; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcgeqrsv_gpu( M, N, nrhs, d_A, ldda, d_B, lddb, d_X, lddx, &qrsv_iters, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zcgeqrsv returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb ); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_workd, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfd = gflops / gpu_time; //===================================================================== // Single Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); /* The allocation of d_SA and d_SB is done here to avoid * to double the memory used on GPU with zcgeqrsv */ TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs ); magmablas_zlag2c( M, N, d_A, ldda, d_SA, ldda, &info ); magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info ); gpu_time = magma_wtime(); magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda, d_SB, lddb, h_works, lhwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfs = gflops / gpu_time; TESTING_FREE_DEV( d_SA ); TESTING_FREE_DEV( d_SB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f %7.2f %7.2f %7.2f %4d %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, (int) nrhs, cpu_perf, gpu_perfd, gpu_perfs, gpu_perf, (int) qrsv_iters, cpu_error, gpu_error, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_T ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgetrf_nopiv(magma_int_t *m, magma_int_t *n, magmaDoubleComplex *a, magma_int_t *lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGETRF_NOPIV computes an LU factorization of a general M-by-N matrix A without pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t a_dim1, a_offset, min_mn, i__3, i__4; magma_int_t j, jb, nb, iinfo; a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; /* Function Body */ *info = 0; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (*m == 0 || *n == 0) { return *info; } /* Determine the block size for this environment. */ nb = 128; min_mn = min(*m,*n); if (nb <= 1 || nb >= min_mn) { /* Use unblocked code. */ magma_zgetf2_nopiv(m, n, &a[a_offset], lda, info); } else { /* Use blocked code. */ for (j = 1; j <= min_mn; j += nb) { /* Computing MIN */ i__3 = min_mn - j + 1; jb = min(i__3,nb); /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ i__3 = *m - j + 1; //magma_zgetf2_nopiv(&i__3, &jb, &a[j + j * a_dim1], lda, &iinfo); i__3 -= jb; magma_zgetf2_nopiv(&jb, &jb, &a[j + j * a_dim1], lda, &iinfo); blasf77_ztrsm("R", "U", "N", "N", &i__3, &jb, &c_one, &a[j + j * a_dim1], lda, &a[j + jb + j * a_dim1], lda); /* Adjust INFO */ if (*info == 0 && iinfo > 0) *info = iinfo + j - 1; if (j + jb <= *n) { /* Compute block row of U. */ i__3 = *n - j - jb + 1; blasf77_ztrsm("Left", "Lower", "No transpose", "Unit", &jb, &i__3, &c_one, &a[j + j * a_dim1], lda, &a[j + (j+jb)*a_dim1], lda); if (j + jb <= *m) { /* Update trailing submatrix. */ i__3 = *m - j - jb + 1; i__4 = *n - j - jb + 1; blasf77_zgemm("No transpose", "No transpose", &i__3, &i__4, &jb, &c_neg_one, &a[j + jb + j * a_dim1], lda, &a[j + (j + jb) * a_dim1], lda, &c_one, &a[j + jb + (j + jb) * a_dim1], lda); } } } } return *info; } /* magma_zgetrf_nopiv */
virtual void run() { blasf77_zgemm( lapack_trans_const(transA), lapack_trans_const(transB), &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc ); }
/** Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX_16 array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX_16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex *auxv, magmaDoubleComplex *F, magma_int_t ldf, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dA + (i) + (j)*(ldda)) #define F(i, j) (F + (i) + (j)*(ldf )) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; double d__1; magmaDoubleComplex z__1; magma_int_t j, k, rk; magmaDoubleComplex Akk; magma_int_t pvt; double temp, temp2, tol3z; magma_int_t itemp; magma_int_t lsticc; magma_int_t lastrk; lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); lsticc = 0; k = 0; while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran idamax; pvt, k are 0-based. i__1 = n-k; pvt = k + blasf77_idamax( &i__1, &vn1[k], &ione ) - 1; if (pvt != k) { if (pvt >= nb) { /* 1. Start copy from GPU */ magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, queue ); } /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; vn1[pvt] = vn1[k]; vn2[pvt] = vn2[k]; if (pvt < nb) { /* no need of transfer if pivot is within the panel */ blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { /* 1. Finish copy from GPU */ magma_queue_sync( queue ); /* 2. Swap as usual on CPU */ blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); /* 3. Restore the GPU */ magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, queue ); } } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CONJ( *F(k,j) ); } #endif i__1 = m - rk; i__2 = k; blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CONJ( *F(k,j) ); } #endif } /* Generate elementary reflector H(k). */ if (rk < m-1) { i__1 = m - rk; lapackf77_zlarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] ); } else { lapackf77_zlarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] ); } Akk = *A(rk, k); *A(rk, k) = c_one; /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) magma_int_t i__3 = nb-k-1; magma_int_t i__4 = i__2 - i__3; magma_int_t i__5 = nb-k; magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, tau[k], dA(rk +i__5, k+1+i__3), ldda, dA(rk +i__5, k ), ione, c_zero, dF(k+1+i__3, k ), ione, queue ); magma_zgetmatrix_async( i__2-i__3, 1, dF(k + 1 +i__3, k), i__2, F (k + 1 +i__3, k), i__2, queue ); blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, &tau[k], A(rk, k+1), &lda, A(rk, k ), &ione, &c_zero, F(k+1, k ), &ione ); magma_queue_sync( queue ); blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, &tau[k], A(rk, k+1+i__3), &lda, A(rk, k ), &ione, &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. */ for (j = 0; j < k; ++j) { *F(j, k) = c_zero; } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */ if (k > 0) { i__1 = m - rk; i__2 = k; z__1 = MAGMA_Z_NEGATE( tau[k] ); blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, &z__1, A(rk, 0), &lda, A(rk, k), &ione, &c_zero, auxv, &ione ); i__1 = k; blasf77_zgemv( MagmaNoTransStr, &n, &i__1, &c_one, F(0,0), &ldf, auxv, &ione, &c_one, F(0,k), &ione ); } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, &c_neg_one, A(rk, 0 ), &lda, F(k+1,0 ), &ldf, &c_one, A(rk, k+1), &lda ); } /* Update partial column norms. */ if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { /* NOTE: The following 4 lines follow from the analysis in Lapack Working Note 176. */ temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } } *A(rk, k) = Akk; ++k; } // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU */ magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2, queue ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), i__2, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) { vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione ); } else { /* Where is the data, CPU or GPU ? */ double r1, r2; r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_dznrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue ); //vn1[lsticc] = magma_dznrm2( i__1, dA(rk + 1, lsticc), ione, queue ); vn1[lsticc] = magma_dsqrt(r1*r1 + r2*r2); } /* NOTE: The computation of VN1( LSTICC ) relies on the fact that SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) */ vn2[lsticc] = vn1[lsticc]; lsticc = itemp; } magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_zlaqps */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrs */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, matnorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *d_B; magma_int_t M, N, n2, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===============================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; if ( M < N ) { printf( "skipping M=%d, N=%d because M < N is not yet supported.\n", (int) M, (int) N ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; n2 = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; // query for workspace size lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; lhwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = -1; lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_work, magmaDoubleComplex, lhwork ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS n2 = M*nrhs; lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //n2 = N*nrhs; //lapackf77_zlarnv( &ione, ISEED, &n2, h_X ); //blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*matnorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*matnorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error ); printf("%s\n", (gpu_error < tol ? "" : " failed")); status |= ! (gpu_error < tol); TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev; magmaDoubleComplex_ptr d_A, d_B, d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); #ifdef HAVE_CUBLAS // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK printf("If running lapack (option --lapack), MAGMA and %s error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n", g_platform_str, g_platform_str ); printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else // for others, we need LAPACK for check opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_ZGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*An ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cdev, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*An ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); magma_zsetmatrix( Am, An, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bm, Bn, h_B, ldb, d_B, lddb ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetmatrix( M, N, h_C, ldc, d_C, lddc ); magma_time = magma_sync_wtime( NULL ); magmablas_zgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc ); #endif /* ===================================================================== Performs operation using CUBLAS / clBLAS / Xeon Phi MKL =================================================================== */ magma_zsetmatrix( M, N, h_C, ldc, d_C, lddc ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( NULL ); cublasZgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); dev_time = magma_sync_wtime( NULL ) - dev_time; #else dev_time = magma_sync_wtime( opts.queue ); magma_zgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_zgetmatrix( M, N, d_C, lddc, h_Cdev, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & dev, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "F", &M, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione ); dev_error = lapackf77_zlange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm; #ifdef HAVE_CUBLAS blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_zlange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif } else { #ifdef HAVE_CUBLAS // compute relative error for magma, relative to dev (currently only with CUDA) Cnorm = lapackf77_zlange( "F", &M, &N, h_Cdev, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione ); magma_error = lapackf77_zlange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time ); #endif } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Cdev ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }