/** Purpose ------- DGETF2_NOPIV computes an LU factorization of a general m-by-n matrix A without pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2 BLAS version of the algorithm. This is a CPU-only (not accelerated) version. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the m by n matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value - > 0: if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_aux ********************************************************************/ extern "C" magma_int_t magma_dgetf2_nopiv( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i_,j_) (A + (i_) + (j_)*lda) double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t min_mn, m_j, n_j; double inv_Ajj; magma_int_t i, j; double sfmin; A -= 1 + lda; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Compute machine safe minimum */ sfmin = lapackf77_dlamch("S"); min_mn = min(m,n); for (j = 1; j <= min_mn; ++j) { /* Test for singularity. */ if ( ! MAGMA_D_EQUAL( *A(j,j), c_zero)) { /* Compute elements J+1:M of J-th column. */ if (j < m) { if (MAGMA_D_ABS( *A(j,j) ) >= sfmin) { m_j = m - j; inv_Ajj = MAGMA_D_DIV(c_one, *A(j,j)); blasf77_dscal( &m_j, &inv_Ajj, A(j+1,j), &ione ); } else { m_j = m - j; for (i = 1; i <= m_j; ++i) { *A(j+i,j) = MAGMA_D_DIV( *A(j+i,j), *A(j,j) ); } } } } else if (*info == 0) { *info = j; } if (j < min_mn) { /* Update trailing submatrix. */ m_j = m - j; n_j = n - j; blasf77_dger( &m_j, &n_j, &c_neg_one, A(j+1,j), &ione, A(j,j+1), &lda, A(j+1,j+1), &lda ); } } return *info; } /* magma_dgetf2_nopiv */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *piv; magma_err_t err; double *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_DTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, double, lda*Ak ); TESTING_MALLOC( LU, double, lda*Ak ); TESTING_MALLOC( LUT, double, lda*Ak ); TESTING_MALLOC( h_B, double, ldb*N ); TESTING_MALLOC( h_B1, double, ldb*N ); TESTING_MALLOC( h_X1, double, ldb*N ); TESTING_MALLOC( h_X2, double, ldb*N ); TESTING_MALLOC( h_Bcublas, double, ldb*N ); TESTING_MALLOC( h_Bmagma, double, ldb*N ); TESTING_DEVALLOC( d_A, double, ldda*Ak ); TESTING_DEVALLOC( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, LU ); err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) ); assert( err == 0 ); lapackf77_dgetrf( &Ak, &Ak, LU, &lda, piv, &info ); int i, j; for(i=0;i<Ak;i++){ for(j=0;j<Ak;j++){ LUT[j+i*lda] = LU[i+j*lda]; } } lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda); if(opts.uplo == MagmaLower){ lapackf77_dlacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda); }else{ lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda); } lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(double)); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_dtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasDtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(double)); double alpha2 = MAGMA_D_DIV( c_one, alpha ); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); double norm1 = lapackf77_dlange( "M", &M, &N, h_X1, &ldb, work ); double normx = lapackf77_dlange( "M", &M, &N, h_Bmagma, &ldb, work ); double normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(double)); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_dlange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( LU ); TESTING_FREE( LUT ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_FREE( h_Bmagma ); TESTING_FREE( h_B1 ); TESTING_FREE( h_X1 ); TESTING_FREE( h_X2 ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }