/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf=0., cublas_time=0., cpu_perf=0, cpu_time=0; float error; magma_int_t cublas_enable = 0; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *dA_magma; magmaFloatComplex **dA_array = NULL; magma_int_t **dipiv_array = NULL; magma_int_t *ipiv, *cpu_info; magma_int_t *dipiv_magma, *dinfo_magma; magma_int_t M, N, n2, lda, ldda, min_mn, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t batchCount; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); //opts.lapack |= opts.check; batchCount = opts.batchcount; magma_int_t columns; float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% BatchCount M N CPU Gflop/s (ms) MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("%%==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N * batchCount; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_CGETRF( M, N ) / 1e9 * batchCount; TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( dA_magma, magmaFloatComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( dipiv_magma, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaFloatComplex*, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); // make A diagonally dominant, to not need pivoting for( int s=0; s < batchCount; ++s ) { for( int i=0; i < min_mn; ++i ) { h_A[ i + i*lda + s*lda*N ] = MAGMA_C_MAKE( MAGMA_C_REAL( h_A[ i + i*lda + s*lda*N ] ) + N, MAGMA_C_IMAG( h_A[ i + i*lda + s*lda*N ] )); } } columns = N * batchCount; lapackf77_clacpy( MagmaFullStr, &M, &columns, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, columns, h_R, lda, dA_magma, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_cset_pointer( dA_array, dA_magma, ldda, 0, 0, ldda*N, batchCount, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); info = magma_cgetrf_nopiv_batched( M, N, dA_array, ldda, dinfo_magma, batchCount, opts.queue); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for (int i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_cgetrf_batched matrix %d returned internal error %d\n", i, (int)cpu_info[i] ); } } if (info != 0) { printf("magma_cgetrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for (int i=0; i < batchCount; i++) { lapackf77_cgetrf(&M, &N, h_A + i*lda*N, &lda, ipiv + i * min_mn, &info); assert( info == 0 ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, cpu_perf, cpu_time*1000., magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } else { printf("%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } if ( opts.check ) { // initialize ipiv to 1, 2, 3, ... for (int i=0; i < batchCount; i++) { for (int k=0; k < min_mn; k++) { ipiv[i*min_mn+k] = k+1; } } magma_cgetmatrix( M, N*batchCount, dA_magma, ldda, h_A, lda ); error = 0; for (int i=0; i < batchCount; i++) { float err; err = get_LU_error( M, N, h_R + i * lda*N, lda, h_A + i * lda*N, ipiv + i * min_mn); if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf(" --- \n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( dA_magma ); TESTING_FREE_DEV( dinfo_magma ); TESTING_FREE_DEV( dipiv_magma ); TESTING_FREE_DEV( dipiv_array ); TESTING_FREE_DEV( dA_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cgeqrf_batched( magma_int_t m, magma_int_t n, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **dtau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column /* Local Parameter */ magma_int_t nb = magma_get_cgeqrf_batched_nb(m); magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) return arginfo; magmaFloatComplex *dT = NULL; magmaFloatComplex *dR = NULL; magmaFloatComplex **dR_array = NULL; magmaFloatComplex **dT_array = NULL; magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); magma_int_t lddt = min(nb, min_mn); magma_int_t lddr = min(nb, min_mn); magma_cmalloc(&dR, lddr * lddr * batchCount); magma_cmalloc(&dT, lddt * lddt * batchCount); /* check allocation */ if ( dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL ) { magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_cset_pointer( dR_array, dR, lddr, 0, 0, lddr*min(nb, min_mn), batchCount, queue ); magma_cset_pointer( dT_array, dT, lddt, 0, 0, lddt*min(nb, min_mn), batchCount, queue ); arginfo = magma_cgeqrf_expert_batched(m, n, dA_array, ldda, dR_array, lddr, dT_array, lddt, dtau_array, 0, info_array, batchCount, queue); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); return arginfo; }
/***************************************************************************//** Purpose ------- CPOTRS solves a system of linear equations A*X = B with a Hermitian positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA_array Array of pointers, dimension (batchCount). Each is a COMPLEX array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by CPOTRF. @param[in] ldda INTEGER The leading dimension of each array A. LDDA >= max(1,N). @param[in,out] dB_array Array of pointers, dimension (batchCount). Each is a COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, each pointer is a right hand side matrix B. On exit, the corresponding solution matrix X. @param[in] lddb INTEGER The leading dimension of each array B. LDDB >= max(1,N). @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. @ingroup magma_potrs_batched *******************************************************************************/ extern "C" magma_int_t magma_cpotrs_batched( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue) { magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) info = -1; if ( n < 0 ) info = -2; if ( nrhs < 0) info = -3; if ( ldda < max(1, n) ) info = -5; if ( lddb < max(1, n) ) info = -7; if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return info; } magmaFloatComplex **dW1_displ = NULL; magmaFloatComplex **dW2_displ = NULL; magmaFloatComplex **dW3_displ = NULL; magmaFloatComplex **dW4_displ = NULL; magmaFloatComplex **dinvA_array = NULL; magmaFloatComplex **dwork_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = magma_roundup( n, CTRTRI_BATCHED_NB )*CTRTRI_BATCHED_NB; magma_int_t dwork_msize = n*nrhs; magmaFloatComplex* dinvA = NULL; magmaFloatComplex* dwork = NULL; // dinvA and dwork are workspace in ctrsm magma_cmalloc( &dinvA, invA_msize * batchCount); magma_cmalloc( &dwork, dwork_msize * batchCount ); /* check allocation */ if ( dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL ) { magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_claset_q( MagmaFull, invA_msize, batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dinvA, invA_msize, queue ); magmablas_claset_q( MagmaFull, dwork_msize, batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dwork, dwork_msize, queue ); magma_cset_pointer( dwork_array, dwork, n, 0, 0, dwork_msize, batchCount, queue ); magma_cset_pointer( dinvA_array, dinvA, CTRTRI_BATCHED_NB, 0, 0, invA_msize, batchCount, queue ); if ( uplo == MagmaUpper) { if (nrhs > 1) { // A = U^T U // solve U^{T}X = B ==> dworkX = U^-T * B magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); // solve U X = dwork ==> X = U^-1 * dwork magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); } else { // A = U^T U // solve U^{T}X = B ==> dworkX = U^-T * B magmablas_ctrsv_outofplace_batched( MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA_array, ldda, // dA dB_array, 1, // dB dwork_array, // dX //output batchCount, queue, 0 ); // solve U X = dwork ==> X = U^-1 * dwork magmablas_ctrsv_outofplace_batched( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA_array, ldda, // dA dwork_array, 1, // dB dB_array, // dX //output batchCount, queue, 0 ); } } else { if (nrhs > 1) { // A = L L^T // solve LX= B ==> dwork = L^{-1} B magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); // solve L^{T}X= dwork ==> X = L^{-T} dwork magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); } else { // A = L L^T // solve LX= B ==> dwork = L^{-1} B magmablas_ctrsv_outofplace_batched( MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA_array, ldda, // dA dB_array, 1, // dB dwork_array, // dX //output batchCount, queue, 0 ); // solve L^{T}X= dwork ==> X = L^{-T} dwork magmablas_ctrsv_outofplace_batched( MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA_array, ldda, // dA dwork_array, 1, // dB dB_array, // dX //output batchCount, queue, 0 ); } } magma_queue_sync(queue); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); return info; }