// Example driver: factor the same random symmetric matrix with LAPACK
// spotrf (CPU) and magma_spotrf (GPU) and print the largest element-wise
// difference between the two results.
//
// Usage: <program> n      where n > 0 is the matrix order.
//
// Returns whatever usage() returns on bad arguments, 1 when the matrix is
// too large or cannot be allocated, and 0 otherwise.
int main(int argc, char* argv[])
{
	if (argc != 2) return usage(argv[0]);

	int n = atoi(argv[1]);
	if (n <= 0) return usage(argv[0]);

	// n * n is handed to slarnv_ as an int, so it must not overflow.
	// 46340 == floor(sqrt(2^31 - 1)); this assumes a 32-bit int.
	const int max_n = 46340;
	if (n > max_n)
	{
		fprintf(stderr, "Matrix order %d is too large (max %d)\n", n, max_n);
		return 1;
	}
	int n2 = n * n;

	// Guard the byte count as well (matters when size_t is narrow).
	if ((size_t)n2 > (size_t)-1 / sizeof(float))
	{
		fprintf(stderr, "Matrix order %d is too large\n", n);
		return 1;
	}
	size_t size = sizeof(float) * (size_t)n2;

	// Two copies are needed (one for CPU, one for GPU), because
	// spotrf rewrites the input matrix.
	// The casts are kept so the file also builds as C++.
	float* A1 = (float*)malloc(size);
	float* A2 = (float*)malloc(size);
	if (!A1 || !A2)
	{
		fprintf(stderr, "Cannot allocate two %d x %d float matrices\n", n, n);
		free(A1);
		free(A2);
		return 1;
	}

	// Generate random matrix.
	int one = 1, seed[4] = { 0, 0, 0, 1 };
	slarnv_(&one, seed, &n2, A1);

	// Symmetrize and increase the diagonal.
	for (int i = 0; i < n; i++)
	{
		A1[i * n + i] += n;
		for (int j = 0; j < i; j++)
			A1[i * n + j] = A1[j * n + i];
	}

	// Clone generated matrix for GPU version.
	memcpy(A2, A1, size);

	// Use upper part of input matrix and
	// rewrite it with Cholesky factor.
	char uplo = 'U';

	// The status info (routine must return 0 into info).
	int info = 0;

	// Perform decomposition on CPU.
	// chkerr is defined elsewhere; it receives the routine's info value.
	printf("Computing on CPU ... "); fflush(stdout);
	spotrf_(&uplo, &n, A1, &n, &info);
	chkerr(info);

	// Perform decomposition on GPU.
	printf("Computing on GPU ... "); fflush(stdout);
	magma_spotrf(uplo, n, A2, n, &info);
	chkerr(info);

	// Compare results. Positions (i, j) and (j, i) are both checked, and
	// j runs up to and including i so that the whole diagonal is covered
	// (not just element [0]).
	float maxdiff = 0.0f;
	for (int i = 0; i < n; i++)
		for (int j = 0; j <= i; j++)
		{
			maxdiff = fmaxf(maxdiff, fabsf(A1[i * n + j] - A2[i * n + j]));
			maxdiff = fmaxf(maxdiff, fabsf(A1[j * n + i] - A2[j * n + i]));
		}
	printf("Done! max diff = %f\n", maxdiff);

	free(A1);
	free(A2);
	return 0;
}
/* By-reference front end for magma_spotrf: every scalar argument arrives as a
 * pointer and is dereferenced before the call; A and info are passed through.
 * NOTE(review): queue is forwarded as the pointer itself, not *queue -- confirm
 * this matches the magma_spotrf prototype in use. */
void magmaf_spotrf(
    magma_uplo_t *uplo, magma_int_t *n,
    float *A, magma_int_t *lda,
    magma_int_t *info,
    magma_queue_t *queue )
{
    const magma_uplo_t triangle    = *uplo;
    const magma_int_t  order       = *n;
    const magma_int_t  leading_dim = *lda;

    magma_spotrf( triangle, order, A, leading_dim, info, queue );
}
extern "C" magma_int_t
magma_sposv( char uplo, magma_int_t n, magma_int_t nrhs,
             float *A, magma_int_t lda,
             float *B, magma_int_t ldb, magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SPOSV computes the solution to a real system of linear equations
       A * X = B,
    where A is an N-by-N symmetric positive definite matrix and X and B
    are N-by-NRHS matrices.
    The Cholesky decomposition is used to factor A as
       A = U**T * U,  if UPLO = 'U', or
       A = L * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is a lower triangular
    matrix.  The factored form of A is then used to solve the system of
    equations A * X = B.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NRHS    (input) INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**T*U or A = L*L**T.
            Note: the quick return for N = 0 or NRHS = 0 happens before
            the factorization, so A is left untouched in that case.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) REAL array, dimension (LDB,NRHS)
            On entry, the right hand side matrix B.
            On exit, the solution matrix X.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
                  When several arguments are illegal, the first one
                  (lowest i) is reported, as in LAPACK.
            Any other value is passed through unchanged from the
            factorization or solve routine that set it.

    The same value is also returned as the function result.
    =====================================================================   */

    magma_int_t num_gpus, ldda, lddb;

    /* Validate arguments; the else-if chain stops at the first bad one. */
    *info = 0;
    if ( (uplo != 'U') && (uplo != 'u') && (uplo != 'L') && (uplo != 'l') )
        *info = -1;
    else if ( n < 0 )
        *info = -2;
    else if ( nrhs < 0 )
        *info = -3;
    else if ( lda < max(1, n) )
        *info = -5;
    else if ( ldb < max(1, n) )
        *info = -7;

    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( (n == 0) || (nrhs == 0) ) {
        return *info;
    }

    /* If single-GPU and allocation succeeds, use GPU interface. */
    num_gpus = magma_num_gpus();
    float *dA, *dB;
    if ( num_gpus > 1 ) {
        goto CPU_INTERFACE;
    }

    /* Device leading dimension: n rounded up to a multiple of 32. */
    ldda = ((n+31)/32)*32;
    lddb = ldda;
    if ( MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n )) {
        goto CPU_INTERFACE;
    }
    if ( MAGMA_SUCCESS != magma_smalloc( &dB, lddb*nrhs )) {
        magma_free( dA );
        goto CPU_INTERFACE;
    }

    magma_ssetmatrix( n, n, A, lda, dA, ldda );
    magma_spotrf_gpu( uplo, n, dA, ldda, info );
    if ( *info == MAGMA_ERR_DEVICE_ALLOC ) {
        /* Host A has not been modified yet, so the CPU path can start over. */
        magma_free( dA );
        magma_free( dB );
        goto CPU_INTERFACE;
    }

    /* The factor is copied back to the host whatever info says. */
    magma_sgetmatrix( n, n, dA, ldda, A, lda );
    if ( *info == 0 ) {
        magma_ssetmatrix( n, nrhs, B, ldb, dB, lddb );
        magma_spotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info );
        magma_sgetmatrix( n, nrhs, dB, lddb, B, ldb );
    }
    magma_free( dA );
    magma_free( dB );
    return *info;

CPU_INTERFACE:
    /* If multi-GPU or allocation failed, use CPU interface and LAPACK.
     * Faster to use LAPACK for potrs than to copy A to GPU. */
    magma_spotrf( uplo, n, A, lda, info );
    if ( *info == 0 ) {
        lapackf77_spotrs( &uplo, &n, &nrhs, A, &lda, B, &ldb, info );
    }

    return *info;
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing spotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_R; magma_int_t N, n2, lda, info; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_SPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); magma_smake_hpd( N, h_A, lda ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_spotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_spotrf 
returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygst */ int main( int argc, char** argv) { TESTING_INIT(); // Constants const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; // Local variables real_Double_t gpu_time, cpu_time; float *h_A, *h_B, *h_R; magmaFloat_ptr d_A, d_B; float Anorm, error, work[1]; magma_int_t N, n2, lda, ldda, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% itype N CPU time (sec) GPU time (sec) |R| \n"); printf("%%=======================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( lda, opts.align ); n2 = N*lda; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_B, float, lda*N ); TESTING_MALLOC_PIN( h_R, float, lda*N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_symmetric( N, h_A, lda ); magma_smake_hpd( N, h_B, lda ); magma_spotrf( opts.uplo, N, h_B, lda, &info ); if (info != 0) { printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_ssetmatrix( N, N, h_B, lda, d_B, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); 
magma_ssygst_gpu( opts.itype, opts.uplo, N, d_A, ldda, d_B, ldda, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_ssygst_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_ssygst( &opts.itype, lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_ssygst returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_sgetmatrix( N, N, d_A, ldda, h_R, lda, opts.queue ); blasf77_saxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); Anorm = safe_lapackf77_slansy("f", lapack_uplo_const(opts.uplo), &N, h_A, &lda, work ); error = safe_lapackf77_slansy("f", lapack_uplo_const(opts.uplo), &N, h_R, &lda, work ) / Anorm; bool okay = (error < tol); status += ! okay; printf("%3d %5d %7.2f %7.2f %8.2e %s\n", (int) opts.itype, (int) N, cpu_time, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%3d %5d --- %7.2f\n", (int) opts.itype, (int) N, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/**
    Purpose
    -------
    SPOSV computes the solution to a real system of linear equations
       A * X = B,
    where A is an N-by-N symmetric positive definite matrix and X and B
    are N-by-NRHS matrices.
    The Cholesky decomposition is used to factor A as
       A = U**H * U,  if UPLO = MagmaUpper, or
       A = L * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is a lower triangular
    matrix.  The factored form of A is then used to solve the system of
    equations A * X = B.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**H*U or A = L*L**H.
            Note: the quick return for N = 0 or NRHS = 0 happens before
            the factorization, so A is left untouched in that case.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    B       REAL array, dimension (LDB,NRHS)
            On entry, the right hand side matrix B.
            On exit, the solution matrix X.

    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
                  When several arguments are illegal, the first one
                  (lowest i) is reported, as in LAPACK.
      -     Any other value is passed through unchanged from the
            factorization or solve routine that set it.

    @return the same value that is stored in info.

    @ingroup magma_sposv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_sposv(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs,
    float *A, magma_int_t lda,
    float *B, magma_int_t ldb,
    magma_int_t *info )
{
    magma_int_t ngpu, ldda, lddb;

    /* Validate arguments; the else-if chain stops at the first bad one. */
    *info = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower )
        *info = -1;
    else if ( n < 0 )
        *info = -2;
    else if ( nrhs < 0 )
        *info = -3;
    else if ( lda < max(1, n) )
        *info = -5;
    else if ( ldb < max(1, n) )
        *info = -7;

    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( (n == 0) || (nrhs == 0) ) {
        return *info;
    }

    /* If single-GPU and allocation succeeds, use GPU interface. */
    ngpu = magma_num_gpus();
    float *dA, *dB;
    if ( ngpu > 1 ) {
        goto CPU_INTERFACE;
    }

    /* Device leading dimension: n rounded up to a multiple of 32. */
    ldda = ((n+31)/32)*32;
    lddb = ldda;
    if ( MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n )) {
        goto CPU_INTERFACE;
    }
    if ( MAGMA_SUCCESS != magma_smalloc( &dB, lddb*nrhs )) {
        magma_free( dA );
        goto CPU_INTERFACE;
    }

    magma_ssetmatrix( n, n, A, lda, dA, ldda );
    magma_spotrf_gpu( uplo, n, dA, ldda, info );
    if ( *info == MAGMA_ERR_DEVICE_ALLOC ) {
        /* Host A has not been modified yet, so the CPU path can start over. */
        magma_free( dA );
        magma_free( dB );
        goto CPU_INTERFACE;
    }

    /* The factor is copied back to the host whatever info says. */
    magma_sgetmatrix( n, n, dA, ldda, A, lda );
    if ( *info == 0 ) {
        magma_ssetmatrix( n, nrhs, B, ldb, dB, lddb );
        magma_spotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info );
        magma_sgetmatrix( n, nrhs, dB, lddb, B, ldb );
    }
    magma_free( dA );
    magma_free( dB );
    return *info;

CPU_INTERFACE:
    /* If multi-GPU or allocation failed, use CPU interface and LAPACK.
     * Faster to use LAPACK for potrs than to copy A to GPU. */
    magma_spotrf( uplo, n, A, lda, info );
    if ( *info == 0 ) {
        lapackf77_spotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info );
    }

    return *info;
}