void magmaf_dgetri_gpu( magma_int_t *n, devptr_t *dA, magma_int_t *ldda, magma_int_t *ipiv, devptr_t *dwork, magma_int_t *lwork, magma_int_t *info ) { magma_dgetri_gpu( *n, magma_ddevptr(dA), *ldda, ipiv, magma_ddevptr(dwork), *lwork, info ); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R, *work; magmaDouble_ptr d_A, dwork; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_dlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_dgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_dgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetri */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_Ainv, *h_R, *work; magmaDouble_ptr d_A, dwork; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_Ainv, double, n2 ); TESTING_MALLOC_CPU( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_dgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); if (info != 0) { printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // check for exact singularity //h_Ainv[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_Ainv, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_Ainv, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond; normA = lapackf77_dlange( "1", &N, &N, h_A, &lda, rwork ); normAinv = lapackf77_dlange( "1", &N, &N, h_Ainv, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; error = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_dlaset( "full", &N, &N, &c_zero, &c_one, h_R, &lda ); blasf77_dgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A, &lda, h_Ainv, &lda, &c_one, h_R, &lda ); error = lapackf77_dlange( "1", &N, &N, h_R, &lda, rwork ); error = error * rcond / N; } bool okay = (error < tol); status += ! okay; printf( " %8.2e %s\n", error, (okay ? "ok" : "failed")); } else { printf( "\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
SEXP magma_dgeMatrix_solve(SEXP a) { #ifdef HIPLAR_WITH_MAGMA /* compute the 1-norm of the matrix, which is needed later for the computation of the reciprocal condition number. */ double aNorm = magma_get_norm(a, "1"); /* the LU decomposition : */ /* Given that we may be performing this operation * on the GPU we may put in an optimisation here * where if we call the LU solver we, we do not require * the decomposition to be transferred back to CPU. This is TODO */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))), lu = magma_dgeMatrix_LU_(a, TRUE); int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)), *pivot = INTEGER(GET_SLOT(lu, Matrix_permSym)); /* prepare variables for the dgetri calls */ double *x, tmp; int info, lwork = -1; if (dims[0] != dims[1]) error(_("Solve requires a square matrix")); slot_dup(val, lu, Matrix_xSym); x = REAL(GET_SLOT(val, Matrix_xSym)); slot_dup(val, lu, Matrix_DimSym); int N2 = dims[0] * dims[0]; if(dims[0]) /* the dimension is not zero */ { /* is the matrix is *computationally* singular ? */ double rcond; F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond, (double *) R_alloc(4*dims[0], sizeof(double)), (int *) R_alloc(dims[0], sizeof(int)), &info); if (info) error(_("error [%d] from Lapack 'dgecon()'"), info); if(rcond < DOUBLE_EPS) error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"), rcond); /* only now try the inversion and check if the matrix is *exactly* singular: */ // This is also a work space query. This is not an option in magma F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info); lwork = (int) tmp; if( GPUFlag == 0){ F77_CALL(dgetri)(dims, x, dims, pivot, (double *) R_alloc((size_t) lwork, sizeof(double)), &lwork, &info); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using dgetri;"); #endif } else if(GPUFlag == 1) { double *d_x, *dwork; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("Solve using LU using magma_dgetri;"); #endif cublasAlloc(N2, sizeof(double), (void**)&d_x); //cublasAlloc(N2 , sizeof(double), (void**)&dtmp); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ cublasSetVector( N2, sizeof(double), x, 1, d_x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ lwork = dims[0] * magma_get_dgetri_nb( dims[0] ); cublasAlloc(lwork, sizeof(double), (void**)&dwork); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info); cublasGetVector(N2, sizeof(double), d_x, 1, x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data From to Device")); /********************************************/ cublasFree(dwork); cublasFree(d_x); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error freeing memory")); /********************************************/ } else error(_("GPUFlag not set correctly")); if (info) error(_("Lapack routine dgetri: system is exactly singular")); } UNPROTECT(1); return val; #endif return R_NilValue; }