double *cholesky(double *ml, int m) { magma_int_t info; magma_dpotrf('L',m,ml,m,&info); magma_dpotri('L',m,ml,m,&info); return ml; }
int main() { //Magma initialization magma_init(); //Declaration of local variables double *a, *b, *dev_a, results=0; const int N=4098; int i,j; magma_int_t info=0, lda=N, ngpu=2; //Memory Allocation Segment magma_malloc_pinned((void**) &a,(N*N)*sizeof(double)); magma_malloc_pinned((void**) &b,(N*N)*sizeof(double)); //Generate two copies of Symmetric Positive Definite Matrix for(i=0;i<N;i++) { for(j=0;j<i;j++) { a[i*N+j] = 1e-9* (double)rand(); a[j*N+i] = a[i*N+j]; b[i*N+j] = a[i*N+j]; b[j*N+i] = b[i*N+j]; } a[i*N+i] = 1e-9*(double)rand() + 1000.0; b[i*N+i] = a[i*N+i]; } //Call custom Magma Cholesky for obtaining results rr_dpotrf_m(ngpu,MagmaUpper,N,a,N,&info); //Call Standard Magma Cholesky for result validation magma_dpotrf(MagmaUpper,N,b,N,&info); if(info != 0) { printf("magma_dpotrf original returned error %d: %s. \n",(int) info, magma_strerror(info)); } //Validate the results; Compute the RMS error value. for(i=0;i<N;i++) for(j=0;j<N;j++) results = results + (a[i*N+j] - b[i*N+j]) * (a[i*N+j] - b[i*N+j]); //Display the results of the test if(results < 1e-5) printf("The two functions have identical results\n"); else printf("The custom function had significant errors. The RMS value was %g\n",results); magma_free_pinned(a); magma_free_pinned(b); magma_finalize(); return 0; }
void magmaf_dpotrf( magma_uplo_t *uplo, magma_int_t *n, double *A, magma_int_t *lda, magma_int_t *info ) { magma_dpotrf( *uplo, *n, A, *lda, info ); }
SEXP magChol(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int *DIMA = INTEGER(GET_DIM(a)), N = DIMA[0], N2 = N * N, LDB = N, info; double *B; if(DIMA[1] != N) error("non-square matrix"); b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a)); SET_SLOT(b, install("gpu"), duplicate(gpu)); B = REAL(b); if(LOGICAL_VALUE(gpu)) { double *dB; magma_malloc((void**)&dB, N2*sizeof(double)); magma_dsetmatrix(N, N, B, LDB, dB, LDB); magma_dpotrf_gpu(magma_uplo_const('U'), N, dB, LDB, &info); magma_dgetmatrix(N, N, dB, LDB, B, LDB); magma_free(dB); } else { double *hB; magma_malloc_pinned((void**)&hB, N2*sizeof(double)); lapackf77_dlacpy(MagmaUpperStr, &N, &N, B, &LDB, hB, &LDB); magma_dpotrf(magma_uplo_const('U'), N, hB, N, &info); lapackf77_dlacpy(MagmaUpperStr, &N, &N, hB, &LDB, B, &LDB); magma_free_pinned(hB); } if(info < 0) error("illegal argument %d in 'magChol", -1 * info); else if(info > 0) error("leading minor of order %d is not positive definite", info); int i, j; for(j = 0; j < N; j++) { for(i = j + 1; i < N; i++) { B[i + j * N] = 0.0; } } UNPROTECT(1); return b; }
extern "C" magma_int_t magma_dposv ( char uplo, magma_int_t n, magma_int_t nrhs, double *A, magma_int_t lda, double *B, magma_int_t ldb, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DPOSV computes the solution to a real system of linear equations A * X = B, where A is an N-by-N symmetric positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**T * U, if UPLO = 'U', or A = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) DOUBLE_PRECISION array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magma_int_t num_gpus, ldda, lddb; *info = 0 ; if( (uplo != 'U') && (uplo != 'u') && (uplo != 'L') && (uplo != 'l') ) *info = -1; if( n < 0 ) *info = -2; if( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n==0) || (nrhs == 0) ) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ num_gpus = magma_num_gpus(); double *dA, *dB; if ( num_gpus > 1 ) { goto CPU_INTERFACE; } ldda = ((n+31)/32)*32; lddb = ldda; if ( MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_dmalloc( &dB, lddb*nrhs )) { magma_free( dA ); dA = NULL; goto CPU_INTERFACE; } assert( num_gpus == 1 && dA != NULL && dB != NULL ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_dpotrf_gpu( uplo, n, dA, ldda, info ); magma_dgetmatrix( n, n, dA, ldda, A, lda ); if ( *info == 0 ) { magma_dsetmatrix( n, nrhs, B, ldb, dB, lddb ); magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); magma_dgetmatrix( n, nrhs, dB, lddb, B, ldb ); } magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_dpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_dpotrs( &uplo, &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }
SEXP magma_dpoMatrix_chol(SEXP x) { #ifdef HIPLAR_WITH_MAGMA SEXP val = get_factors(x, "Cholesky"), dimP = GET_SLOT(x, Matrix_DimSym), uploP = GET_SLOT(x, Matrix_uploSym); const char *uplo = CHAR(STRING_ELT(uploP, 0)); int *dims = INTEGER(dimP), info; int n = dims[0]; double *vx; cublasStatus retStatus; if (val != R_NilValue) return val; dims = INTEGER(dimP); val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky"))); SET_SLOT(val, Matrix_uploSym, duplicate(uploP)); SET_SLOT(val, Matrix_diagSym, mkString("N")); SET_SLOT(val, Matrix_DimSym, duplicate(dimP)); vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)); AZERO(vx, n * n); //we could put in magmablas_dlacpy but it only //copies all of the matrix F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n); if (n > 0) { if(GPUFlag == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using dpotrf;"); #endif F77_CALL(dpotrf)(uplo, &n, vx, &n, &info); } else if(GPUFlag == 1 && Interface == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;"); #endif int nrows, ncols; nrows = ncols = n; magma_int_t lda; lda = nrows; magma_dpotrf(uplo[0], ncols, vx, lda, &info); /* Error Checking */ retStatus = cudaGetLastError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf")); /********************************************/ } else if(GPUFlag == 1 && Interface == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;"); #endif double *d_c; int nrows, ncols; nrows = ncols = n; int N2 = nrows * ncols; magma_int_t lda; lda = nrows; cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer to Device")); /********************************************/ magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf_gpu")); /********************************************/ cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer from Device")); /********************************************/ cublasFree(d_c); } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); } if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return set_factors(x, val, "Cholesky"); #endif return R_NilValue; }
int main( int argc, char **argv ) { printf("Starting\n"); int size; cudaError_t cudaStat; magma_err_t magmaStat; cublasStatus_t stat; cublasHandle_t handle; int it,i; cublasOperation_t N = 'N'; cublasOperation_t T = 'T'; char N2 = 'N'; char T2 = 'T'; double one = 1., zero=0.; char uplo = 'L'; int info; int err; double* A; double* B; magmaStat = magma_init(); int use_pinned; if(argc > 1) { use_pinned = atoi(argv[1]); } else use_pinned = 0; printf("Setting use_pinned to %d\n", use_pinned); for( size = 256; size <= 8192; size*=2 ) { if(use_pinned) { // allocate pinned memory on CPU err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); } else { // allocate standard memory on CPU A = (double*) malloc( sizeof(double)*size*size ); B = (double*) malloc( sizeof(double)*size*size ); } cudaDeviceSynchronize(); double tInit = read_timer(); double *dA,*dB; // allocate memory on GPU magma_malloc( (void**) &dA, sizeof(double)*size*size ); magma_malloc( (void**) &dB, sizeof(double)*size*size ); cudaDeviceSynchronize(); double tAlloc = read_timer(); fillMatrix(B, size*size); cudaDeviceSynchronize(); double tInit2 = read_timer(); // transfer data to GPU magma_dsetmatrix( size, size, B, size, dB, size ); cudaDeviceSynchronize(); double tTransferToGPU = read_timer(); // matrix multiply magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size ); // magma_dgemm is apparently synonymous with magmablas_dgemm cudaDeviceSynchronize(); double tMatMult = read_timer(); // Cholesky decomposition on GPU with GPU interface (called with object on GPU) magma_dpotrf_gpu( 'L', size, dA, size, &info ); cudaDeviceSynchronize(); double tChol = read_timer(); // transfer data back to CPU magma_dgetmatrix( size, size, dA, size, A, size ); cudaDeviceSynchronize(); double tTransferFromGPU = read_timer(); // standard BLAS matrix multiply on CPU dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tMatMultBlas = read_timer(); // Cholesky decomposition on GPU with CPU interface (called with object on CPU) magma_dpotrf( 'L', size, A, size, &info ); cudaDeviceSynchronize(); double tCholCpuInterface = read_timer(); // recreate A = B * B (could just do a save and copy instead....) dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tInit3 = read_timer(); // standard Lapack Cholesky decomposition on CPU dpotrf_(&uplo, &size, A, &size, &info); cudaDeviceSynchronize(); double tCholCpu= read_timer(); printf("====================================================\n"); printf("Timing results for n = %d\n", size); printf("GPU memory allocation time: %f\n", tAlloc - tInit); printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); if(use_pinned) { magma_free_pinned(A); magma_free_pinned(B); } else { free(A); free(B); } magma_free(dA); magma_free(dB); } return EXIT_SUCCESS; }
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri) { magma_init(); // magma_print_devices(); double *h_R; int In, INB, ID; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); ID = INTEGER_VALUE(id); double *PA = NUMERIC_POINTER(A); int i,j; magma_int_t N, n2, lda, status, info, max_size; N=In; lda = N; n2 = lda*N; /* for(i = 0; i < In; i++) { for(j = 0; j < In; j++) { printf("%.8f ", PA[i+j*In]); } printf("\n"); } */ if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) { fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); magma_finalize(); exit(-1); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda ); N = In; status = 0; magma_setdevice(ID); //printf("Modified by Vinay in one GPU\n"); //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri) magma_dpotrf(MagmaLower, N, h_R, N, &info); else magma_dpotrf(MagmaUpper, N, h_R, N, &info); if(info != 0) { printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda ); //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} magma_free_pinned(h_R); magma_finalize(); cublasShutdown(); /* int IZeroTri; IZeroTri = INTEGER_VALUE(zeroTri); if(IZeroTri & lTri) { for(i = 1; i < In; i++) { for(j=0; j< i; j++) { PA[i*In+j] = 0.0; } } } else if(IZeroTri) for(i = 0; i < In; i++) { for(j=i+1; j < In; j++) { PA[i*In+j] = 0.0; } }*/ return(R_NilValue); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; magma_int_t N, n2, lda, info; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_DPOTRF( N ) / 1e9; TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DPOSV computes the solution to a real system of linear equations A * X = B, where A is an N-by-N symmetric positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dposv_driver ********************************************************************/ extern "C" magma_int_t magma_dposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, double *A, magma_int_t lda, double *B, magma_int_t ldb, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #define dB(i_, j_) dB, ((i_) + (j_)*lddb) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dB(i_, j_) (dB + (i_) + (j_)*lddb) #endif magma_int_t ngpu, ldda, lddb; magma_queue_t queue = NULL; magma_device_t cdev; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ ngpu = magma_num_gpus(); magmaDouble_ptr dA, dB; if ( ngpu > 1 ) { goto CPU_INTERFACE; } ldda = magma_roundup( n, 32 ); lddb = ldda; if ( MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_dmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_dsetmatrix( n, n, A, lda, dA(0,0), ldda, queue ); magma_dpotrf_gpu( uplo, n, dA(0,0), ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_dgetmatrix( n, n, dA(0,0), ldda, A, lda, queue ); if ( *info == 0 ) { magma_dsetmatrix( n, nrhs, B, ldb, dB(0,0), lddb, queue ); magma_dpotrs_gpu( uplo, n, nrhs, dA(0,0), ldda, dB(0,0), lddb, info ); magma_dgetmatrix( n, nrhs, dB(0,0), lddb, B, ldb, queue ); } magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_dpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_dpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }