예제 #1
0
double *cholesky(double *ml, int m) {
 	magma_int_t info;
 
 	magma_dpotrf('L',m,ml,m,&info);
 	magma_dpotri('L',m,ml,m,&info);
 
 	return ml;
}
예제 #2
0
int main()
{
	//Magma initialization
	magma_init();
	//Declaration of local variables
	double *a, *b, *dev_a, results=0;
	const int N=4098;
	int i,j;
	magma_int_t info=0, lda=N, ngpu=2;


	//Memory Allocation Segment
	magma_malloc_pinned((void**) &a,(N*N)*sizeof(double));
	magma_malloc_pinned((void**) &b,(N*N)*sizeof(double));

	
	//Generate two copies of Symmetric Positive Definite Matrix
	for(i=0;i<N;i++)
	{
		for(j=0;j<i;j++)
		{
			a[i*N+j] = 1e-9* (double)rand();
			a[j*N+i] = a[i*N+j];
			b[i*N+j] = a[i*N+j];
			b[j*N+i] = b[i*N+j];
		}
		a[i*N+i] = 1e-9*(double)rand() +  1000.0;
		b[i*N+i] = a[i*N+i];
	}

	//Call custom Magma Cholesky for obtaining results
	rr_dpotrf_m(ngpu,MagmaUpper,N,a,N,&info);

	//Call Standard Magma Cholesky for result validation
	magma_dpotrf(MagmaUpper,N,b,N,&info);
	if(info != 0)
	{
		printf("magma_dpotrf original returned error %d: %s. \n",(int) info, magma_strerror(info));
	}

	//Validate the results; Compute the RMS error value.
	for(i=0;i<N;i++)
		for(j=0;j<N;j++)
			results = results + (a[i*N+j] - b[i*N+j]) * (a[i*N+j] - b[i*N+j]);
	
	//Display the results of the test
	if(results < 1e-5)
		printf("The two functions have identical results\n");
	else
		printf("The custom function had significant errors. The RMS value was %g\n",results);
	magma_free_pinned(a);
	magma_free_pinned(b);
	magma_finalize();
	return 0;

}
예제 #3
0
void magmaf_dpotrf(
    magma_uplo_t *uplo, magma_int_t *n,
    double *A, magma_int_t *lda,
    magma_int_t *info )
{
    magma_dpotrf(
        *uplo, *n,
        A, *lda,
        info );
}
예제 #4
0
파일: magFactors.c 프로젝트: sgsokol/magma
SEXP magChol(SEXP a)
{
   SEXP gpu = GET_SLOT(a, install("gpu")),
        b = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
   int *DIMA = INTEGER(GET_DIM(a)), N = DIMA[0], N2 = N * N, LDB = N, info;
   double *B;

   if(DIMA[1] != N) error("non-square matrix");

   b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a));
   SET_SLOT(b, install("gpu"), duplicate(gpu));
   B = REAL(b);
   
   if(LOGICAL_VALUE(gpu)) {
      double *dB;

      magma_malloc((void**)&dB, N2*sizeof(double));

      magma_dsetmatrix(N, N, B, LDB, dB, LDB);
      magma_dpotrf_gpu(magma_uplo_const('U'), N, dB, LDB, &info);
      magma_dgetmatrix(N, N, dB, LDB, B, LDB);

      magma_free(dB);
   } else {
      double *hB;

      magma_malloc_pinned((void**)&hB, N2*sizeof(double));
      lapackf77_dlacpy(MagmaUpperStr, &N, &N, B, &LDB, hB, &LDB);
      magma_dpotrf(magma_uplo_const('U'), N, hB, N, &info);
      lapackf77_dlacpy(MagmaUpperStr, &N, &N, hB, &LDB, B, &LDB);

      magma_free_pinned(hB);
   }

   if(info < 0) error("illegal argument %d in 'magChol", -1 * info);
   else if(info > 0) error("leading minor of order %d is not positive definite", info);

   int i, j;
   for(j = 0; j < N; j++) {
      for(i = j + 1; i < N; i++) {
         B[i + j * N] = 0.0;
      }
   }

   UNPROTECT(1);

   return b;
}
예제 #5
0
extern "C" magma_int_t
magma_dposv    ( char uplo, magma_int_t n, magma_int_t nrhs, 
                 double *A, magma_int_t lda, 
                 double *B, magma_int_t ldb, magma_int_t *info )
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012
 
    Purpose
    =======

    DPOSV computes the solution to a real system of linear equations
       A * X = B,
    where A is an N-by-N symmetric positive definite matrix and X and B
    are N-by-NRHS matrices.
    The Cholesky decomposition is used to factor A as
       A = U**T * U,  if UPLO = 'U', or
       A = L * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and  L is a lower triangular
    matrix.  The factored form of A is then used to solve the system of
    equations A * X = B.

    Arguments
    =========
 
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NRHS    (input) INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U**T*U or A = L*L**T.   

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) DOUBLE_PRECISION array, dimension (LDB,NRHS)
            On entry, the right hand side matrix B.
            On exit, the solution matrix X.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */

    magma_int_t num_gpus, ldda, lddb;
    
    *info = 0 ; 
    if( (uplo != 'U') && (uplo != 'u') && (uplo != 'L') && (uplo != 'l') )
        *info = -1; 
    if( n < 0 )
        *info = -2; 
    if( nrhs < 0) 
        *info = -3; 
    if ( lda < max(1, n) )
        *info = -5; 
    if ( ldb < max(1, n) )
        *info = -7;
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( (n==0) || (nrhs == 0) ) {
        return *info;
    }

    /* If single-GPU and allocation suceeds, use GPU interface. */
    num_gpus = magma_num_gpus();
    double *dA, *dB;
    if ( num_gpus > 1 ) {
        goto CPU_INTERFACE;
    }
    ldda = ((n+31)/32)*32;
    lddb = ldda;
    if ( MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n )) {
        goto CPU_INTERFACE;
    }
    if ( MAGMA_SUCCESS != magma_dmalloc( &dB, lddb*nrhs )) {
        magma_free( dA );
        dA = NULL;
        goto CPU_INTERFACE;
    }
    assert( num_gpus == 1 && dA != NULL && dB != NULL );
    magma_dsetmatrix( n, n, A, lda, dA, ldda );
    magma_dpotrf_gpu( uplo, n, dA, ldda, info );
    magma_dgetmatrix( n, n, dA, ldda, A, lda );
    if ( *info == 0 ) {
        magma_dsetmatrix( n, nrhs, B, ldb, dB, lddb );
        magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info );
        magma_dgetmatrix( n, nrhs, dB, lddb, B, ldb );
    }
    magma_free( dA );
    magma_free( dB );
    return *info;

CPU_INTERFACE:
    /* If multi-GPU or allocation failed, use CPU interface and LAPACK.
     * Faster to use LAPACK for potrs than to copy A to GPU. */
    magma_dpotrf( uplo, n, A, lda, info );
    if ( *info == 0 ) {
        lapackf77_dpotrs( &uplo, &n, &nrhs, A, &lda, B, &ldb, info );
    }
    
    return *info;
}
예제 #6
0
SEXP magma_dpoMatrix_chol(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "Cholesky"),
			 dimP = GET_SLOT(x, Matrix_DimSym),
			 uploP = GET_SLOT(x, Matrix_uploSym);

	const char *uplo = CHAR(STRING_ELT(uploP, 0));
	int *dims = INTEGER(dimP), info;
	int n = dims[0];
	double *vx;
	cublasStatus retStatus;
	if (val != R_NilValue) return val;
	dims = INTEGER(dimP);
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky")));
	SET_SLOT(val, Matrix_uploSym, duplicate(uploP));
	SET_SLOT(val, Matrix_diagSym, mkString("N"));
	SET_SLOT(val, Matrix_DimSym, duplicate(dimP));
	vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n));
	AZERO(vx, n * n);
	
	//we could put in magmablas_dlacpy but it only
	//copies all of the matrix 
	F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n);
	if (n > 0) {

		if(GPUFlag == 0){
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Cholesky decomposition using dpotrf;");
#endif
			F77_CALL(dpotrf)(uplo, &n, vx, &n, &info);
		}
		else if(GPUFlag == 1 && Interface == 0){
		
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;");
#endif			
			int nrows, ncols;
			nrows = ncols = n;

			magma_int_t lda;
			lda = nrows;

			magma_dpotrf(uplo[0], ncols, vx, lda, &info);

			/* Error Checking */
			retStatus = cudaGetLastError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf"));
			/********************************************/
			

		}
		else if(GPUFlag == 1 && Interface == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;");	
#endif
			double *d_c;
			int nrows, ncols;
			nrows = ncols = n;
			int N2 = nrows * ncols;


			magma_int_t lda;
			lda = nrows;

			cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/


			magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf_gpu"));
			/********************************************/
			

			cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
			cublasFree(d_c);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		}
		
	if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

	UNPROTECT(1);
	return set_factors(x, val, "Cholesky");
#endif
	return R_NilValue;
}
예제 #7
0
int main( int argc, char **argv ) {
  printf("Starting\n");
  int size;
  cudaError_t cudaStat;
  magma_err_t magmaStat;
  cublasStatus_t stat;
  cublasHandle_t handle;
  int it,i;

  cublasOperation_t N = 'N';
  cublasOperation_t T = 'T';
  char N2 = 'N';
  char T2 = 'T';
  double one = 1., zero=0.;
  char uplo = 'L';
  int info;
  
  int err; double* A; double* B;
  magmaStat = magma_init();

  int use_pinned;
  if(argc > 1) {
    use_pinned = atoi(argv[1]);
  } else use_pinned = 0;
  printf("Setting use_pinned to %d\n", use_pinned);

  for( size = 256; size <= 8192; size*=2 ) {
 
     if(use_pinned) {
       // allocate pinned memory on CPU
       err = magma_dmalloc_pinned( &A,  size*size );  assert( err == 0 );
       err = magma_dmalloc_pinned( &B,  size*size );  assert( err == 0 );
     } else {
       // allocate standard memory on CPU
       A = (double*) malloc( sizeof(double)*size*size );
       B = (double*) malloc( sizeof(double)*size*size );
     }

    cudaDeviceSynchronize();
    double tInit = read_timer();     
    double *dA,*dB;
    // allocate memory on GPU
    magma_malloc( (void**) &dA, sizeof(double)*size*size );
    magma_malloc( (void**) &dB, sizeof(double)*size*size );
    
    cudaDeviceSynchronize();
    double tAlloc = read_timer();     
 
    fillMatrix(B, size*size);
 

    cudaDeviceSynchronize();
    double tInit2 = read_timer();

    // transfer data to GPU
    magma_dsetmatrix( size, size, B, size, dB, size );

    cudaDeviceSynchronize();
    double tTransferToGPU = read_timer();

    // matrix multiply
    magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size );
    // magma_dgemm is apparently synonymous with magmablas_dgemm

    cudaDeviceSynchronize();
    double tMatMult = read_timer();
 
    // Cholesky decomposition on GPU with GPU interface (called with object on GPU)
    magma_dpotrf_gpu( 'L', size, dA, size, &info );

    cudaDeviceSynchronize();
    double tChol = read_timer();

    // transfer data back to CPU
    magma_dgetmatrix( size, size, dA, size, A, size );
    cudaDeviceSynchronize();
    double tTransferFromGPU = read_timer();
 
    // standard BLAS matrix multiply on CPU
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tMatMultBlas = read_timer();

    // Cholesky decomposition on GPU with CPU interface (called with object on CPU)
    magma_dpotrf( 'L', size, A, size, &info );

    cudaDeviceSynchronize();
    double tCholCpuInterface = read_timer();

    // recreate A = B * B (could just do a save and copy instead....)
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tInit3 = read_timer();

    // standard Lapack Cholesky decomposition on CPU
    dpotrf_(&uplo, &size, A, &size, &info);
  
    cudaDeviceSynchronize();
    double tCholCpu= read_timer();
 

    printf("====================================================\n");
    printf("Timing results for n = %d\n", size);
    printf("GPU memory allocation time: %f\n", tAlloc - tInit);
    printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2);
    printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU);
    printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU);
    printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult);
    printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas);
    printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3);
    printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol);

    if(use_pinned) {
      magma_free_pinned(A);
      magma_free_pinned(B);
    } else {
      free(A);
      free(B);
    }
    magma_free(dA);
    magma_free(dB);
 
  }
  return EXIT_SUCCESS;
}
예제 #8
0
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri)
{
	magma_init();
//	magma_print_devices();
	
	double *h_R;
	int In, INB, ID;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	ID = INTEGER_VALUE(id);
	double *PA = NUMERIC_POINTER(A);
	int i,j;

	magma_int_t N, n2, lda, status, info, max_size;
	N=In;
   	lda = N;
   	n2 = lda*N;
  	

/*	for(i = 0; i < In; i++)
	{
		for(j = 0; j < In; j++)
		{
			printf("%.8f ", PA[i+j*In]);
		}
		printf("\n");
	}	*/

	if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) {      
        fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); 
        magma_finalize();                                                  
        exit(-1);   
     	}
        

        lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda );
	
	N = In;
	status = 0;
	magma_setdevice(ID);
	//printf("Modified by Vinay in one GPU\n");
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri)
		magma_dpotrf(MagmaLower, N, h_R, N, &info);
	else
		magma_dpotrf(MagmaUpper, N, h_R, N, &info);
	if(info != 0)
	{
		printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
		
	lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda );
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	magma_free_pinned(h_R);
	magma_finalize();
	cublasShutdown();
	/*
	int IZeroTri;
        IZeroTri = INTEGER_VALUE(zeroTri);
	if(IZeroTri & lTri) {
		for(i = 1; i < In; i++)
        	{
       			for(j=0; j< i; j++)
                	{
                       		PA[i*In+j] = 0.0;
                	}
        	}
	}
	else if(IZeroTri)
		for(i = 0; i < In; i++)
                {
                        for(j=i+1; j < In; j++)
                        {
                                PA[i*In+j] = 0.0;
                        }
                }*/
	return(R_NilValue);
}
예제 #9
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dpotrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double *h_A, *h_R;
    magma_int_t N, n2, lda, info;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], error;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo );
    printf("    N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R_magma - R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N     = opts.nsize[i];
            lda   = N;
            n2    = lda*N;
            gflops = FLOPS_DPOTRF( N ) / 1e9;
            
            TESTING_MALLOC(    h_A, double, n2 );
            TESTING_HOSTALLOC( h_R, double, n2 );
            
            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            magma_dmake_hpd( N, h_A, lda );
            lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_dpotrf( opts.uplo, N, h_R, lda, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dpotrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            if ( opts.lapack ) {
                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                cpu_time = magma_wtime();
                lapackf77_dpotrf( &opts.uplo, &N, h_A, &lda, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dpotrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                /* =====================================================================
                   Check the result compared to LAPACK
                   =================================================================== */
                error = lapackf77_dlange("f", &N, &N, h_A, &lda, work);
                blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
                error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error;
                
                printf("%5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e%s\n",
                       (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
                       error, (error < tol ? "" : "  failed") );
                status |= ! (error < tol);
            }
            else {
                printf("%5d     ---   (  ---  )   %7.2f (%7.2f)     ---  \n",
                       (int) N, gpu_perf, gpu_time );
            }
            TESTING_FREE(     h_A );
            TESTING_HOSTFREE( h_R );
        }
    }

    TESTING_FINALIZE();
    return status;
}
예제 #10
0
/**
    Purpose
    -------
    DPOSV computes the solution to a real system of linear equations
       A * X = B,
    where A is an N-by-N symmetric positive definite matrix and X and B
    are N-by-NRHS matrices.
    The Cholesky decomposition is used to factor A as
       A = U**H * U,  if UPLO = MagmaUpper, or
       A = L * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and  L is a lower triangular
    matrix.  The factored form of A is then used to solve the system of
    equations A * X = B.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**H*U or A = L*L**H.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    B       DOUBLE PRECISION array, dimension (LDB,NRHS)
            On entry, the right hand side matrix B.
            On exit, the solution matrix X.

    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dposv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_dposv(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs,
    double *A, magma_int_t lda,
    double *B, magma_int_t ldb,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_)  dA, ((i_) + (j_)*ldda)
    #define  dB(i_, j_)  dB, ((i_) + (j_)*lddb)
    #else
    #define  dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define  dB(i_, j_) (dB + (i_) + (j_)*lddb)
    #endif
    
    magma_int_t ngpu, ldda, lddb;
    magma_queue_t queue = NULL;
    magma_device_t cdev;
    
    *info = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower )
        *info = -1;
    if ( n < 0 )
        *info = -2;
    if ( nrhs < 0)
        *info = -3;
    if ( lda < max(1, n) )
        *info = -5;
    if ( ldb < max(1, n) )
        *info = -7;
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (n == 0 || nrhs == 0) {
        return *info;
    }
    
    /* If single-GPU and allocation suceeds, use GPU interface. */
    ngpu = magma_num_gpus();
    magmaDouble_ptr dA, dB;
    if ( ngpu > 1 ) {
        goto CPU_INTERFACE;
    }
    ldda = magma_roundup( n, 32 );
    lddb = ldda;
    if ( MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n )) {
        goto CPU_INTERFACE;
    }
    if ( MAGMA_SUCCESS != magma_dmalloc( &dB, lddb*nrhs )) {
        magma_free( dA );
        goto CPU_INTERFACE;
    }
    
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    magma_dsetmatrix( n, n, A, lda, dA(0,0), ldda, queue );
    magma_dpotrf_gpu( uplo, n, dA(0,0), ldda, info );
    if ( *info == MAGMA_ERR_DEVICE_ALLOC ) {
        magma_queue_destroy( queue );
        magma_free( dA );
        magma_free( dB );
        goto CPU_INTERFACE;
    }
    magma_dgetmatrix( n, n, dA(0,0), ldda, A, lda, queue );
    if ( *info == 0 ) {
        magma_dsetmatrix( n, nrhs, B, ldb, dB(0,0), lddb, queue );
        magma_dpotrs_gpu( uplo, n, nrhs, dA(0,0), ldda, dB(0,0), lddb, info );
        magma_dgetmatrix( n, nrhs, dB(0,0), lddb, B, ldb, queue );
    }
    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free( dB );
    return *info;

CPU_INTERFACE:
    /* If multi-GPU or allocation failed, use CPU interface and LAPACK.
     * Faster to use LAPACK for potrs than to copy A to GPU. */
    magma_dpotrf( uplo, n, A, lda, info );
    if ( *info == 0 ) {
        lapackf77_dpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info );
    }
    return *info;
}