コード例 #1
0
extern "C" magma_int_t
magma_ztrtri_gpu(magma_uplo_t uplo, magma_diag_t diag, magma_int_t n,
		magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *info)
{
	/*  -- clMAGMA (version 1.0.0) --
		Univ. of Tennessee, Knoxville
		Univ. of California, Berkeley
		Univ. of Colorado, Denver
		August 2012

		Purpose
		=======

		ZTRTRI computes the inverse of a real upper or lower triangular
		matrix dA.

		This is the Level 3 BLAS version of the algorithm.

		Arguments
		=========

		UPLO    (input) CHARACTER*1
		= 'U':  A is upper triangular;
		= 'L':  A is lower triangular.

		DIAG    (input) CHARACTER*1
		= 'N':  A is non-unit triangular;
		= 'U':  A is unit triangular.

		N       (input) INTEGER
		The order of the matrix A.  N >= 0.

		dA       (input/output) DOUBLE PRECISION array ON THE GPU, dimension (LDDA,N)
		On entry, the triangular matrix A.  If UPLO = 'U', the
		leading N-by-N upper triangular part of the array dA contains
		the upper triangular matrix, and the strictly lower
		triangular part of A is not referenced.  If UPLO = 'L', the
		leading N-by-N lower triangular part of the array dA contains
		the lower triangular matrix, and the strictly upper
		triangular part of A is not referenced.  If DIAG = 'U', the
		diagonal elements of A are also not referenced and are
		assumed to be 1.
		On exit, the (triangular) inverse of the original matrix, in
		the same storage format.

		LDDA     (input) INTEGER
		The leading dimension of the array dA.  LDDA >= max(1,N).
		INFO    (output) INTEGER
		= 0: successful exit
		< 0: if INFO = -i, the i-th argument had an illegal value
		> 0: if INFO = i, dA(i,i) is exactly zero.  The triangular
		matrix is singular and its inverse can not be computed.

		===================================================================== */

	/* Local variables */
	magma_uplo_t uplo_ = uplo;
	magma_diag_t diag_ = diag;
	magma_int_t         nb, nn, j, jb;
	magmaDoubleComplex     c_one      = MAGMA_Z_ONE;
	magmaDoubleComplex     c_neg_one  = MAGMA_Z_NEG_ONE;
	magmaDoubleComplex     *work;

	int upper  = lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaUpper));
	int nounit = lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaNonUnit));

	*info = 0;

	if ((! upper) && (! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaLower))))
		*info = -1;
	else if ((! nounit) && (! lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaUnit))))
		*info = -2;
	else if (n < 0)
		*info = -3;
	else if (ldda < max(1,n))
		*info = -5;

	if (*info != 0) {
		magma_xerbla( __func__, -(*info) );
		return *info;
	}

	nb = magma_get_zpotrf_nb(n);

	/* Create Queues */
	magma_queue_t  queues[2];
	magma_device_t device;
	int num = 0;
	magma_err_t err;

	err = magma_get_devices( &device, 1, &num );
	if ( err != 0 || num < 1 ) {
		fprintf( stderr, "magma_get_devices failed: %d\n", err );
		exit(-1);
	}
	err = magma_queue_create( device, &queues[0] );
	if ( err != 0 ) {
		fprintf( stderr, "magma_queue_create 0 failed: %d\n", err );
		exit(-1);
	}	
	err = magma_queue_create( device, &queues[1] );
	if ( err != 0 ) {
		fprintf( stderr, "magma_queue_create 1 failed: %d\n", err );
		exit(-1);
	}	

	if (MAGMA_SUCCESS != magma_malloc_host( (void**)&work, nb*nb*sizeof(magmaDoubleComplex) )) {
		*info = MAGMA_ERR_HOST_ALLOC;
		return *info;
	}

	if (nb <= 1 || nb >= n)
	{
		magma_zgetmatrix( n, n, dA, dA_offset, ldda, work, 0, n, queues[0] );
		lapackf77_ztrtri(lapack_const(uplo_), lapack_const(diag_), &n, work, &n, info);
		magma_zsetmatrix( n, n, work, 0, n, dA, dA_offset, ldda, queues[0] );
	}
	else
	{
		if (upper){
			/* Compute inverse of upper triangular matrix */
			for (j=0; j<n; j =j+ nb){
				jb = min(nb, (n-j));

				/* Compute rows 1:j-1 of current block column */
				magma_ztrmm(MagmaLeft, MagmaUpper, 
						MagmaNoTrans, MagmaNonUnit, j, jb, 
						c_one, dA(0,0), ldda, dA(0, j), ldda, 
						queues[0]);

				magma_ztrsm(MagmaRight, MagmaUpper, 
						MagmaNoTrans, MagmaNonUnit, j, jb,
						c_neg_one, dA(j,j), ldda, dA(0, j), ldda, 
						queues[0]);
	
				magma_zgetmatrix_async( jb, jb,
						dA(j, j), ldda,
						work, 0, jb, queues[1], NULL );
				
				magma_queue_sync( queues[1] );

				/* Compute inverse of current diagonal block */
				lapackf77_ztrtri(MagmaUpperStr, lapack_const(diag_), &jb, work, &jb, info);
				/*
				magma_zsetmatrix_async( jb, jb, 
						work, 0, jb,
						dA(j, j), ldda, queues[0], NULL );
				*/
				magma_zsetmatrix( jb, jb, 
						work, 0, jb,
						dA(j, j), ldda, queues[0]);
			}
		}
		else{
			/* Compute inverse of lower triangular matrix */
			nn=((n-1)/nb)*nb+1;

			for(j=nn-1; j>=0; j=j-nb){
				jb=min(nb,(n-j));

				if((j+jb) < n){
					/* Compute rows j+jb:n of current block column */
					magma_ztrmm(MagmaLeft, MagmaLower, 
							MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb,
							c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda, 
							queues[0]);

					magma_ztrsm(MagmaRight, MagmaLower,
							MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb,
							c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda, 
							queues[0]);
				}
				magma_zgetmatrix_async( jb, jb, 
						dA(j, j), ldda,
						work, 0, jb, queues[1], NULL );
				
				magma_queue_sync( queues[1] );

				/* Compute inverse of current diagonal block */
				lapackf77_ztrtri(MagmaLowerStr, lapack_const(diag_), &jb, work, &jb, info);
				/*
				magma_zsetmatrix_async( jb, jb,
						work, 0, jb,
						dA(j, j), ldda, queues[0], NULL );
				*/
				magma_zsetmatrix( jb, jb,
						work, 0, jb,
						dA(j, j), ldda, queues[0] );
			}
		}
	}

	magma_free_host( work );
	magma_queue_destroy(queues[0]);
	magma_queue_destroy(queues[1]);
	return *info;
}
コード例 #2
0
magma_err_t
magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n,
                   magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda,
                   float *tau, magma_err_t *info,
		   magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       April 2012

    Purpose
    =======
    SGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -9, internal GPU memory allocation failed.

    Further Details
    ===============

    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magmaFloat_ptr dwork;
    float  *work;

    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_sgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;

    
    if ( MAGMA_SUCCESS != magma_malloc( &dwork, n*nb*sizeof(float))) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    if ( MAGMA_SUCCESS != magma_malloc_host((void**)&work, lwork*sizeof(float)) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        magma_free( dwork );
        return *info;
    }
    
    magma_event_t event[2] = {NULL, NULL};                                                            

    nbmin = 2;
    nx    = nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
	    
	    magma_queue_sync( queue );
	    magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queue, &event[0]);
          
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue);

		magma_ssetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
					dA(old_i, old_i), ldda, queue, &event[1]); 
            }

	    magma_event_sync(event[0]);
            lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
   
	    if (i > 0) {
	      magma_event_sync(event[1]);
	    }

	    /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, 
                              &rows, &ib, 
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
	    magma_ssetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue);
            sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
	    
            if (i + ib < n) 
	      {
		magma_ssetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queue);
		
                if (i+nb < k-nx)
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib, 
                                      dA(i, i   ), ldda, dwork,0,  lddwork, 
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue);
                else {
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib, 
                                      dA(i, i   ), ldda, dwork,0,  lddwork, 
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue);
		    magma_ssetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue);
                }
                old_i  = i;
                old_ib = ib;
	      }
        }
    } else {
        i = 0;
    }

    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
	magma_sgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queue);

        lhwork = lwork - rows*ib;
        lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
	magma_ssetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queue);
    }

    magma_free_host(work);

    return *info;
} /* magma_sgeqrf2_gpu */
コード例 #3
0
magma_int_t
magma_zsytrf_stapiv_gpu(char uplo, magma_int_t n,
                        cuDoubleComplex *dA, magma_int_t ldda,
                        double criteria, PASTIX_INT * npiv, PASTIX_FLOAT * tmp4, magma_int_t *info)
{
/*  -- MAGMA (version 1.2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       May 2012

    Purpose
    =======
    ZSYTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    cuDoubleComplex *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    long int        upper = uplo_lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zsytrf_nb(n);

    if (MAGMA_SUCCESS != magma_zmalloc_host( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    static cudaStream_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if ((nb <= 1) || (nb >= n)) {
        /*  Use unblocked code. */
        magma_zgetmatrix( n, n, dA, ldda, work, n );
        assert(!upper); /* PaStiX only works with lower */
        PASTIX_sytrf_block((PASTIX_FLOAT*)work, n, n,
                           npiv,
                           criteria, tmp4);
        magma_zsetmatrix( n, n, work, n, dA, ldda );
    } else {
        /* Use blocked code. */
        if (upper) {
          assert(0); /* PaStiX only works with lower */
      
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j<n; j+=nb) {

                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));

                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }

                magma_queue_sync( stream[1] );

                /* lapackf77_zsytrf(MagmaUpperStr, &jb, work, &jb, info); */
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                  *info = *info + j;
                  break;
                }

                if ( (j+jb) < n)
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
            }
        } else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j<n; j+=nb) {

                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_zsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            c_neg_one, dA(j, 0), ldda,
                            c_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[1] );
                PASTIX_sytrf_block((PASTIX_FLOAT*)work, jb, jb,
                                   npiv,
                                   criteria, tmp4);

                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                  *info = *info + j;
                  break;
                }

                if ( (j+jb) < n)
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
            }

        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_host( work );

    return *info;
} /* magma_zsytrf_gpu */