コード例 #1
0
ファイル: sgetrf_msub.cpp プロジェクト: kjbartel/clmagma
extern "C" magma_int_t
magma_sgetrf_msub(
    magma_trans_t trans, magma_int_t num_subs, magma_int_t num_gpus, 
    magma_int_t m, magma_int_t n, 
    magmaFloat_ptr *d_lA, size_t dlA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_queue_t *queues,
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    #define d_lAT(id,i,j)  d_lAT[(id)], (((i)*nb)*lddat + (j)*nb)
    #define d_lA( id,i,j)  d_lA[(id)],  (((i)*nb)+ldda  * (j)*nb)

    magma_int_t maxm, tot_subs = num_subs*num_gpus;
    magma_int_t i, j, d, lddat;
    /* submatrix info */
    magma_int_t nb, n_local[ MagmaMaxSubs * MagmaMaxGPUs ];
    magmaFloat_ptr d_lAT[ MagmaMaxSubs * MagmaMaxGPUs ];
    /* local workspace per GPU */
    magmaFloat_ptr d_panel[ MagmaMaxGPUs ];
    magmaFloat_ptr d_lAP[ MagmaMaxGPUs ];
    float *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (trans == MagmaTrans && ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_sgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_smalloc_cpu( &work, m * n );
        if (work == NULL) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        printf( "trans %c, m %d, n %d\n", lapacke_trans_const(trans), m, n );
        magma_sgetmatrix( m, n, d_lA[0], 0, ldda, work, m, queues[0] );
        lapackf77_sgetrf( &m, &n, work, &m, ipiv, info );
        magma_ssetmatrix( m, n, work, m, d_lA[0], 0, ldda, queues[0] );
        magma_free_cpu( work );
    } else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        if (tot_subs > ceil((float)n/nb)) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) tot_subs );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = n/nb;           /* number of block columns         */
        lddat = lddat/tot_subs; /* number of block columns per GPU */
        lddat = nb*lddat;       /* number of columns per GPU       */
        if (lddat * tot_subs < n) {
            /* left over */
            if (n-lddat*tot_subs >= nb) {
                lddat += nb;
            } else {
                lddat += (n-lddat*tot_subs)%nb;
            }
        }
        lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */
        /* allocating workspace */
        for (d=0; d < num_gpus; d++) {
            //#define SINGLE_GPU_PER_CONTEXT
            #ifdef SINGLE_GPU_PER_CONTEXT
            if ((MAGMA_SUCCESS != magma_smalloc_mgpu( d, &d_panel[d], (2+num_gpus)*nb*maxm ))  ||
                (MAGMA_SUCCESS != magma_smalloc_mgpu( d, &d_lAP[d],   (2+num_gpus)*nb*maxm )) ) {
            #else
            if ((MAGMA_SUCCESS != magma_smalloc( &d_panel[d], (2+num_gpus)*nb*maxm ))  ||
                (MAGMA_SUCCESS != magma_smalloc( &d_lAP[d], (2+num_gpus)*nb*maxm )) ) {
            #endif
                for( i=0; i < d; i++ ) {
                    magma_free( d_panel[i] );
                    magma_free( d_lAP[i] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
        }
        /* transposing the local matrix */
        for (i=0; i < tot_subs; i++) {
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/tot_subs)*nb;
            if (i < (n/nb)%tot_subs)
               n_local[i] += nb;
            else if (i == (n/nb)%tot_subs)
               n_local[i] += n%nb;

            /* local-matrix storage */
            if (trans == MagmaNoTrans) {
                d_lAT[i] = d_lA[i];
            } else {
                if ( m == n_local[i] ) {
                    d_lAT[i] = d_lA[i];
                    magmablas_stranspose_inplace( m, d_lA[i], 0, ldda, queues[2*(i%num_gpus)+1] );
                } else {
                    #ifdef SINGLE_GPU_PER_CONTEXT
                    if (MAGMA_SUCCESS != magma_smalloc_mgpu( i%num_gpus, &d_lAT[i], lddat*maxm )) {
                    #else
                    if (MAGMA_SUCCESS != magma_smalloc( &d_lAT[i], lddat*maxm )) {
                    #endif
                        for (j=0; j <= i; j++) {
                            magma_free( d_panel[j] );
                            magma_free( d_lAP[j] );
                        }
                        for (j=0; j < i; j++) {
                            if (d_lAT[j] != d_lA[j]) magma_free( d_lAT[j] );
                        }
                        *info = MAGMA_ERR_DEVICE_ALLOC;
                        return *info;
                    }
                    magmablas_stranspose( m, n_local[i], d_lA[i], 0, ldda, d_lAT[i], 0, lddat, queues[2*(i%num_gpus)+1]);
                }
            }
        }
        if (trans == MagmaNoTrans) {
            for (d=0; d < num_gpus; d++){
                magma_queue_sync(queues[2*d+1]);
            }
        }

        /* cpu workspace */
        #ifdef USE_PINNED_CLMEMORY
        cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*maxm*nb*(1+num_gpus), NULL, NULL);
        for (d=0; d < num_gpus; d++) {
            work = (float*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
                                                           sizeof(float)*maxm*nb*(1+num_gpus), 0, NULL, NULL, NULL);
        }
        #else
        if (MAGMA_SUCCESS != magma_smalloc_cpu( &work, maxm*nb*(1+num_gpus) )) {
            for(d=0; d < num_gpus; d++ ) magma_free( d_panel[d] );
            for(d=0; d < tot_subs; d++ ) {
                if( d_lAT[d] != d_lA[d] ) magma_free( d_lAT[d] );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        #endif

        /* calling multi-gpu interface with allocated workspaces and streams */
        magma_sgetrf2_msub(num_subs, num_gpus, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_lAP, d_panel, 0, work, maxm,
                           queues, info);

        /* save on output */
        for (d=0; d < tot_subs; d++) {
            if (trans == MagmaNoTrans) {
                //magma_scopymatrix( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] );
            } else {
                if (d_lAT[d] == d_lA[d]) {
                    magmablas_stranspose_inplace( m, d_lA[d], 0, ldda, queues[2*(d%num_gpus)+1] );
                } else {
                    magmablas_stranspose( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*(d%num_gpus)+1] );
                }
            }
        }
        /* clean up */
        for (d=0; d < num_gpus; d++) {
            magma_queue_sync(queues[2*d+1]);
            magma_free( d_panel[d] );
            magma_free( d_lAP[d] );
            d_panel[d] = d_lAP[d] = NULL;
        } 
        for (d=0; d < tot_subs; d++) {
            if (d_lAT[d] != d_lA[d]) {
                magma_free( d_lAT[d] ); 
                d_lAT[d] = NULL;
            }
        }
        #ifdef USE_PINNED_CLMEMORY
        for (d=0; d < num_gpus; d++) {
            clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL);
        }
        clReleaseMemObject( buffer );
        #else
        magma_free_cpu( work );
        #endif
        work = NULL;
      }
      return *info;       
      /* End of MAGMA_SGETRF_MSUB */
}
コード例 #2
0
ファイル: sgelqf.cpp プロジェクト: maxhutch/magma
/***************************************************************************//**
    Purpose
    -------
    SGELQF computes an LQ factorization of a REAL M-by-N matrix A:
    A = L * Q.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.
    \n
            TODO: work is currently unused. sgeqrf2 allocates its own work of (m + n)*nb.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).

    @ingroup magma_gelqf
*******************************************************************************/
extern "C" magma_int_t
magma_sgelqf(
    magma_int_t m, magma_int_t n,
    float *A,    magma_int_t lda,   float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  dA(i_, j_)  (dA  + (i_) + (j_)*ldda)
    #define dAT(i_, j_)  (dAT + (i_) + (j_)*ldda)
    
    /* Constants */
    const float c_one = MAGMA_S_ONE;
    const magma_int_t ione = 1;
    MAGMA_UNUSED( ione );  // used only for real
    
    /* Local variables */
    magmaFloat_ptr dA=NULL, dAT=NULL;
    magma_int_t min_mn, maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda, lddat;

    /* Function Body */
    *info = 0;
    nb = magma_get_sgelqf_nb( m, n );
    min_mn = min( m, n );

    work[0] = magma_smake_lwork( m*nb );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = magma_roundup( m, 32 );
    maxn = magma_roundup( n, 32 );
    maxdim = max( maxm, maxn );

    magma_queue_t queue = NULL;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    // copy to GPU and transpose
    if (maxdim*maxdim < 2*maxm*maxn) {
        // close to square, do everything in-place
        ldda  = maxdim;
        lddat = maxdim;

        if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue );
        dAT = dA;
        magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue );
    }
    else {
        // rectangular, do everything out-of-place
        ldda  = maxm;
        lddat = maxn;

        if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue );

        dAT = dA + maxn * maxm;
        magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue );
    }

    // factor QR
    magma_sgeqrf2_gpu( n, m, dAT(0,0), lddat, tau, &iinfo );
    assert( iinfo >= 0 );
    if ( iinfo > 0 ) {
        *info = iinfo;
    }
    
    // conjugate tau
    #ifdef COMPLEX
    lapackf77_slacgv( &min_mn, tau, &ione );
    #endif

    // undo transpose
    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue );
        magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue );
    } else {
        magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue );
        magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dA );

    return *info;
} /* magma_sgelqf */
コード例 #3
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing stranspose
   Code is very similar to testing_ssymmetrize.cpp
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gbytes, gpu_perf, gpu_time, gpu_perf2=0, gpu_time2=0, cpu_perf, cpu_time;
    float           error, error2, work[1];
    float  c_neg_one = MAGMA_S_NEG_ONE;
    float *h_A, *h_B, *h_R;
    float *d_A, *d_B;
    magma_int_t M, N, size, lda, ldda, ldb, lddb;
    magma_int_t ione     = 1;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    printf("Inplace transpose requires M==N.\n");
    printf("    M     N   CPU GByte/s (ms)    GPU GByte/s (ms)  check   Inplace GB/s (ms)  check\n");
    printf("====================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = M;
            ldda   = ((M+31)/32)*32;
            ldb    = N;
            lddb   = ((N+31)/32)*32;
            // load entire matrix, save entire matrix
            gbytes = sizeof(float) * 2.*M*N / 1e9;
            
            TESTING_MALLOC_CPU( h_A, float, lda*N  );  // input:  M x N
            TESTING_MALLOC_CPU( h_B, float, ldb*M  );  // output: N x M
            TESTING_MALLOC_CPU( h_R, float, ldb*M  );  // output: N x M
            
            TESTING_MALLOC_DEV( d_A, float, ldda*N );  // input:  M x N
            TESTING_MALLOC_DEV( d_B, float, lddb*M );  // output: N x M
            
            /* Initialize the matrix */
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j );
                }
            }
            for( int j = 0; j < M; ++j ) {
                for( int i = 0; i < N; ++i ) {
                    h_B[i + j*ldb] = MAGMA_S_MAKE( i + j/10000., j );
                }
            }
            magma_ssetmatrix( N, M, h_B, ldb, d_B, lddb );
            
            /* =====================================================================
               Performs operation using naive out-of-place algorithm
               (LAPACK doesn't implement transpose)
               =================================================================== */
            cpu_time = magma_wtime();
            //for( int j = 1; j < N-1; ++j ) {      // inset by 1 row & col
            //    for( int i = 1; i < M-1; ++i ) {  // inset by 1 row & col
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_B[j + i*ldb] = h_A[i + j*lda];
                }
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, out-of-place
               =================================================================== */
            magma_ssetmatrix( M, N, h_A, lda, d_A, ldda );
            magma_ssetmatrix( N, M, h_B, ldb, d_B, lddb );
            
            gpu_time = magma_sync_wtime( 0 );
            //magmablas_stranspose( M-2, N-2, d_A+1+ldda, ldda, d_B+1+lddb, lddb );  // inset by 1 row & col
            magmablas_stranspose( M, N, d_A, ldda, d_B, lddb );
            gpu_time = magma_sync_wtime( 0 ) - gpu_time;
            gpu_perf = gbytes / gpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, in-place
               =================================================================== */
            if ( M == N ) {
                magma_ssetmatrix( M, N, h_A, lda, d_A, ldda );
                
                gpu_time2 = magma_sync_wtime( 0 );
                //magmablas_stranspose_inplace( N-2, d_A+1+ldda, ldda );  // inset by 1 row & col
                magmablas_stranspose_inplace( N, d_A, ldda );
                gpu_time2 = magma_sync_wtime( 0 ) - gpu_time2;
                gpu_perf2 = gbytes / gpu_time2;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // check out-of-place transpose (d_B)
            size = ldb*M;
            magma_sgetmatrix( N, M, d_B, lddb, h_R, ldb );
            blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
            error = lapackf77_slange("f", &N, &M, h_R, &ldb, work );
            
            if ( M == N ) {
                // also check in-place tranpose (d_A)
                magma_sgetmatrix( N, M, d_A, ldda, h_R, ldb );
                blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
                error2 = lapackf77_slange("f", &N, &M, h_R, &ldb, work );
    
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s  %7.2f (%7.2f)  %s\n",
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed"),
                       gpu_perf2, gpu_time2,
                       (error2 == 0. ? "ok" : "failed") );
                status += ! (error == 0. && error2 == 0.);
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s    ---   (  ---  )\n",
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed") );
                status += ! (error == 0.);
            }
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_R );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
コード例 #4
0
ファイル: sgelqf.cpp プロジェクト: soulsheng/magma
extern "C" magma_int_t
magma_sgelqf( magma_int_t m, magma_int_t n,
              float *a,    magma_int_t lda,   float *tau,
              float *work, magma_int_t lwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SGELQF computes an LQ factorization of a REAL M-by-N matrix A:
    A = L * Q.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -10 internal GPU memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))

    float *dA, *dAT;
    float c_one = MAGMA_S_ONE;
    magma_int_t maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda;
    int lquery;

    /* Function Body */
    *info = 0;
    nb = magma_get_sgelqf_nb(m);

    work[0] = MAGMA_S_MAKE( (float)(m*nb), 0 );
    lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min(m, n) == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = ((m + 31)/32)*32;
    maxn = ((n + 31)/32)*32;
    maxdim = max(maxm, maxn);

    if (maxdim*maxdim < 2*maxm*maxn)
        {
            ldda = maxdim;

            if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_ssetmatrix( m, n, a, lda, dA, ldda );
            dAT = dA;
            magmablas_stranspose_inplace( ldda, dAT, ldda );
        }
    else
        {
            ldda = maxn;

            if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_ssetmatrix( m, n, a, lda, dA, maxm );

            dAT = dA + maxn * maxm;
            magmablas_stranspose2( dAT, ldda, dA, maxm, m, n );
        }

    magma_sgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);

    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_stranspose_inplace( ldda, dAT, ldda );
        magma_sgetmatrix( m, n, dA, ldda, a, lda );
    } else {
        magmablas_stranspose2( dA, maxm, dAT, ldda, n, m );
        magma_sgetmatrix( m, n, dA, maxm, a, lda );
    }

    magma_free( dA );

    return *info;
} /* magma_sgelqf */