Exemple #1
0
void F77WRAP_LARFB
   (const F77_INTEGER *iside, const F77_INTEGER *itrans,
    const F77_INTEGER *idirect, const F77_INTEGER *istore,
    const F77_INTEGER *M, const F77_INTEGER *N, const F77_INTEGER *K,
    TYPE *V, const F77_INTEGER *ldv, TYPE *T, const F77_INTEGER *ldt,
    TYPE *C, const F77_INTEGER *ldc, TYPE *work, const F77_INTEGER *ldwork)
{
   ATL_larfb(*iside, *itrans, *idirect, *istore, *M, *N, *K, V, *ldv, T, *ldt,
             C, *ldc, work, *ldwork);
}
Exemple #2
0
int ATL_ormqr
   (const enum CBLAS_SIDE SIDE, const enum CBLAS_TRANSPOSE TRANS,
    ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda,
    const TYPE *TAU, TYPE *C, ATL_CINT ldc, TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE ATL_ormqr( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
 *                        WORK, LWORK, INFO )
 *
 * ATL_ormqr.c :
 * int ATL_ormqr(const enum CBLAS_SIDE SIDE SIDE,
 *        const enum CBLAS_TRANSPOSE TRANS, ATL_CINT M, ATL_CINT N,
 *        ATL_CINT K, TYPE * A, ATL_CINT lda,TYPE * TAU, TYPE * C, ATL_CINT ldc,
 *                       TYPE * WORK, ATL_CINT LWORK)
 *
 *      NOTE :   ATL_ormqr.c will get compiled to four precisions
 *               single precision real,      double precision real
 *               single precision complex,   double precision complex
 *
 *
 *
 *  Purpose
 *  =======
 *
 *  ATL_ormqr overwrites the general real M-by-N matrix C with
 *
 *                  SIDE = 'L'     SIDE = 'R'
 *  TRANS = 'N':      Q * C          C * Q
 *  TRANS = 'T':      Q**T * C       C * Q**T
 *
 *  where Q is,
 *        a real orthogonal matrix defined as the product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *   OR
 *        a complex unitary matrix defined as a product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *  as returned by ATLL_geqrf.c. Q is of order M if SIDE = 'L' and of order N
 *  if SIDE = 'R'.
 *
 *  Arguments
 *  =========
 *
 *  SIDE    (input) CHARACTER*1
 *          = 'L': apply Q or Q**T from the Left;
 *          = 'R': apply Q or Q**T from the Right.
 *
 *  TRANS   (input) CHARACTER*1
 *          = 'N':  No transpose, apply Q;
 *          = 'T':  Transpose, apply Q**T.
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix C. M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix C. N >= 0.
 *
 *  K       (input) INTEGER
 *          The number of elementary reflectors whose product defines
 *          the matrix Q.
 *          If SIDE = 'L', M >= K >= 0;
 *          if SIDE = 'R', N >= K >= 0.
 *
 *  A       (input) array, dimension (LDA,K)
 *          The i-th column must contain the vector which defines the
 *          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *          DGEQRF in the first k columns of its array argument A.
 *          A is modified by the routine but restored on exit.
 *
 *  lda     (input) INTEGER
 *          The leading dimension of the array A.
 *          If SIDE = 'L', LDA >= max(1,M);
 *          if SIDE = 'R', LDA >= max(1,N).
 *
 *  TAU     (input)  array, dimension (K)
 *          TAU(i) must contain the scalar factor of the elementary
 *          reflector H(i), as returned by ATL_geqrf.c.
 *
 *  C       (input/output)  array, dimension (LDC,N)
 *          On entry, the M-by-N matrix C.
 *          On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
 *
 *  ldc     (input) INTEGER
 *          The leading dimension of the array C. LDC >= max(1,M).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.
 *          If SIDE = 'L', LWORK >= max(1,N);
 *          if SIDE = 'R', LWORK >= max(1,M).
 *          For optimum performance LWORK >= N*NB if SIDE = 'L', and
 *          LWORK >= M*NB if SIDE = 'R', where NB is the optimal
 *          blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued by XERBLA.
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j, ib, mi, ni, ic, jc ;
   TYPE  *ws_QR2,  *ws_T, *ws_larfb;        /* Workspace for QR2,T, larfb     */
   void *vp=NULL;

   nb = clapack_ilaenv(LAIS_OPT_NB, LAormqr, MYOPT+LARight+LAUpper, M, N, K,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      if(SIDE == CblasLeft)
      {
         *WORK = ( N*nb + nb*nb + maxMN )  ;
      }
      else
      {
         *WORK = ( M*nb + nb*nb + maxMN )  ;
      }
      return(0);
   }
   else if (M < 1 || N < 1)                 /* quick return if no work to do  */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else
   {
      if(SIDE == CblasLeft)
      {
         if (LWORK < (N*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      }
      else
      {
         if (LWORK < (M*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(M*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      } /* if CblasRight */
   }

/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 */

   ws_T = WORK;                             /* T at begining of work          */
   ws_QR2 = WORK +(nb SHIFT)*nb;            /* After T Work space             */
   ws_larfb = ws_QR2 + (maxMN SHIFT);       /* After workspace for T and QR2  */


   if (SIDE == CblasLeft)
   {
      if ( TRANS == CblasNoTrans )
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {
            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
          }                                 /* for                            */
      }                                     /* CblasNoTrans                   */
      else
      {
         for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
         }                                  /* for                            */
      }                                     /* CblasNoTran                    */
   }                                        /* cblasLeft                      */
   else
   {
      if ( TRANS == CblasNoTrans )
      {
 	 for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc), ldc, ws_larfb, M);
          }                                 /* for                            */
      }
      else
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {

            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */

            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j , ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc) , ldc, ws_larfb, M);
         }                                  /* for                            */

      }                                     /* Cblas Tran on Right            */
   }

   if (vp)
      free(vp);
   return(0);
}                                           /* END ATL_ormqr                  */
Exemple #3
0
int ATL_gelqf(ATL_CINT M, ATL_CINT N, TYPE  *A, ATL_CINT lda, TYPE *TAU,
               TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE gelqf( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
 * ATL_gelqf.c :
 * int ATL_gelqf(int M, int N, TYPE  *A, int LDA, TYPE  *TAU,
 *              TYPE *WORK, int LWORK)
 *
 *  Purpose
 *  =======
 *
 *  ATL_gelqf  computes an LQ factorization of a real/complex M-by-N matrix A:
 *  A = L * Q.
 *
 *  Compared to LAPACK, here, a recursive panel factorization is implemented.
 *  Refer to ATL_gelqr.c andd ATL_larft.c for details.
 *
 *  Arguments
 *  =========
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input/output) array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *          On exit, the elements on and above the diagonal of the array
 *          contain the min(M,N)-by-N upper trapezoidal matrix R (R is
 *          upper triangular if m >= n); the elements below the diagonal,
 *          with the array TAU, represent the orthogonal matrix Q as a
 *          product of min(m,n) elementary reflectors (see Further
 *          Details).
 *
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  TAU     (output) array, dimension (min(M,N))
 *          The scalar factors of the elementary reflectors (see Further
 *          Details).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.  LWORK >= max(1,N).
 *          For optimum performance LWORK >= N*NB, where NB is
 *          the optimal blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued .
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 *  Further Details
 *  ===============
 *
 *  The matrix Q is represented as a product of elementary reflectors
 *
 *     Q = H(1) H(2) . . . H(k), where k = min(m,n).
 *
 *  Each H(i) has the form
 *
 *     H(i) = I - tau * v * v'                  (For Real precision)
 *     H(i) = I - tau * v * conjugate(v)'       (For Complex precision)
 *
 *  where tau is a real/complex scalar, and v is a real/complex vector with
 *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
 *  and tau in TAU(i).
 *
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j;
   TYPE  *ws_LQ2,  *ws_T, *ws_larfb;        /* Workspace level 2, T, larfb.   */
   void *vp=NULL;

   /* For transpose function, may need type-appropriate 'ONE' for alpha. */
   #ifdef TREAL
      const TYPE ONE = ATL_rone;
   #else
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif
   TYPE *ws_CP=NULL, *ws_CPRaw=NULL;
   ATL_INT ldCP;

   #if defined(ATL_TUNING)
   /*-------------------------------------------------------------------------*/
   /* For tuning recursion crossover points, the blocking factor is set by    */
   /* la2xover, the tuning program for that purpose.                          */
   /*-------------------------------------------------------------------------*/
   if (ATL_PanelTune) nb=ATL_PanelTune; else
   #endif /* ATL_TUNING */

   nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LALeft+LALower, M, N,-1,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_gelq2
 */
   if (LWORK < 0)
   {
      *WORK = ( maxMN*nb + nb*nb + maxMN )  ;
      return(0);
   }
   else if (M < 1 || N < 1)  /* quick return if no work to do */
      return(0);

/*
 * LQ is the transpose of QR: We use this to go from row-major LQ to
 * col-major QR, typically faster. Here, if we are square and large,
 * we transpose the whole matrix in-place and then transpose it back.
 * This should be a tunable parameter; perhaps if the matrix fits in
 * L1 or L2? (Note by Tony C, short on time to conduct tuning).
 */
   if (M == N && N >= 128)
   {
      Mjoin(PATL,sqtrans)(N, A, lda);
      n = ATL_geqrf(M, N, A, lda, TAU, WORK, LWORK);
      Mjoin(PATL,sqtrans)(N, A, lda);

      /* Take the conjugate for Complex TAU. */
      #ifdef TCPLX
      ATL_INT i;
      for (i=1; i<(minMN<<1); i+=2)
         *(TAU+i) = 0.-*(TAU+i);          /* Negate imaginary part. */
      #endif
      return(n);
   }
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else if (LWORK < (maxMN*nb + nb*nb + maxMN))
   {
      vp = malloc(ATL_MulBySize(maxMN*nb + nb*nb + maxMN) + ATL_Cachelen);
      if (!vp)
         return(-7);
       WORK = ATL_AlignPtr(vp);
   }

/*
 * Assign workspace areas for ATL_larft, ATL_gelq2, ATL_larfb
 */
   ws_T = WORK;                         /* T at begining of work */
   ws_LQ2 = WORK +(nb SHIFT)*nb;        /* After T Work space             */
   ws_larfb = ws_LQ2 + (maxMN SHIFT);   /* After workspace for T and LQ2  */

/*
 * Leave one iteration to be done outside loop, so we don't build T
 * Any loop iterations are therefore known to be of size nb (no partial blocks)
 */
   n = (minMN / nb) * nb;
   if (n == minMN)
      n -= Mmin(nb, minMN);       /* when n is a multiple of nb, reduce by nb */
   #if !defined(ATL_USEPTHREADS)        /* If no PCA, try to copy up front. */
      j = M - n;
      j = Mmax(nb, j);
      ldCP = (N&7) ? (((N+7)>>3)<<3) : N;
      ws_CPRaw = malloc(ATL_MulBySize(ldCP)*j + ATL_Cachelen);
      if (ws_CPRaw) ws_CP=ATL_AlignPtr(ws_CPRaw);  /* Align if malloced. */
   #endif /* Serial Mode */


   for (j=0; j < n; j += nb)
   {
      #if !defined(ATL_USEPTHREADS) /* If no PCA it won't copy. Try it here. */
      /* If we got our copy workspace, transpose panel before recursion. */
      if (ws_CP)                             /* If workspace exists. */
      {
         int ci, cj;                         /* for conjugation.     */
         ldCP = N-j;
         if (ldCP&7)
            ldCP = ((ldCP+7)>>3)<<3;
         Mjoin(PATL,gemoveT)(N-j, nb, ONE, A+(j SHIFT)*(lda+1),
                             lda, ws_CP, ldCP);

         ATL_assert(!ATL_geqrr(N-j, nb, ws_CP, ldCP, TAU+(j SHIFT),
                               ws_LQ2, ws_T, nb, ws_larfb, 1));

         Mjoin(PATL,gemoveT)(nb, N-j, ONE, ws_CP, ldCP,
                             A+(j SHIFT)*(lda+1), lda);

         #if defined(TCPLX)               /* conj upTri T, TAU. */
         for (cj=0; cj<nb; cj++)          /* column loop... */
         {
            TAU[((j+cj) SHIFT)+1] = 0.-TAU[((j+cj) SHIFT)+1];
            for (ci=0; ci<=cj; ci++)      /* row loop... */
               ws_T[((ci+cj*nb) SHIFT)+1] = 0.-ws_T[((ci+cj*nb) SHIFT)+1];
         }
         #endif /* defined(TCPLX) */
      } else /* copy workspace was not allocated, use native. */
      #endif /* Serial Mode (No PCA) */
      {
         ATL_assert(!ATL_gelqr(nb, N-j,  A+(j SHIFT)*(lda+1), lda,
                               TAU+(j SHIFT), ws_LQ2, ws_T, nb, ws_larfb, 1));
      }

      if (j+nb < M)  /* if there are more cols left to bottom, update them */
      {
/*
 *       ======================================================================
 *       Form the triangular factor of the block reflector
 *       After gelqr, ws_T contains 'T', the nb x nb triangular factor 'T'
 *       of the block reflector. It is an output used in the next call, dlarfb.
 *          H = Id - Y'*T*Y, with Id=(N-j)x(N-j), Y=(N-j)xNB.
 *
 *       The ws_T array used above is an input to dlarfb; it is 'T' in
 *       that routine, and LDT x K (translates here to LDWORK x NB).
 *       WORK is an LDWORK x NB workspace (not input or output).
 *       ======================================================================
 */
         ATL_larfb(CblasRight, CblasNoTrans, LAForward, LARowStore,
                   M-j-nb, N-j, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb,
                   A+((j SHIFT)*(lda+1))+(nb SHIFT), lda, ws_larfb, M);
      }
   }
Exemple #4
0
int ATL_geqrf(ATL_CINT M, ATL_CINT N, TYPE  *A, ATL_CINT lda, TYPE *TAU,
               TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE geqrf( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
 * ATL_geqrf.c :
 * int ATL_geqrf(int M, int N, TYPE  *A, int LDA, TYPE  *TAU,
 *              TYPE *WORK, int LWORK)
 *
 *  Purpose
 *  =======
 *
 *  ATL_geqrf  computes a QR factorization of a real/complex M-by-N matrix A:
 *  A = Q * R.
 *
 *  Compared to LAPACK, here, a recursive panel factorization is implemented.
 *  Refer to ATL_geqrr.c andd ATL_larft.c for details.
 *
 *  Arguments
 *  =========
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input/output) array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *          On exit, the elements on and above the diagonal of the array
 *          contain the min(M,N)-by-N upper trapezoidal matrix R (R is
 *          upper triangular if m >= n); the elements below the diagonal,
 *          with the array TAU, represent the orthogonal matrix Q as a
 *          product of min(m,n) elementary reflectors (see Further
 *          Details).
 *
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  TAU     (output) array, dimension (min(M,N))
 *          The scalar factors of the elementary reflectors (see Further
 *          Details).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.  LWORK >= max(1,N).
 *          For optimum performance LWORK >= N*NB, where NB is
 *          the optimal blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued .
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 *  Further Details
 *  ===============
 *
 *  The matrix Q is represented as a product of elementary reflectors
 *
 *     Q = H(1) H(2) . . . H(k), where k = min(m,n).
 *
 *  Each H(i) has the form
 *
 *     H(i) = I - tau * v * v'                  (For Real precision)
 *     H(i) = I - tau * v * conjugate(v)'       (For Complex precision)
 *
 *  where tau is a real/complex scalar, and v is a real/complex vector with
 *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
 *  and tau in TAU(i).
 *
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j;
   TYPE  *ws_QR2,  *ws_T, *ws_larfb;        /* Workspace level 2, T, larfb.   */
   void *vp=NULL;

   #if defined(ATL_TUNING)
   /*-------------------------------------------------------------------------*/
   /* For tuning recursion crossover points, the blocking factor is set by    */
   /* la2xover, the tuning program for that purpose.                          */
   /*-------------------------------------------------------------------------*/
   if (ATL_PanelTune) nb=ATL_PanelTune; else
   #endif /* ATL_TUNING */

   nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LARight+LAUpper, M, N,-1,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      *WORK = ( N*nb + nb*nb + maxMN )  ;
      return(0);
   }
   else if (M < 1 || N < 1)                 /* quick return if no work to do  */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else if (LWORK < (N*nb + nb*nb + maxMN))
   {
      vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
      if (!vp)
         return(-7);
      WORK = ATL_AlignPtr(vp);
   }

/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 * RCW Q: Why can't LARFB & GEQR2 workspaces be overlapped?
 */
   ws_T = WORK;                             /* T at begining of work          */
   ws_QR2 = WORK +(nb SHIFT)*nb;            /* After T Work space             */
   ws_larfb = ws_QR2 + (maxMN SHIFT);       /* After workspace for T and QR2  */

/*
 * Leave one iteration to be done outside loop, so we don't build T
 * Any loop iterations are therefore known to be of size nb (no partial blocks)
 */
   n = (minMN / nb) * nb;
   if (n == minMN)
      n -= Mmin(nb, minMN);

   for (j=0; j < n; j += nb)
   {
      ATL_assert(!ATL_geqrr(M-j, nb, A+(j SHIFT)*(lda+1), lda,
                            TAU+(j SHIFT), ws_QR2, ws_T, nb, ws_larfb, 1));

      if (j+nb < N)     /* if there are more cols left to right, update them  */
      {
/*
 *       Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 *       After geqrr, ws_T contains 'T', the nb x nb triangular factor 'T'
 *       of the block reflector. It is an output used in the next call, dlarfb.
 *          H = Id - Y*T*Y', with Id=(M-j)x(M-j), Y=(M-j)xNB.
 *
 *       Apply H' to A(j:m,j+nb:N) from the left
 *
 *       The ws_T array used above is an input to dlarfb; it is 'T' in
 *       that routine, and LDT x K (translates here to LDWORK x NB).
 *       WORK is an LDWORK x NB workspace (not input or output).
 */
         ATL_larfb(CblasLeft, CblasTrans, LAForward, LAColumnStore,
                   M-j, N-j-nb, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb,
                   A+(j SHIFT)+((j+nb)SHIFT)*lda, lda, ws_larfb, N);
      }
   }
/*
 * Build Last panel.  buildT is passed as (0).
 */
   nb = minMN - n;                              /* remaining columns.         */
   ATL_assert(!ATL_geqrr(M-n, N-n, A+(n SHIFT)*(lda+1), lda, TAU+(n SHIFT),
                         ws_QR2, ws_T, nb, ws_larfb, 0));

   if (vp)
      free(vp);
   return(0);
} /* END ATL_dgeqrf */