Ejemplo n.º 1
0
int ATL_gelqf(ATL_CINT M, ATL_CINT N, TYPE  *A, ATL_CINT lda, TYPE *TAU,
               TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE gelqf( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
 * ATL_gelqf.c :
 * int ATL_gelqf(int M, int N, TYPE  *A, int LDA, TYPE  *TAU,
 *              TYPE *WORK, int LWORK)
 *
 *  Purpose
 *  =======
 *
 *  ATL_gelqf  computes an LQ factorization of a real/complex M-by-N matrix A:
 *  A = L * Q.
 *
 *  Compared to LAPACK, here, a recursive panel factorization is implemented.
 *  Refer to ATL_gelqr.c andd ATL_larft.c for details.
 *
 *  Arguments
 *  =========
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input/output) array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *          On exit, the elements on and above the diagonal of the array
 *          contain the min(M,N)-by-N upper trapezoidal matrix R (R is
 *          upper triangular if m >= n); the elements below the diagonal,
 *          with the array TAU, represent the orthogonal matrix Q as a
 *          product of min(m,n) elementary reflectors (see Further
 *          Details).
 *
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  TAU     (output) array, dimension (min(M,N))
 *          The scalar factors of the elementary reflectors (see Further
 *          Details).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.  LWORK >= max(1,N).
 *          For optimum performance LWORK >= N*NB, where NB is
 *          the optimal blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued .
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 *  Further Details
 *  ===============
 *
 *  The matrix Q is represented as a product of elementary reflectors
 *
 *     Q = H(1) H(2) . . . H(k), where k = min(m,n).
 *
 *  Each H(i) has the form
 *
 *     H(i) = I - tau * v * v'                  (For Real precision)
 *     H(i) = I - tau * v * conjugate(v)'       (For Complex precision)
 *
 *  where tau is a real/complex scalar, and v is a real/complex vector with
 *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
 *  and tau in TAU(i).
 *
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j;
   TYPE  *ws_LQ2,  *ws_T, *ws_larfb;        /* Workspace level 2, T, larfb.   */
   void *vp=NULL;

   /* For transpose function, may need type-appropriate 'ONE' for alpha. */
   #ifdef TREAL
      const TYPE ONE = ATL_rone;
   #else
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif
   TYPE *ws_CP=NULL, *ws_CPRaw=NULL;
   ATL_INT ldCP;

   #if defined(ATL_TUNING)
   /*-------------------------------------------------------------------------*/
   /* For tuning recursion crossover points, the blocking factor is set by    */
   /* la2xover, the tuning program for that purpose.                          */
   /*-------------------------------------------------------------------------*/
   if (ATL_PanelTune) nb=ATL_PanelTune; else
   #endif /* ATL_TUNING */

   nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LALeft+LALower, M, N,-1,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_gelq2
 */
   if (LWORK < 0)
   {
      *WORK = ( maxMN*nb + nb*nb + maxMN )  ;
      return(0);
   }
   else if (M < 1 || N < 1)  /* quick return if no work to do */
      return(0);

/*
 * LQ is the transpose of QR: We use this to go from row-major LQ to
 * col-major QR, typically faster. Here, if we are square and large,
 * we transpose the whole matrix in-place and then transpose it back.
 * This should be a tunable parameter; perhaps if the matrix fits in
 * L1 or L2? (Note by Tony C, short on time to conduct tuning).
 */
   if (M == N && N >= 128)
   {
      Mjoin(PATL,sqtrans)(N, A, lda);
      n = ATL_geqrf(M, N, A, lda, TAU, WORK, LWORK);
      Mjoin(PATL,sqtrans)(N, A, lda);

      /* Take the conjugate for Complex TAU. */
      #ifdef TCPLX
      ATL_INT i;
      for (i=1; i<(minMN<<1); i+=2)
         *(TAU+i) = 0.-*(TAU+i);          /* Negate imaginary part. */
      #endif
      return(n);
   }
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else if (LWORK < (maxMN*nb + nb*nb + maxMN))
   {
      vp = malloc(ATL_MulBySize(maxMN*nb + nb*nb + maxMN) + ATL_Cachelen);
      if (!vp)
         return(-7);
       WORK = ATL_AlignPtr(vp);
   }

/*
 * Assign workspace areas for ATL_larft, ATL_gelq2, ATL_larfb
 */
   ws_T = WORK;                         /* T at begining of work */
   ws_LQ2 = WORK +(nb SHIFT)*nb;        /* After T Work space             */
   ws_larfb = ws_LQ2 + (maxMN SHIFT);   /* After workspace for T and LQ2  */

/*
 * Leave one iteration to be done outside loop, so we don't build T
 * Any loop iterations are therefore known to be of size nb (no partial blocks)
 */
   n = (minMN / nb) * nb;
   if (n == minMN)
      n -= Mmin(nb, minMN);       /* when n is a multiple of nb, reduce by nb */
   #if !defined(ATL_USEPTHREADS)        /* If no PCA, try to copy up front. */
      j = M - n;
      j = Mmax(nb, j);
      ldCP = (N&7) ? (((N+7)>>3)<<3) : N;
      ws_CPRaw = malloc(ATL_MulBySize(ldCP)*j + ATL_Cachelen);
      if (ws_CPRaw) ws_CP=ATL_AlignPtr(ws_CPRaw);  /* Align if malloced. */
   #endif /* Serial Mode */


   for (j=0; j < n; j += nb)
   {
      #if !defined(ATL_USEPTHREADS) /* If no PCA it won't copy. Try it here. */
      /* If we got our copy workspace, transpose panel before recursion. */
      if (ws_CP)                             /* If workspace exists. */
      {
         int ci, cj;                         /* for conjugation.     */
         ldCP = N-j;
         if (ldCP&7)
            ldCP = ((ldCP+7)>>3)<<3;
         Mjoin(PATL,gemoveT)(N-j, nb, ONE, A+(j SHIFT)*(lda+1),
                             lda, ws_CP, ldCP);

         ATL_assert(!ATL_geqrr(N-j, nb, ws_CP, ldCP, TAU+(j SHIFT),
                               ws_LQ2, ws_T, nb, ws_larfb, 1));

         Mjoin(PATL,gemoveT)(nb, N-j, ONE, ws_CP, ldCP,
                             A+(j SHIFT)*(lda+1), lda);

         #if defined(TCPLX)               /* conj upTri T, TAU. */
         for (cj=0; cj<nb; cj++)          /* column loop... */
         {
            TAU[((j+cj) SHIFT)+1] = 0.-TAU[((j+cj) SHIFT)+1];
            for (ci=0; ci<=cj; ci++)      /* row loop... */
               ws_T[((ci+cj*nb) SHIFT)+1] = 0.-ws_T[((ci+cj*nb) SHIFT)+1];
         }
         #endif /* defined(TCPLX) */
      } else /* copy workspace was not allocated, use native. */
      #endif /* Serial Mode (No PCA) */
      {
         ATL_assert(!ATL_gelqr(nb, N-j,  A+(j SHIFT)*(lda+1), lda,
                               TAU+(j SHIFT), ws_LQ2, ws_T, nb, ws_larfb, 1));
      }

      if (j+nb < M)  /* if there are more cols left to bottom, update them */
      {
/*
 *       ======================================================================
 *       Form the triangular factor of the block reflector
 *       After gelqr, ws_T contains 'T', the nb x nb triangular factor 'T'
 *       of the block reflector. It is an output used in the next call, dlarfb.
 *          H = Id - Y'*T*Y, with Id=(N-j)x(N-j), Y=(N-j)xNB.
 *
 *       The ws_T array used above is an input to dlarfb; it is 'T' in
 *       that routine, and LDT x K (translates here to LDWORK x NB).
 *       WORK is an LDWORK x NB workspace (not input or output).
 *       ======================================================================
 */
         ATL_larfb(CblasRight, CblasNoTrans, LAForward, LARowStore,
                   M-j-nb, N-j, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb,
                   A+((j SHIFT)*(lda+1))+(nb SHIFT), lda, ws_larfb, M);
      }
   }
Ejemplo n.º 2
0
int ATL_geqrf(ATL_CINT M, ATL_CINT N, TYPE  *A, ATL_CINT lda, TYPE *TAU,
               TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE geqrf( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
 * ATL_geqrf.c :
 * int ATL_geqrf(int M, int N, TYPE  *A, int LDA, TYPE  *TAU,
 *              TYPE *WORK, int LWORK)
 *
 *  Purpose
 *  =======
 *
 *  ATL_geqrf  computes a QR factorization of a real/complex M-by-N matrix A:
 *  A = Q * R.
 *
 *  Compared to LAPACK, here, a recursive panel factorization is implemented.
 *  Refer to ATL_geqrr.c andd ATL_larft.c for details.
 *
 *  Arguments
 *  =========
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input/output) array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *          On exit, the elements on and above the diagonal of the array
 *          contain the min(M,N)-by-N upper trapezoidal matrix R (R is
 *          upper triangular if m >= n); the elements below the diagonal,
 *          with the array TAU, represent the orthogonal matrix Q as a
 *          product of min(m,n) elementary reflectors (see Further
 *          Details).
 *
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  TAU     (output) array, dimension (min(M,N))
 *          The scalar factors of the elementary reflectors (see Further
 *          Details).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.  LWORK >= max(1,N).
 *          For optimum performance LWORK >= N*NB, where NB is
 *          the optimal blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued .
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 *  Further Details
 *  ===============
 *
 *  The matrix Q is represented as a product of elementary reflectors
 *
 *     Q = H(1) H(2) . . . H(k), where k = min(m,n).
 *
 *  Each H(i) has the form
 *
 *     H(i) = I - tau * v * v'                  (For Real precision)
 *     H(i) = I - tau * v * conjugate(v)'       (For Complex precision)
 *
 *  where tau is a real/complex scalar, and v is a real/complex vector with
 *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
 *  and tau in TAU(i).
 *
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j;
   TYPE  *ws_QR2,  *ws_T, *ws_larfb;        /* Workspace level 2, T, larfb.   */
   void *vp=NULL;

   #if defined(ATL_TUNING)
   /*-------------------------------------------------------------------------*/
   /* For tuning recursion crossover points, the blocking factor is set by    */
   /* la2xover, the tuning program for that purpose.                          */
   /*-------------------------------------------------------------------------*/
   if (ATL_PanelTune) nb=ATL_PanelTune; else
   #endif /* ATL_TUNING */

   nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LARight+LAUpper, M, N,-1,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      *WORK = ( N*nb + nb*nb + maxMN )  ;
      return(0);
   }
   else if (M < 1 || N < 1)                 /* quick return if no work to do  */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else if (LWORK < (N*nb + nb*nb + maxMN))
   {
      vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
      if (!vp)
         return(-7);
      WORK = ATL_AlignPtr(vp);
   }

/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 * RCW Q: Why can't LARFB & GEQR2 workspaces be overlapped?
 */
   ws_T = WORK;                             /* T at begining of work          */
   ws_QR2 = WORK +(nb SHIFT)*nb;            /* After T Work space             */
   ws_larfb = ws_QR2 + (maxMN SHIFT);       /* After workspace for T and QR2  */

/*
 * Leave one iteration to be done outside loop, so we don't build T
 * Any loop iterations are therefore known to be of size nb (no partial blocks)
 */
   n = (minMN / nb) * nb;
   if (n == minMN)
      n -= Mmin(nb, minMN);

   for (j=0; j < n; j += nb)
   {
      ATL_assert(!ATL_geqrr(M-j, nb, A+(j SHIFT)*(lda+1), lda,
                            TAU+(j SHIFT), ws_QR2, ws_T, nb, ws_larfb, 1));

      if (j+nb < N)     /* if there are more cols left to right, update them  */
      {
/*
 *       Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 *       After geqrr, ws_T contains 'T', the nb x nb triangular factor 'T'
 *       of the block reflector. It is an output used in the next call, dlarfb.
 *          H = Id - Y*T*Y', with Id=(M-j)x(M-j), Y=(M-j)xNB.
 *
 *       Apply H' to A(j:m,j+nb:N) from the left
 *
 *       The ws_T array used above is an input to dlarfb; it is 'T' in
 *       that routine, and LDT x K (translates here to LDWORK x NB).
 *       WORK is an LDWORK x NB workspace (not input or output).
 */
         ATL_larfb(CblasLeft, CblasTrans, LAForward, LAColumnStore,
                   M-j, N-j-nb, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb,
                   A+(j SHIFT)+((j+nb)SHIFT)*lda, lda, ws_larfb, N);
      }
   }
/*
 * Build Last panel.  buildT is passed as (0).
 */
   nb = minMN - n;                              /* remaining columns.         */
   ATL_assert(!ATL_geqrr(M-n, N-n, A+(n SHIFT)*(lda+1), lda, TAU+(n SHIFT),
                         ws_QR2, ws_T, nb, ws_larfb, 0));

   if (vp)
      free(vp);
   return(0);
} /* END ATL_dgeqrf */