Esempio n. 1
0
void Mjoin( PATLF77WRAP, hemm )
(
   F77_INTEGER                * ISIDE,
   F77_INTEGER                * IUPLO,
   F77_INTEGER                * M,
   F77_INTEGER                * N,
   TYPE                       * ALPHA,
   TYPE                       * A,
   F77_INTEGER                * LDA,
   TYPE                       * B,
   F77_INTEGER                * LDB,
   TYPE                       * BETA,
   TYPE                       * C,
   F77_INTEGER                * LDC
)
{
/*
 * Purpose
 * =======
 *
 * ATL_F77wrap_hemm performs one of the matrix-matrix operations
 *
 *    C := alpha * A * B + beta * C,
 *
 * or
 *
 *    C := alpha * B * A + beta * C,
 *
 * where alpha and beta are scalars,  A is a Hermitian matrix and B and
 * C are m by n matrices.
 *
 * Notes
 * =====
 *
 * This routine is an internal wrapper function written in  C  called by
 * the corresponding Fortran 77 user callable subroutine.  It calls  the
 * appropriate ATLAS routine performing the actual computation.
 *
 * This wrapper layer resolves the following portability issues:
 *
 *    - the routines' name sheme translation imposed by the  Fortran / C
 *      compilers of your target computer,
 *    - the translation of Fortran characters into the ATLAS  correspon-
 *      ding C enumerated type (in cooperation with the Fortan user cal-
 *      lable subroutine),
 *    - the translation of Fortran integers into the proper C correspon-
 *      ding native type;
 *
 * and the following ease-of-programming issue:
 *
 *    - a pointer to the the first entry of vector operands (when appli-
 *      cable) is passed to the  ATLAS computational routine even if the
 *      corresponding input increment value is negative. This allows for
 *      a more natural expression in  C  of the computation performed by
 *      these ATLAS functions.
 *
 * ---------------------------------------------------------------------
 */
/* ..
 * .. Executable Statements ..
 *
 */
   Mjoin( PATL, hemm )( *ISIDE, *IUPLO, *M, *N, SVVAL ALPHA, A, *LDA,
                        B, *LDB, SVVAL BETA, C, *LDC );
/*
 * End of Mjoin( PATLF77WRAP, hemm )
 */
}
Esempio n. 2
0
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M0, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
/*
 * Outer three loops for matmul with outer loop over columns of B
 */
{
   int M = M0;
   int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n, incA, incB, incC;
   int AlphaIsOne;
   const int incK = ATL_MulByNB(K);
   void *vB=NULL, *vC=NULL;
   TYPE *pA, *pB, *pC;
   const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0};
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

   pC = C;
   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      NBmm0 = Mjoin(PATL,CNBmm_b1);
      gescal = Mjoin(PATL,gescal_bX);
   }
/*
 * Special case for when what we are really doing is
 *    C <- beta*C + alpha * A * A'   or   C <- beta*C + alpha * A' * A
 */
   if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB)
        && TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb )
   {
      AlphaIsOne = SCALAR_IS_ONE(alpha);
      i = ATL_MulBySize(M * K);
      if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta))
         i += ATL_MulBySize(M*N);
      if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen);
      if (vB)
      {
         pA = ATL_AlignPtr(vB);
         if (TA == AtlasNoTrans)
            Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
         else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
/*
 *       Can't write directly to C if alpha is not one
 */
         if (!AlphaIsOne)
         {
            if (SCALAR_IS_ZERO(beta)) h = ldc;
            else if (pC == C)
            {
               pC = pA + (M * K SHIFT);
               h = M;
            }
            else h = NB;
            Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL,
                               ldb, pA, 0, NULL, zero, pC, h,
                               Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0));

            if (alpha[1] == ATL_rzero)
               Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h);
            else Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h);

            if (C != pC)
            {
               if (beta[1] == ATL_rzero)
               {
                  if (*beta == ATL_rone)
                     Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta);
                  else if (*beta == ATL_rnone)
                     Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta);
                  else if (*beta == ATL_rzero)
                     Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta);
                  else Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta);
               }
               else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta);
            }
         }
         else Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL,
                                 ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0);
         free(vB);
         if (vC) free(vC);
         return(0);
      }
   }
   i = ATL_Cachelen + ATL_MulBySize(M*K + incK);
   if (i <= ATL_MaxMalloc) vB = malloc(i);
   if (!vB)
   {
      if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1);
      if (ib) n = nMb + 1;
      else n = nMb;
      for (j=2; !vB; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vB = malloc(h);
      }
      if (!vB) return(-1);
      n = k;
      m = ATL_MulByNB(n);
      ib2 = 0;
   }
   else
   {
      n = nMb;
      m = M;
      ib2 = ib;
   }
   pB = ATL_AlignPtr(vB);
   if (TA == AtlasNoTrans)
   {
      incA = m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1);
         else A2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else A2blk = Mjoin(PATL,row2blkT2_aX);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = lda*m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1);
         else A2blk = Mjoin(PATL,col2blkConj2_aXi0);
      }
      else A2blk = Mjoin(PATL,col2blkConj2_aX);
   }
   else
   {
      incA = lda*m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1);
         else A2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else A2blk = Mjoin(PATL,col2blk2_aX);
   }
   if (TB == AtlasNoTrans)
   {
      incB = ATL_MulByNB(ldb) SHIFT;
      B2blk = Mjoin(PATL,col2blk_a1);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = NB2;
      B2blk = Mjoin(PATL,row2blkC_a1);
   }
   else
   {
      incB = NB2;
      B2blk = Mjoin(PATL,row2blkT_a1);
   }
   incC = m SHIFT;

   pA = pB + (incK SHIFT);
   do
   {
      if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha);
      else A2blk(K, m, A, lda, pA, alpha);
      Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB,
                         incB, B2blk, beta, C, ldc, gescal, NBmm0);
      M -= m;
      nMb -= n;
      if (M <= m)
      {
         ib2 = ib;
         m = M;
         n = nMb;
      }
      C += incC;
      A += incA;
   }
   while (M);
   free(vB);
   if (vC) free(vC);
   return(0);
}
Esempio n. 3
0
void Mjoin(PATL,mmJIK2)
             (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
              const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb,
              TYPE *pB0, int incB, MAT2BLK B2blk, const SCALAR beta,
              TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0)
{
   const int incK = ATL_MulByNB(K)SHIFT, incC = ATL_MulByNB(ldc-nMb) SHIFT;
   const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta));
   int i, j = nNb;
   const TYPE *pA=pA0;
   const TYPE rbeta = ( (gescal) ? ATL_rone : *beta );
   TYPE *pB=pB0, *stB=pB0+(ATL_MulByNBNB(nKb)SHIFT);

   if (nNb)
   {
      do  /* Loop over full column panels of B */
      {
         if (B)
         {
            B2blk(K, NB, B, ldb, pB, alpha);
            B += incB;
         }
         if (nMb)
         {
            i = nMb;
            do /* loop over full row panels of A */
            {
               if (gescal) gescal(NB, NB, beta, C, ldc);
               if (nKb) /* loop over full blocks in panels */
               {
                  NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc);
                  pA += NBNB2;
                  pB += NBNB2;
                  if (nKb != 1)
                  {
                     do
                     {
                        NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone,
                                C, ldc);
                        pA += NBNB2;
                        pB += NBNB2;
                     }
                     while (pB != stB);
                  }
                  if (kb)
                  {
                     KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone,
                          C, ldc);
                     pA += ATL_MulByNB(kb)<<1;
                  }
               }
               else if (kb)
               {
                  if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc);
                  KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc);
                  pA += ATL_MulByNB(kb)<<1;
               }
               pB = pB0;
               C += NB2;
            }
            while (--i);
         }
         if (ib)
         {
            if (gescal) gescal(ib, NB, beta, C, ldc);
            IBNBmm(ib, K, pA, pB, rbeta, C, ldc);
         }
         if (!B)
         {
            pB0 += incK;
            pB = pB0;
            stB += incK;
         }
         C += incC;
         pA = pA0;
      }
      while (--j);
   }
   if (jb)
   {
      if (B) B2blk(K, jb, B, ldb, pB, alpha);
      for (i=nMb; i; i--)
      {
         if (gescal) gescal(NB, jb, beta, C, ldc);
         NBJBmm(jb, K, pA, pB, rbeta, C, ldc);
         pA += incK;
         C += NB2;
      }
      if (ib)
      {
         if (gescal) gescal(ib, jb, beta, C, ldc);
         IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc);
      }
   }
}
Esempio n. 4
0
static TYPE trtritest(enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo,
                      enum ATLAS_DIAG Diag, int CacheSize, int N, int lda,
		      double *tim)
{
   TYPE *A, *Acompare;
   int i;
   double t0, t1;
   TYPE normA, eps, resid;
   /*int ierr;*/

   #ifdef TCPLX
      const TYPE one[2]={ATL_rone, ATL_rzero};
   #else
      const TYPE one = ATL_rone;
   #endif


   eps = Mjoin(PATL,epsilon)();
   A = malloc(ATL_MulBySize(lda)*N);
   Acompare = malloc(ATL_MulBySize(lda)*N);
   if (A == NULL) return(-1);
   if (Acompare == NULL) return(-1);
   t0 = ATL_flushcache(CacheSize);

   /* create random, diagonally dominant matrix with magic value at
      unused places. Last number is just the random seed. */
   trigen(Order, Uplo, Diag, N, A, lda, PADVAL, N*1029+lda);

   /* Create backup to calculate residual. This one has to be used
      as a full matrix, so it has zero fills and correct diagonal. */
   trigen(Order, Uplo, Diag, N, Acompare, lda, ATL_rzero, N*1029+lda);
   if (Diag==AtlasUnit)
     for (i=0; i < N; i++)
       Acompare[(i*(lda+1)) SHIFT] = ATL_rone;


   normA = trinrm1(Order,Uplo, Diag, N, A, lda);
#ifdef DEBUG
   Mjoin(PATL,geprint)("A0", N, N, A, lda);
#endif

   t0 = ATL_flushcache(-1);

   /* Calculate and time a solution */
   t0 = time00();
   test_trtri(Order, Uplo, Diag, N, A, lda);
   t1 = time00() - t0;
   *tim = t1;

/*   if (ierr != 0)
   {
     fprintf(stderr, "Return values != 0 : %d \n",ierr);
     return(9999.9999);
   }*/


   t0 = ATL_flushcache(0);

   /* Instroduce a padding error. */
   /* A[(5+5*lda)SHIFT]=114.0; */

#ifdef DEBUG
   Mjoin(PATL,geprint)("L", N, N, A, lda);
#endif
   ATL_checkpad(Order, Uplo, Diag, N, A, lda);

   /* Calculate A^{-1}*A */
   cblas_trmm(Order,CblasLeft,Uplo,AtlasNoTrans,Diag,
		    N,N,one,A,lda,Acompare,lda);

#ifdef DEBUG
     Mjoin(PATL,geprint)("A^{-1}*A", N, N, Acompare, N);
#endif

   /* Subtract diagonal */
   for (i=0; i < N; i++)
     Acompare[i*((lda+1) SHIFT)] -= ATL_rone;

/*
   resid = trinrm1(Order, Uplo,AtlasNonUnit,N,Acompare,lda);
   fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid);
*/

   resid = Mjoin(PATL,genrm1)(N, N, Acompare, lda);


#ifdef DEBUG
   if (resid/(normA*eps*N) > 10.0)
     fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid);
#endif
   resid /= (normA * eps * N);

   free(Acompare);
   free(A);

   return(resid);
}
Esempio n. 5
0
void Mjoin( PATL, trmv )
(
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const enum ATLAS_DIAG      DIAG,
   const int                  N,
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * X,
   const int                  INCX
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, trmv ) performs one of the matrix-vector operations
 *
 *    x := A * x,   or   x := conjg( A  ) * x,   or
 *
 *    x := A'* x,   or   x := conjg( A' ) * x,
 *
 * where x is an n-element vector and  A is an n by n unit, or non-unit,
 * upper or lower triangular matrix.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       * vx = NULL;
   TYPE                       * x;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( INCX == 1 ) { x = X; }
   else
   {
      vx = (TYPE *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, copy )( N, X, INCX, x, 1 );
   }

#ifdef TREAL
   if( ( TRANS == AtlasNoTrans ) || ( TRANS == AtlasConj ) )
#else
   if( TRANS == AtlasNoTrans )
#endif
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUN )( DIAG, N,    A, LDA, x );
      else                     Mjoin( PATL, trmvLN )( DIAG, N,    A, LDA, x );
   }
#ifdef TCPLX
   else if( TRANS == AtlasConj )
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUC )( DIAG, N,    A, LDA, x );
      else                     Mjoin( PATL, trmvLC )( DIAG, N,    A, LDA, x );
   }
#endif
#ifdef TREAL
   else
#else
   else if( TRANS == AtlasTrans )
#endif
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUT )( DIAG, N,    A, LDA, x );
      else                     Mjoin( PATL, trmvLT )( DIAG, N,    A, LDA, x );
   }
#ifdef TCPLX
   else
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUH )( DIAG, N,    A, LDA, x );
      else                     Mjoin( PATL, trmvLH )( DIAG, N,    A, LDA, x );
   }
#endif
   if( vx ) { Mjoin( PATL, copy )( N, x, 1, X, INCX ); free( vx ); }
/*
 * End of Mjoin( PATL, trmv )
 */
}
Esempio n. 6
0
void Mjoin(PATL,ttrmm)(const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo,
                       const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag,
                       ATL_CINT M, ATL_CINT N, const SCALAR alpha,
                       const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb)
{
   ATL_TTRSM_t trsms[ATL_NTHREADS];
   TYPE *b;
   ATL_INT n, nblks, minblks;
   double tblks;
   int nr, p, i, j, extrablks;
   static int nb=0;

   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))
   {
      Mjoin(PATL,gezero)(M, N, B, ldb);
      return;
   }
/*
 * Distribute RHS over the processors
 */
   if (!nb) nb = Mjoin(PATL,GetNB)();
   if (side == AtlasLeft)
   {
      nblks = N/nb;
      nr = N - nblks*nb;
      tblks = ((double)(M*N)) / ( (double)nb * nb );
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = M;
         trsms[i].N = n;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n *= (ldb << Mjoin(PATL,shift));
         b = MindxT(b, n);
      }
   }
   else /* Side == AtlasRight */
   {
      nblks = M/nb;
      nr = M - nblks*nb;
      tblks = (N/nb)*nblks;
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = n;
         trsms[i].N = N;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n <<= Mjoin(PATL,shift);
         b = MindxT(b, n);
      }
   }
   if (p < 2)
   {
      Mjoin(PATL,trmm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb);
      return;
   }
   for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
      trsms[i].B = NULL;
   ATL_goparallel(p, Mjoin(PATL,DoWorkTRMM), trsms, NULL);
}
Esempio n. 7
0
void Mjoin( PATLF77WRAP, ger )
(
   F77_INTEGER                * M,
   F77_INTEGER                * N,
   TYPE                       * ALPHA,
   TYPE                       * X,
   F77_INTEGER                * INCX,
   TYPE                       * Y,
   F77_INTEGER                * INCY,
   TYPE                       * A,
   F77_INTEGER                * LDA
)
{
/*
 * Purpose
 * =======
 *
 * ATL_F77wrap_ger performs the rank 1 operation
 *
 *    A := alpha * x * y' + A,
 *
 * where alpha is a scalar,  x is an m-element vector, y is an n-element
 * vector and A is an m by n matrix.
 *
 * Notes
 * =====
 *
 * This routine is an internal wrapper function written in  C  called by
 * the corresponding Fortran 77 user callable subroutine.  It calls  the
 * appropriate ATLAS routine performing the actual computation.
 *
 * This wrapper layer resolves the following portability issues:
 *
 *    - the routines' name sheme translation imposed by the  Fortran / C
 *      compilers of your target computer,
 *    - the translation of Fortran characters into the ATLAS  correspon-
 *      ding C enumerated type (in cooperation with the Fortan user cal-
 *      lable subroutine),
 *    - the translation of Fortran integers into the proper C correspon-
 *      ding native type;
 *
 * and the following ease-of-programming issue:
 *
 *    - a pointer to the the first entry of vector operands (when appli-
 *      cable) is passed to the  ATLAS computational routine even if the
 *      corresponding input increment value is negative. This allows for
 *      a more natural expression in  C  of the computation performed by
 *      these ATLAS functions.
 *
 * ---------------------------------------------------------------------
 */
/* ..
 * .. Executable Statements ..
 *
 */
   Mjoin( PATL, ger )( *M, *N, SVVAL ALPHA, W1N( M, X, INCX ), *INCX,
                       W1N( N, Y, INCY ), *INCY, A, *LDA );
/*
 * End of Mjoin( PATLF77WRAP, ger )
 */
}
Esempio n. 8
0
void Mjoin( PATLF77WRAP, gbmv )
(
   F77_INTEGER                * ITRANS,
   F77_INTEGER                * M,
   F77_INTEGER                * N,
   F77_INTEGER                * KL,
   F77_INTEGER                * KU,
   TYPE                       * ALPHA,
   TYPE                       * A,
   F77_INTEGER                * LDA,
   TYPE                       * X,
   F77_INTEGER                * INCX,
   TYPE                       * BETA,
   TYPE                       * Y,
   F77_INTEGER                * INCY
)
{
/*
 * Purpose
 * =======
 *
 * ATL_F77wrap_gbmv performs one of the matrix-vector operations
 *
 *    y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
 *
 *    y := alpha*conjg( A' )*x + beta*y,
 *
 * where  alpha and beta are scalars, x and y are vectors and  A is an m
 * by n band matrix, with kl sub-diagonals and ku super-diagonals.
 *
 * Notes
 * =====
 *
 * This routine is an internal wrapper function written in  C  called by
 * the corresponding Fortran 77 user callable subroutine.  It calls  the
 * appropriate ATLAS routine performing the actual computation.
 *
 * This wrapper layer resolves the following portability issues:
 *
 *    - the routines' name sheme translation imposed by the  Fortran / C
 *      compilers of your target computer,
 *    - the translation of Fortran characters into the ATLAS  correspon-
 *      ding C enumerated type (in cooperation with the Fortan user cal-
 *      lable subroutine),
 *    - the translation of Fortran integers into the proper C correspon-
 *      ding native type;
 *
 * and the following ease-of-programming issue:
 *
 *    - a pointer to the the first entry of vector operands (when appli-
 *      cable) is passed to the  ATLAS computational routine even if the
 *      corresponding input increment value is negative. This allows for
 *      a more natural expression in  C  of the computation performed by
 *      these ATLAS functions.
 *
 * ---------------------------------------------------------------------
 */
/* ..
 * .. Executable Statements ..
 *
 */
   if( (*ITRANS) == F77_INOTRAN )
   {
      Mjoin( PATL, gbmv )( *ITRANS, *M, *N, *KL, *KU, SVVAL ALPHA, A, *LDA,
                           W1N( N, X, INCX ), *INCX, SVVAL BETA, W1N( M, Y,
                           INCY ), *INCY);
   }
   else
   {
      Mjoin( PATL, gbmv )( *ITRANS, *N, *M, *KL, *KU, SVVAL ALPHA, A, *LDA,
                           W1N( M, X, INCX ), *INCX, SVVAL BETA, W1N( N, Y,
                           INCY ), *INCY);
   }
/*
 * End of Mjoin( PATLF77WRAP, gbmv )
 */
}
Esempio n. 9
0
/*
 * This is special-case code that handles rank-2 update by calling GER2
 */
int Mjoin(PATL,ammm_rk2)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CSZT lda,
   const TYPE *B,
   ATL_CSZT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CSZT ldc
)
{
   #ifdef DREAL
      const int MB=512, NB=32;
   #else /* SREAL */
      const int MB=512, NB=64;
   #endif
   void *vp;
   TYPE *x, *y, *w, *z;
   size_t j;
   ATL_CSZT incC = NB*ldc;

/*
 * If beta is one, can handle by one call to ger2
 */
   if (SCALAR_IS_ONE(beta))
   {
      if (TA == AtlasNoTrans)
      {
         if (TB == AtlasNoTrans)
            Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, ldb, alpha, A+lda, 1,
                             B+1, ldb, C, ldc);
         else
            Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, 1, alpha, A+lda, 1,
                             B+ldb, 1, C, ldc);
      }
      else if (TB == AtlasNoTrans)
         Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, ldb, alpha, A+1, lda,
                          B+1, ldb, C, ldc);
      else
         Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, 1, alpha, A+1, lda,
                          B+ldb, 1, C, ldc);
      return(0);
   }
/*
 * Later on, do smart think like copy only MB/NB at a time, and don't copy
 * at all if vectors are contiguous, but right now, always do copy up-front
 * so loop does not have to worry about TA/TB; this is a O(N) cost in N^2 alg
 */
   vp = malloc(2*ATL_MulBySize(M+N)+4*ATL_Cachelen);
   if (!vp)
      return(1);
   x = ATL_AlignPtr(vp);
   y = x + M;
   y = ATL_AlignPtr(y);
   w = y + N;
   w = ATL_AlignPtr(w);
   z = w + M;
   z = ATL_AlignPtr(z);
   if (TA == AtlasNoTrans)
   {
      Mjoin(PATL,copy)(M, A, 1, x, 1);
      Mjoin(PATL,copy)(M, A+lda, 1, w, 1);
   }
   else
   {
      Mjoin(PATL,copy)(M, A, lda, x, 1);
      Mjoin(PATL,copy)(M, A+1, lda, w, 1);
   }
   if (SCALAR_IS_ONE(alpha))
   {
      if (TB == AtlasNoTrans)
      {
         Mjoin(PATL,copy)(N, B, ldb, y, 1);
         Mjoin(PATL,copy)(N, B+1, ldb, z, 1);
      }
      else
      {
         Mjoin(PATL,copy)(N, B, 1, y, 1);
         Mjoin(PATL,copy)(N, B+ldb, 1, z, 1);
      }
   }
   else
   {
      if (TB == AtlasNoTrans)
      {
         Mjoin(PATL,cpsc)(N, alpha, B, ldb, y, 1);
         Mjoin(PATL,cpsc)(N, alpha, B+1, ldb, z, 1);
      }
      else
      {
         Mjoin(PATL,cpsc)(N, alpha, B, 1, y, 1);
         Mjoin(PATL,cpsc)(N, alpha, B+ldb, 1, z, 1);
      }
   }
   for (j=0; j < N; j += NB, C += incC)
   {
      size_t i, nb = N-j;
      nb = (nb >= NB) ? NB : nb;
      for (i=0; i < M; i += MB)
      {
         size_t mb = M-i;
         mb = (mb >= MB) ? MB : mb;
         Mjoin(PATL,gescal)(mb, nb, beta, C+i, ldc);
         Mjoin(PATL,ger2)(mb, nb, ATL_rone, x+i, 1, y+j, 1, ATL_rone,
                          w+i, 1, z+j, 1, C+i, ldc);
      }
   }
   free(vp);
   return(0);
}
Esempio n. 10
0
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * Y = alpha*conj(A)*X + beta*Y
 * For Conjugate transpose, first form x = conj(X), y = A^T * conj(X),
 * then use axpbyConj to add this to original Y in the operation
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is
 * Y = beta*Y + alpha * A^H * X.
 */
{
   ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);

   if (M < 1 || N < 1)          /* F77 BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha0))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta0))
      {
         if (SCALAR_IS_ZERO(beta0))
            Mjoin(PATL,zero)(N, Y, incY);
         else
            Mjoin(PATL,scal)(N, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's MVT kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Get mvtk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu,
                            &minM, &minN, &alignX, &ALIGNX2A, &alignY,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requres N to be a multiple if NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
   vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen);
/*
 * If we cannot allocate enough space to copy the vectors, give up and
 * call the simple loop-based implementation
 */
   if (!vp)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   y = ATL_AlignPtr(vp);
   x = y + (N SHIFT);
   x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   beta = Zero;
/*
 * In this step, we form y = A^T * conj(X)
 */
   mvtk = mvtk_b0;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      Mjoin(PATL,copyConj)(imb, X, incX, x, 1);  /* x = conj(X) */
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvtk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x, 1, beta, y+(Nm SHIFT), 1);
      beta = one;
      mvtk = mvtk_b1;
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);

/*
 * Given y = A^T * conj(X) from above, now do:
 *    Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * x), which is
 *    Y = beta*Y + alpha * A^H * x.
 */
   Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY);
   free(vp);
}
Esempio n. 11
0
/*
 * This routine handles N <= MAXN, K & M large (left-looking shape)
 */
int Mjoin(PATL,ammm_tN)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   ATL_CSZT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CSZT lda,
   const TYPE *B,
   ATL_CSZT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CSZT ldc
)
{
   ablk2cmat_t blk2c;
   cm2am_t a2blk, b2blk;
   ammkern_t ammK0_b0, ammK0_b1, ammK0_bn, amm_b1, amm_bn;
   amminfo_t mminfo;
   const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   const TYPE *alpA=ONE, *alpB=ONE, *alpC=ONE;
   TYPE *rA, *iA, *pB0, *rC, *iC;
   int mu, nu, ku, nnu, NN, MB, NMU, KB, KB0, kb0, incBw, incBw0;
   size_t incAk0, incAk, mulAm, incBk0, incBk, nkb, k, nmblks, nsmblks, i, m;
   void *vp;

   ATL_assert(N <= ATL_AMM_MAXNB);
   mu = Mjoin(PATL,GetAmmmInfo)(&mminfo, TA, TB, M, N, K, alpha, beta);
   if (!mu)
      alpA = alpha;
   else if (mu == 1)
      alpB = alpha;
   else
      alpC = alpha;

   mu = mminfo.mu;
   nu = mminfo.nu;
   ku = mminfo.ku;
   MB = ATL_ComputeB(M, mu, MY_MAXMB, &nsmblks, &nmblks);
   NMU = MB / mu;
   nnu = (N+nu-1)/nu;
   KB = mminfo.kb;
   NN = nnu * nu;
   nkb = K/KB;
/*
 * kb0: K remainder, KB0 is CEIL(kb0/ku)*ku for k-vector kerns, and
 * same as kb0 for M-vector kerns
 */
   KB0 = kb0 = K - nkb*KB;
   if (!kb0)
   {
      KB0 = kb0 = KB;
      nkb--;
   }
   #if ATL_AMM_MAXKMAJ > 1
      if (ATL_AMMFLG_KMAJOR(mminfo.flag))
      {
         KB0 = ((kb0+ku-1)/ku)*ku;
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag))
            ammK0_b0 = mminfo.amm_b0;
         else
            ammK0_b0 = (mminfo.kb==KB0) ? mminfo.amm_b0 : mminfo.amm_k1_b0;
      }
      else
   #endif
   {
      ammK0_b0 = (kb0 == KB) ? mminfo.amm_b0 : mminfo.amm_k1_b0;
      if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && (kb0/ku)*ku == kb0 &&
          kb0 > mminfo.kbmin)
         ammK0_b0 = mminfo.amm_b0;
   }
   amm_b1 = mminfo.amm_b1;
   amm_bn = mminfo.amm_bn;
   if (ammK0_b0 == mminfo.amm_b0)
   {
      ammK0_b1 = mminfo.amm_b1;
      ammK0_bn = mminfo.amm_bn;
   }
   else
   {
      ammK0_b1 = mminfo.amm_k1_b1;
      ammK0_bn = mminfo.amm_k1_bn;
   }
   a2blk = mminfo.a2blk;
   b2blk = mminfo.b2blk;
   blk2c = mminfo.Cblk2cm;

/*
 * Do memory allocation, setup pointers
 */
   {
      const int szA = MB*KB, szC=MB*NN;
      size_t szB = (nkb*KB+KB0)*NN;
      const size_t tsz = ATL_MulBySize(szA+szB+szC+mu*nu*ku) + 3*ATL_Cachelen;
      if (tsz > ATL_MaxMalloc)
         return(2);
      vp = malloc(tsz);
      if (!vp)
         return(1);
      iA = ATL_AlignPtr(vp);
      rA = iA + szA;
      iC = rA + szA;
      iC = ATL_AlignPtr(iC);
      rC = iC + szC;
      pB0 = rC + szC;
      pB0 = ATL_AlignPtr(pB0);
   }
   if (TA == AtlasNoTrans)
   {
      incAk = (lda*KB)SHIFT;
      incAk0 = (lda*kb0)SHIFT;
      mulAm = 2;
   }
   else
   {
      incAk = KB SHIFT;
      incAk0 = kb0 SHIFT;
      mulAm = lda SHIFT;
   }
   if (TB == AtlasNoTrans)
   {
      incBk = KB SHIFT;
      incBk0 = kb0 SHIFT;
   }
   else
   {
      incBk = (KB*ldb)SHIFT;
      incBk0 = (kb0*ldb)SHIFT;
   }
   incBw0 = KB0*NN;
   incBw = KB*NN;
   for (m=M,i=0; i < nmblks; i++)
   {
      const TYPE *An;
      TYPE *iB=pB0, *rB=pB0+incBw0, *pBn=rB+incBw0;
      int mb, nmu, mm;

      if (i < nsmblks)
      {
         mb = MB-mu;
         nmu = NMU-1;
      }
      else
      {
         mb = MB;
         nmu = NMU;
      }
      mm = Mmin(m, mb);
      m -= mm;
      An = A + mm*mulAm;
/*
 *    Do first (possibly partial) K-block
 */
      a2blk(kb0, mm, alpA, A, lda, rA, iA);
      A += incAk0;
      if (!i)
      {
         b2blk(kb0, N, alpB, B, ldb, rB, iB);
         B += incBk0;
      }
      rB = iB + incBw0;
      pBn = rB + incBw0;
      ammK0_b0(nmu, nnu, KB0, iA, iB, rC, rA, iB, iC);
      ammK0_b0(nmu, nnu, KB0, rA, iB, iC, rA, rB, rC);
      ammK0_bn(nmu, nnu, KB0, rA, rB, rC, iA, rB, iC);
      ammK0_b1(nmu, nnu, KB0, iA, rB, iC, rA, pBn, iC);
      iB = pBn;
/*
 *    Loop over all full-sized blocks
 */
      for (k=0; k < nkb; k++)
      {
         a2blk(KB, mm, alpA, A, lda, rA, iA);
         A += incAk;
         rB = iB + incBw;
         if (!i)
         {
            b2blk(KB, N, alpB, B, ldb, rB, iB);
            B += incBk;
         }
         pBn = (k < nkb-1) ? rB+incBw : pB0;
         amm_bn(nmu, nnu, KB, iA, iB, rC, rA, iB, iC);
         amm_b1(nmu, nnu, KB, rA, iB, iC, rA, rB, rC);
         amm_bn(nmu, nnu, KB, rA, rB, rC, iA, rB, iC);
         amm_b1(nmu, nnu, KB, iA, rB, iC, rA, pBn, iC);
         iB = pBn;
      }
      blk2c(mm, N, alpC, rC, iC, beta, C, ldc);
      C += mm+mm;
      A = An;
   }

   free(vp);
   return(0);
}
Esempio n. 12
0
void Mjoin( PATL, tpmvUC )
(
   const enum ATLAS_DIAG      DIAG,
   const int                  N,         /* N > 0 assumed */
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * X
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, tpmvUC ) performs the following matrix-vector operation
 *
 *    x := conjg( A  ) * x,
 *
 * where x is an n-element vector and  A is an n by n unit or  non-unit,
 * upper triangular matrix supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
#define    one                ATL_rone
#else
   TYPE                       one[2] = { ATL_rone, ATL_rzero };
#endif
#ifdef ATL_AXPYMV
   TYPE                       * A0, * x0;
   int                        incX, lda, lda0, mb, mb1, n, nb;
#else
   TYPE                       * x0;
   int                        incX, lda,       m, mb, nb, nb1;
#endif
   void                       (*tpmv0)( const int, const TYPE *, const int,
                              TYPE * );
#define    gpmv0              Mjoin( PATL, gpmvUNc_a1_x1_b1_y1  )
/* ..
 * .. Executable Statements ..
 *
 */
   ATL_GetPartMVN( A, LDA, &mb, &nb );

   if( DIAG == AtlasNonUnit ) tpmv0 = Mjoin( PATL, tpmvUCN );
   else                       tpmv0 = Mjoin( PATL, tpmvUCU );

#ifdef ATL_AXPYMV
   mb1 = N - ( ( N - 1 ) / mb ) * mb; incX = (mb SHIFT);
   lda = lda0 = LDA; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );

   for( n  = N - mb, x0 = X + incX; n > 0; n -= mb, X += incX, x0 += incX )
   {
      tpmv0( mb, A, lda, X );
      gpmv0( mb, n, one, A0 - incX, lda0, x0, 1, one, X, 1 );
      lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
   }
   tpmv0( mb1, A, lda, X );
#else
   nb1 = N - ( ( N - 1 ) / nb ) * nb; incX = (nb SHIFT); x0 = X; lda = LDA;
   tpmv0( nb1, A, lda, X ); X += (nb1 SHIFT); MUpnext( nb1, A, lda );

   for( m = nb1; m < N; m += nb, X += incX )
   {
      gpmv0( m, nb, one, A - (m SHIFT), lda, X, 1, one, x0, 1 );
      tpmv0(  nb, A, lda, X );
      MUpnext( nb, A, lda );
   }
#endif
/*
 * End of Mjoin( PATL, tpmvUC )
 */
}
Esempio n. 13
0
void Mjoin(PATL,sprk_rK)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, int R, const SCALAR alpha,
    const TYPE *A, int lda, const SCALAR beta0, TYPE *C, const int ldc)
/*
 * This routine does the packed symmetric rank-K update by doing ceil(K/R)
 * rank-R updates of C.  This primarily done for CacheEdge, but is also
 * useful to auto-reduce R until enough workspace may be allocated.
 */
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int k=0, kb, ierr;
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC2 == AtlasUpper) ? 1 : ((UC2 == AtlasLower) ? -1 : 0);
   #ifdef TREAL
      TYPE beta = beta0;
   #else
      TYPE beta[2];
      *beta = *beta0;
      beta[1] = beta0[1];
   #endif

   if (R < NB) R = NB<<4;
   if ((K - R) < 2*NB) R = K;
   do
   {
      kb = K - k;
      if (kb - R < 2*NB) R = kb;
      kb = Mmin(R, kb);
/*
 *    If we can't do the rank-R update, reduce R until we can, or R = 1
 */
      ierr = Mjoin(PATL,prk_kmm)(UC, UA, TA, N, kb, alpha,
                                 A, lda, beta, CP, C, ldc);
      if (ierr && R <= NB*8)
      {
         if (UC == AtlasUpper)
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recUN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
            else
               ATL_rk_recUT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
         }
         else
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recLN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
            else
               ATL_rk_recLT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
         }
         ierr = 0;
      }
      if (ierr)
      {
         R = Mmin(NB*8, R>>1);
         ATL_assert(R);
      }
/*
 *    Subsequent updates use beta = 1
 */
      else
      {
         #ifdef TREAL
            beta = ATL_rone;
         #else
            *beta = ATL_rone;
            beta[1] = ATL_rzero;
         #endif
         if (TA == AtlasNoTrans)
         {
            A += MindexP(UA, 0, R, lda);
            lda = Mpld(UA, R, lda);
         }
         else A += MindexP(UA, R, 0, lda);
         k += R;
      }
   }
Esempio n. 14
0
int clapack_zgels(const enum CBLAS_ORDER Order,
                  const enum CBLAS_TRANSPOSE TA,
                  ATL_CINT M, ATL_CINT N, ATL_CINT NRHS,
                  void *A, ATL_CINT lda, void *B, const int ldb)
/*
 *  GELS solves overdetermined or underdetermined linear systems
 *  involving an M-by-N matrix A, or its conjugate-transpose, using a QR
 *  or LQ factorization of A.  It is assumed that A has full rank.
 */
{
   int ierr = 0;

   if (Order != CblasRowMajor && Order != CblasColMajor)
   {
      ierr = -1;
      cblas_xerbla(1, "clapack_zgesv",
                   "Order must be %d or %d, but is set to %d.\n",
                   CblasRowMajor, CblasColMajor, Order);
   }
   if (TA != AtlasNoTrans && TA != mytrans)
   {
      ierr = -2;
      cblas_xerbla(2, "clapack_zgesv",
                   "Trans must be %d or %d, but is set to %d.\n",
                   CblasNoTrans, mytrans, TA);
   }
   if (M < 0)
   {
      ierr = -3;
      cblas_xerbla(3, "clapack_zgesv",
                   "M cannot be less than zero 0,; is set to %d.\n", N);
   }
   if (N < 0)
   {
      ierr = -4;
      cblas_xerbla(4, "clapack_zgesv",
                   "N cannot be less than zero 0,; is set to %d.\n", N);
   }
   if (NRHS < 0)
   {
      ierr = -5;
      cblas_xerbla(5, "clapack_zgesv",
                   "NRHS cannot be less than zero 0,; is set to %d.\n", NRHS);
   }
   if (lda < M || lda < 1)
   {
      ierr = -7;
      cblas_xerbla(7, "clapack_zgesv",
                   "lda must be >= MAX(M,1): lda=%d M=%d\n", lda, M);
   }
   if (ldb < Mmax(M,N) || ldb < 1)
   {
      ierr = -9;
      cblas_xerbla(9, "clapack_zgesv",
                   "ldb must be >= MAX(M,N,1): ldb=%d M=%d N=%d\n", ldb, M, N);

   }
   if (Order == CblasColMajor)
      ierr = ATL_zgels(TA, M, N, NRHS, A, lda, B, ldb, NULL, 0);
   else  /* row-major array */
   {
      enum CBLAS_TRANSPOSE TAr = (TA == AtlasNoTrans) ? mytrans : CblasNoTrans;
      TYPE *a=((TYPE*)A)+1;
      ATL_CINT lda2 = lda+lda;
      ATL_INT j;
/*
 *    Complex takes only the conjugate tranpose, so conjugate entries before
 *    calling with reversed transpose setting
 */
      for (j=0; j < N; j++, a += lda2)
         Mjoin(PATLU,scal)(N, ATL_rnone, a, 2);
      ierr = ATL_zgels(TAr, N, M, NRHS, A, lda, B, ldb, NULL, 0);
   }
   return(ierr);
}
Esempio n. 15
0
void Mjoin(PATL,mmIJK2)
   (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
    const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA,
    MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc,
    MATSCAL gescal, NBMM0 NBmm0)
{
   const int incK = ATL_MulByNB(K)<<1;
   const int incCn = ATL_MulByNB(ldc)<<1, incCm = (MB<<1) - nNb*incCn;
   const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta));
   int i, j, k;
   const TYPE *pB=pB0;
   const TYPE rbeta = ( (gescal) ? ATL_rone : *beta );
   TYPE *pA=pA0;

   for (i=nMb; i; i--)
   {
      if (A)
      {
         A2blk(K, NB, A, lda, pA, alpha);  /* get 1 row panel of A */
         A += incA;
      }
      for (j=nNb; j; j--)
      {
         if (gescal) gescal(MB, NB, beta, C, ldc);
         if (nKb)
         {
            NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc);
            pA += NBNB2;
            pB += NBNB2;
            if (nKb != 1)
            {
               for (k=nKb-1; k; k--, pA += NBNB2, pB += NBNB2)
                  NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB,
                          ATL_rone, C, ldc);
            }
            if (kb)
            {
               KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc);
               pB += ATL_MulByNB(kb)<<1;
            }
         }
         else
         {
            if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc);
            if (kb)
            {
               KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc);
               pB += ATL_MulByNB(kb)<<1;
            }
         }
         pA = pA0;
         C += incCn;
      }
      if (jb)
      {
         if (gescal) gescal(MB, jb, beta, C, ldc);
         MBJBmm(jb, K, pA, pB, rbeta, C, ldc);
      }
      pB = pB0;
      if (!A)
      {
         pA0 += incK;
         pA = pA0;
      }
      C += incCm;
   }
   if (ib)
   {
      if (A) A2blk(K, ib, A, lda, pA, alpha);   /* get last row panel of A */
      for(j=nNb; j; j--) /* full column panels of B */
      {
         if (gescal) gescal(ib, NB, beta, C, ldc);
         IBNBmm(ib, K, pA, pB, rbeta, C, ldc);
         pB += incK;
         C += incCn;
      }
      if (jb)
      {
         if (gescal) gescal(ib, jb, beta, C, ldc);
         IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc);
      }
   }
}
Esempio n. 16
0
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
{
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n, incA, incB, incC;
   const int incK = ATL_MulByNB(K);
   void *vA=NULL;
   TYPE *pA, *pB;
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      gescal = Mjoin(PATL,gescal_bX);
      NBmm0 = Mjoin(PATL,CNBmm_b1);
   }

   i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
   if (i <= ATL_MaxMalloc) vA = malloc(i);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1);
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vA = malloc(h);
      }
      if (!vA) return(-1);
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1);
         else B2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1);
         else B2blk = Mjoin(PATL,row2blkC2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkC2_aX);
   }
   else
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1);
         else B2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
   if (TA == AtlasNoTrans)
   {
      incA = NB<<1;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blkConj_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n<<1;
   pB = pA + (incK<<1);

   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, gescal, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
   }
   while (N);

   free(vA);
   return(0);
}