コード例 #1
0
ファイル: cblas_cher2.c プロジェクト: AIDman/Kaldi
void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const void *alpha,
                 const void *X, const int incX,
                 const void *Y, const int incY, void *A, const int lda)
{
   int info = 2000;
   void *vx, *vy;
   float *x0, *y0;
   const float *x=X, *y=Y, *alp=alpha;
   const float one[2]={ATL_rone, ATL_rzero};

#ifndef NoCblasErrorChecks
   if (Order != CblasColMajor && Order != CblasRowMajor)
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (Uplo != CblasUpper && Uplo != CblasLower)
      info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d",
                          CblasUpper, CblasLower, Uplo);
   if (N < 0) info = cblas_errprn(3, info,
                        "N cannot be less than zero; is set to %d.", N);
   if (!incX) info = cblas_errprn(6, info,
                                  "incX cannot be zero; is set to %d.", incX);
   if (!incY) info = cblas_errprn(8, info,
                                  "incY cannot be zero; is set to %d.", incY);
   if (lda < N || lda < 1)
      info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d N=%d",
                          lda, N);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_cher2", "");
      return;
   }
#endif

   if (incX < 0) x += (1-N)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;

   if (Order == CblasColMajor)
      ATL_cher2(Uplo, N, alpha, x, incX, y, incY, A, lda);
   else if (alp[0] != ATL_rzero || alp[1] != ATL_rzero)
   {
      vx = malloc(ATL_Cachelen + ATL_MulBySize(N));
      vy = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vx != NULL && vy != NULL);
      x0 = ATL_AlignPtr(vx);
      y0 = ATL_AlignPtr(vy);
      ATL_cmoveConj(N, alpha, y, incY, y0, 1);
      ATL_ccopyConj(N, x, incX, x0, 1);
      ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ),
                N, one, y0, 1, x0, 1, A, lda);
      free(vx);
      free(vy);
   }
   else ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ),
                  N, alpha, y, incY, x, incX, A, lda);
}
コード例 #2
0
ファイル: ATL_herk_N.c プロジェクト: certik/vendor
void Mjoin(Mjoin(Mjoin(PATL,herk),UploNM),N)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;
   TYPE *c;
   TYPE alpha[2];
   const TYPE beta = *( (const TYPE *)vbeta  );
   const TYPE zero[2] = {0.0, 0.0};

   alpha[0] = *( (const TYPE *)valpha );
   if (K > HERK_Xover)
   {
      alpha[1] = 0.0;
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmNC(N, N, K, alpha, A, lda, A, lda, zero, c, N);
      if ( beta == 1.0 ) Mjoin(her_put,_b1)(N, c, vbeta, C, ldc);
      else if ( beta == 0.0 ) Mjoin(her_put,_b0)(N, c, vbeta, C, ldc);
      else Mjoin(her_put,_bXi0)(N, c, vbeta, C, ldc);
      free(vc);
   }
   else Mjoin(PATL,refherk)(Uplo_, AtlasNoTrans, N, K, *alpha, A, lda,
                            beta, C, ldc);
}
コード例 #3
0
ファイル: ATL_her2k.c プロジェクト: kevinoid/atlas-debian
      int Mjoin(PATL,her2kLN)
   #endif
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;
   void *vc=NULL;
   TYPE *c;
   const TYPE beta =*( (const TYPE *)vbeta  );
   const TYPE zero[2]={0.0, 0.0};

   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);
   c = ATL_AlignPtr(vc);
   #ifdef Transpose_
      ATL_ammm(AtlasConjTrans, AtlasNoTrans, N, N, K, valpha, A, lda, B, ldb,
   #else
      ATL_ammm(AtlasNoTrans, AtlasConjTrans, N, N, K, valpha, A, lda, B, ldb,
   #endif
               zero, c, N);
   if ( beta == 1.0 ) Mjoin(her2k_put,_b1)(N, c, vbeta, C, ldc);
   else if ( beta == 0.0 ) Mjoin(her2k_put,_b0)(N, c, vbeta, C, ldc);
   else Mjoin(her2k_put,_bXi0)(N, c, vbeta, C, ldc);
   free(vc);
   return(0);
}
コード例 #4
0
ファイル: ATL_symmL.c プロジェクト: Leobin7/Kaldi
void Mjoin(Mjoin(PATL,symmL),UploNM)
(const int M, const int N, const void *valpha, const void *A, const int lda,
 const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
#ifdef TREAL
    const SCALAR alpha=*( (const SCALAR *)valpha );
    const SCALAR beta =*( (const SCALAR *)vbeta  );
    const SCALAR one=1.0;
#else
#define alpha valpha
#define beta vbeta
#endif
    TYPE *a;
    void *va;

    if (N > SYMM_Xover)
    {
        va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
        ATL_assert(va);
        a = ATL_AlignPtr(va);
#ifdef TREAL
        if ( SCALAR_IS_ONE(alpha) )
            Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a);
        else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a);
        CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc);
#else
        Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a);
        CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc);
#endif
        free(va);
    }
    else Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb,
                                 beta, C, ldc);
}
コード例 #5
0
ファイル: ATL_syr2k_T.c プロジェクト: AIDman/Kaldi
   int Mjoin(PATL,syr2kLT)
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;
   void *vc=NULL;
   TYPE *c;
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);
   c = ATL_AlignPtr(vc);
   CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N);
   if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc);
   else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc);
   #ifdef TCPLX
      else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc);
      else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc);
   #endif
   else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc);
   free(vc);
   return(0);
}
コード例 #6
0
ファイル: ATL_syrk_T.c プロジェクト: certik/vendor
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;
   TYPE *c;
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   if (K > SYRK_Xover)
   {
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
      if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
      #ifdef TCPLX
         else if ( SCALAR_IS_NONE(beta) )
            Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
         else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
      #endif
      else Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda,
                            beta, C, ldc);
}
コード例 #7
0
ファイル: ATL_symmR.c プロジェクト: kevinoid/atlas-debian
void Mjoin(Mjoin(PATL,symmR),UploNM)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0;
   #else
      #define alpha valpha
      #define beta  vbeta
   #endif
   void *va;
   TYPE *a;

   if (M > SYMM_Xover)
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
      #ifdef TREAL
         if ( SCALAR_IS_ONE(alpha) )
            Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a);
         else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a);
         ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N, beta, C, ldc);
      #else
         Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a);
         ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N, vbeta, C, ldc);
      #endif
      free(va);
   }
   else Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda, B, ldb,
                            beta, C, ldc);
}
コード例 #8
0
ファイル: ATL_trmmL.c プロジェクト: AIDman/Kaldi
void Mjoin(Mjoin(PATL,trmmL),ATLP)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    void *C, const int ldc)
{
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR one=1.0, zero=0.0;
   #else
      const TYPE zero[2]={0.0,0.0};
      #define alpha valpha
   #endif
   void *va;
   TYPE *a;

   if (N > TRMM_Xover)
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
      #ifdef TREAL
         if ( SCALAR_IS_ONE(alpha) ) Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a);
         else Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a);
         CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc);
      #else
         ATL_trcopy(M, A, lda, a);
         CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc);
      #endif
      free(va);
   }
   else Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha,
                            A, lda, C, ldc);
}
コード例 #9
0
ファイル: cblas_zgerc.c プロジェクト: GorgonCryoEM/Gorgon-CVS
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
                 const void *alpha, const void *X, const int incX,
                 const void *Y, const int incY, void *A, const int lda)
{
   int info = 2000;
   const double *x = X, *y = Y;
   void *vy;
   double *y0;
   double one[2] = {ATL_rone, ATL_rzero};

#ifndef NoCblasErrorChecks
   if (M < 0) info = cblas_errprn(2, info,
                        "M cannot be less than zero; is set to %d.", M);
   if (N < 0) info = cblas_errprn(3, info,
                        "N cannot be less than zero; is set to %d.", N);
   if (!incX) info = cblas_errprn(6, info,
                                  "incX cannot be zero; is set to %d.", incX);
   if (!incY) info = cblas_errprn(8, info,
                                  "incY cannot be zero; is set to %d.", incY);
   if (Order == CblasColMajor)
   {
      if (lda < M || lda < 1)
         info = cblas_errprn(10, info, "lda must be >= MAX(M,1): lda=%d M=%d",
                             lda, M);
   }
   else if (Order == CblasRowMajor)
   {
      if (lda < N || lda < 1)
         info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d M=%d",
                             lda, N);
   }
   else
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_zgerc", "");
      return;
   }
#endif

   if (incX < 0) x += (1-M)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;

   if (Order == CblasColMajor)
      ATL_zgerc(M, N, alpha, x, incX, y, incY, A, lda);
   else
   {
      vy = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vy);
      y0 = ATL_AlignPtr(vy);
      ATL_zmoveConj(N, alpha, y, incY, y0, 1);
      ATL_zgeru(N, M, one, y0, 1, x, incX, A, lda);
      free(vy);
   }
}
コード例 #10
0
ファイル: cblas_zher.c プロジェクト: apollos/atlas
void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                const int N, const double alpha,
                const void *X, const int incX, void *A, const int lda)
{
   int info = 2000;
   void *vx;
   double one[2] = {ATL_rone, ATL_rzero};
   double *x0;
   const double *x=X;

#ifndef NoCblasErrorChecks
   if (Order != CblasColMajor && Order != CblasRowMajor)
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (Uplo != CblasUpper && Uplo != CblasLower)
      info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d",
                          CblasUpper, CblasLower, Uplo);
   if (N < 0) info = cblas_errprn(3, info,
                        "N cannot be less than zero; is set to %d.", N);
   if (!incX) info = cblas_errprn(6, info,
                                  "incX cannot be zero; is set to %d.", incX);
   if (lda < N || lda < 1)
      info = cblas_errprn(8, info, "lda must be >= MAX(N,1): lda=%d N=%d",
                          lda, N);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_zher", "");
      return;
   }
#endif

   if (incX < 0) x += (1-N)*incX<<1;

   if (Order == CblasColMajor)
      ATL_zher(Uplo, N, alpha, x, incX, A, lda);
   else if (alpha != ATL_rzero)
   {
      vx = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vx);
      x0 = ATL_AlignPtr(vx);
      ATL_zmoveConj(N, one, x, incX, x0, 1);
      ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ),
               N, alpha, x0, 1, A, lda);
      free(vx);
   }
   else
      ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ),
               N, ATL_rzero, x, incX, A, lda);
}
コード例 #11
0
ファイル: ATL_flushcache.c プロジェクト: AIDman/Kaldi
double ATL_flushcache(long long size)
/*
 * flush cache by reading enough mem; note that if the compiler gets
 * really smart, may be necessary to make vp a global variable so it
 * can't figure out it's not being modified other than during setup;
 * the fact that ATL_dzero is external will confuse most compilers
 */
{
  static void *vp=NULL;
  static long long N = 0;
  double *cache;
  double dret=0.0;
  size_t i;

  if (size < 0) /* flush cache */
  {
     ATL_assert(vp);
     cache = ATL_AlignPtr(vp);
     if (N > 0) for (i=0; i != N; i++) dret += cache[i];
  }
  else if (size > 0) /* initialize */
  {
     vp = malloc(ATL_Cachelen + size);
     ATL_assert(vp);
     N = size / sizeof(double);
     cache = ATL_AlignPtr(vp);
     ATL_dzero(N, cache, 1);
  }
  else if (size == 0) /* free cache */
  {
     if (vp) free(vp);
     vp = NULL;
     N = 0;
  }
  return(dret);
}
コード例 #12
0
ファイル: ATL_hemmL.c プロジェクト: kevinoid/atlas-debian
void Mjoin(Mjoin(PATL,hemmL),UploNM)
   (const int M, const int N, const void *alpha, const void *A, const int lda,
    const void *B, const int ldb, const void *beta, void *C, const int ldc)
{
   TYPE *a;
   void *va;

   if (N > HEMM_Xover)
   {
      va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
      ATL_assert(va);
      a = ATL_AlignPtr(va);
      Mjoin(Mjoin(PATL,hecopy),UploNM)(M, A, lda, a);
      ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, M, alpha, a, M, B, ldb,
               beta, C, ldc);
      free(va);
   }
   else Mjoin(PATL,refhemm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb,
                            beta, C, ldc);
}
コード例 #13
0
double ATL_ptflushcache(long long size)
/*
 * flush cache by reading enough mem; note that if the compiler gets
 * really smart, may be necessary to make vp a global variable so it
 * can't figure out it's not being modified other than during setup;
 * the fact that ATL_dzero is external will confuse most compilers
 */
{
    static void *vp=NULL;
    static double *cache=NULL;
    double dret=0.0;
    static long long i, N = 0;
    ATL_FC fct[ATL_NTHREADS];

    if (size < 0) /* flush cache */
    {
        ATL_assert(cache);
        for (i=0; i < ATL_NTHREADS; i++)
        {
            fct[i].N = N;
            fct[i].dp = cache+i*N;
        }
        ATL_goparallel(ATL_NTHREADS, ATL_DoWorkFC, fct, NULL);
    }
    else if (size > 0) /* initialize */
    {
        vp = malloc(ATL_Cachelen + (size * ATL_NTHREADS));
        ATL_assert(vp);
        cache = ATL_AlignPtr(vp);
        N = size / sizeof(double);
        ATL_dzero(N*ATL_NTHREADS, cache, 1);
    }
    else if (size == 0) /* free cache */
    {
        if (vp) free(vp);
        vp = cache = NULL;
        N = 0;
    }
    return(dret);
}
コード例 #14
0
void Mjoin(Mjoin(PATL,trsmR),ATLP)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    void *C, const int ldc)
{
   const TYPE *alpha=valpha;
#ifdef TREAL
   #if defined(Transpose_) || defined(ConjTrans_)
      if ( M > (N<<2) )
      {
         void *va;
	 TYPE *a;

         va = malloc(ATL_Cachelen + (ATL_MulBySize(N*N)));
         ATL_assert(va);
         a = ATL_AlignPtr(va);
         #ifdef TREAL
            Mjoin(ATL_trcopy,_a1)(N, ATL_rone, A, lda, a);
         #else
            ATL_trcopy(N, A, lda, a);
         #endif
         Mjoin(Mjoin(PATL,trsmKR),ATLPt)(M, N, *alpha, a, N, C, ldc);
         free(va);
      }
      else Mjoin(PATL,reftrsm)(AtlasRight, Uplo_, Trans_, Unit_, M, N, *alpha,
                               A, lda, C, ldc);
   #else
      Mjoin(Mjoin(PATL,trsmKR),ATLP)(M, N, *alpha, A, lda, C, ldc);
   #endif
#else
   if (M > (N<<2) && N <= 4)
      Mjoin(PATL,CtrsmKR)(Uplo_, Trans_, Unit_, M, N, valpha, A, lda, C, ldc);
   else
      Mjoin(PATL,reftrsm)(AtlasRight, Uplo_, Trans_, Unit_, M, N, alpha,
                          A, lda, C, ldc);
#endif
}
コード例 #15
0
ファイル: ATL_C2Formrq.c プロジェクト: apollos/atlas
int Mjoin(PC2F,ormrq)
   (const enum CBLAS_SIDE Side, const enum CBLAS_TRANSPOSE TA,
    ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda, TYPE *TAU,
    TYPE *C, ATL_CINT ldc)
{
   TYPE work[2];
   void *vp;
   TYPE *wrk;
   ATL_INT lwrk;
   int iret;
/*
 * Query routine for optimal workspace, allocate it, and call routine with it
 */
   ATL_assert(!Mjoin(PC2F,ormrq_wrk)(Side, TA, M, N, K, A, lda, TAU, C, ldc,
                                     work, -1));
   lwrk = work[0];
   vp = malloc(ATL_MulBySize(lwrk) + ATL_Cachelen);
   ATL_assert(vp);
   wrk = ATL_AlignPtr(vp);
   iret = Mjoin(PC2F,ormrq_wrk)(Side, TA, M, N, K, A, lda, TAU, C, ldc,
                                wrk, lwrk);
   free(vp);
   return(iret);
}
コード例 #16
0
ファイル: clapack_sgetri.c プロジェクト: apollos/atlas
int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
                   const int lda, const int *ipiv)
{
   int ierr=0, lwrk;
   int Mjoin(PATL,GetNB)();
   void *vp;

   lwrk = Mjoin(PATL,GetNB)();
   if (lwrk <= N) lwrk *= N;
   else lwrk = N*N;
   vp = malloc(ATL_Cachelen + ATL_MulBySize(lwrk));
   if (vp)
   {
      ierr = ATL_getri(Order, N, A, lda, ipiv, ATL_AlignPtr(vp), &lwrk);
      free(vp);
   }
   else
   {
      cblas_xerbla(7, "clapack_sgetri",
                   "Cannot allocate workspace of %d\n", lwrk);
      return(-7);
   }
   return(ierr);
}
コード例 #17
0
ファイル: cblas_cger2c.c プロジェクト: apollos/atlas
void cblas_cger2c(const enum CBLAS_ORDER Order, ATL_CINT M, ATL_CINT N,
                 const void *alpha, const void *X, ATL_CINT incX,
                 const void *Y, ATL_CINT incY, const void *beta,
                 const void *W, ATL_CINT incW,
                 const void *Z, ATL_CINT incZ, void *A, ATL_CINT lda)
{
   int info = 2000;
   const float *x = X, *y = Y, *w = W, *z = Z;
   void *vy;
   float *y0, *z0;
   float one[2] = {ATL_rone, ATL_rzero};

#ifndef NoCblasErrorChecks
   if (M < 0) info = cblas_errprn(2, info,
                        "M cannot be less than zero; is set to %d.", M);
   if (N < 0) info = cblas_errprn(3, info,
                        "N cannot be less than zero; is set to %d.", N);
   if (!incX) info = cblas_errprn(6, info,
                                  "incX cannot be zero; is set to %d.", incX);
   if (!incY) info = cblas_errprn(8, info,
                                  "incY cannot be zero; is set to %d.", incY);
   if (!incW) info = cblas_errprn(11, info,
                                  "incW cannot be zero; is set to %d.", incW);
   if (!incZ) info = cblas_errprn(13, info,
                                  "incZ cannot be zero; is set to %d.", incZ);
   if (Order == CblasColMajor)
   {
      if (lda < M || lda < 1)
         info = cblas_errprn(15, info, "lda must be >= MAX(M,1): lda=%d M=%d",
                             lda, M);
   }
   else if (Order == CblasRowMajor)
   {
      if (lda < N || lda < 1)
         info = cblas_errprn(15, info, "lda must be >= MAX(N,1): lda=%d M=%d",
                             lda, N);
   }
   else
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_cger2c", "");
      return;
   }
#endif

   if (incX < 0) x += (1-M)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;
   if (incW < 0) w += (1-M)*incW<<1;
   if (incZ < 0) z += (1-N)*incZ<<1;

   if (Order == CblasColMajor)
      ATL_cger2c(M, N, alpha, x, incX, y, incY, beta, w, incW, z, incZ, A, lda);
   else
   {
      vy = malloc(ATL_Cachelen+ATL_Cachelen + ATL_MulBySize(N+N));
      ATL_assert(vy);
      y0 = ATL_AlignPtr(vy);
      z0 = y0 + N;
      z0 = ATL_AlignPtr(z0);
      ATL_cmoveConj(N, alpha, y, incY, y0, 1);
      ATL_cmoveConj(N, alpha, z, incZ, z0, 1);
      ATL_cger2u(N, M, one, y0, 1, x, incX, beta, w, incW, z, incZ, A, lda);
      free(vy);
   }
}
コード例 #18
0
ファイル: ATL_ormqr.c プロジェクト: AIDman/Kaldi
int ATL_ormqr
   (const enum CBLAS_SIDE SIDE, const enum CBLAS_TRANSPOSE TRANS,
    ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda,
    const TYPE *TAU, TYPE *C, ATL_CINT ldc, TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE ATL_ormqr( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
 *                        WORK, LWORK, INFO )
 *
 * ATL_ormqr.c :
 * int ATL_ormqr(const enum CBLAS_SIDE SIDE SIDE,
 *        const enum CBLAS_TRANSPOSE TRANS, ATL_CINT M, ATL_CINT N,
 *        ATL_CINT K, TYPE * A, ATL_CINT lda,TYPE * TAU, TYPE * C, ATL_CINT ldc,
 *                       TYPE * WORK, ATL_CINT LWORK)
 *
 *      NOTE :   ATL_ormqr.c will get compiled to four precisions
 *               single precision real,      double precision real
 *               single precision complex,   double precision complex
 *
 *
 *
 *  Purpose
 *  =======
 *
 *  ATL_ormqr overwrites the general real M-by-N matrix C with
 *
 *                  SIDE = 'L'     SIDE = 'R'
 *  TRANS = 'N':      Q * C          C * Q
 *  TRANS = 'T':      Q**T * C       C * Q**T
 *
 *  where Q is,
 *        a real orthogonal matrix defined as the product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *   OR
 *        a complex unitary matrix defined as a product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *  as returned by ATLL_geqrf.c. Q is of order M if SIDE = 'L' and of order N
 *  if SIDE = 'R'.
 *
 *  Arguments
 *  =========
 *
 *  SIDE    (input) CHARACTER*1
 *          = 'L': apply Q or Q**T from the Left;
 *          = 'R': apply Q or Q**T from the Right.
 *
 *  TRANS   (input) CHARACTER*1
 *          = 'N':  No transpose, apply Q;
 *          = 'T':  Transpose, apply Q**T.
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix C. M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix C. N >= 0.
 *
 *  K       (input) INTEGER
 *          The number of elementary reflectors whose product defines
 *          the matrix Q.
 *          If SIDE = 'L', M >= K >= 0;
 *          if SIDE = 'R', N >= K >= 0.
 *
 *  A       (input) array, dimension (LDA,K)
 *          The i-th column must contain the vector which defines the
 *          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *          DGEQRF in the first k columns of its array argument A.
 *          A is modified by the routine but restored on exit.
 *
 *  lda     (input) INTEGER
 *          The leading dimension of the array A.
 *          If SIDE = 'L', LDA >= max(1,M);
 *          if SIDE = 'R', LDA >= max(1,N).
 *
 *  TAU     (input)  array, dimension (K)
 *          TAU(i) must contain the scalar factor of the elementary
 *          reflector H(i), as returned by ATL_geqrf.c.
 *
 *  C       (input/output)  array, dimension (LDC,N)
 *          On entry, the M-by-N matrix C.
 *          On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
 *
 *  ldc     (input) INTEGER
 *          The leading dimension of the array C. LDC >= max(1,M).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.
 *          If SIDE = 'L', LWORK >= max(1,N);
 *          if SIDE = 'R', LWORK >= max(1,M).
 *          For optimum performance LWORK >= N*NB if SIDE = 'L', and
 *          LWORK >= M*NB if SIDE = 'R', where NB is the optimal
 *          blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued by XERBLA.
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j, ib, mi, ni, ic, jc ;
   TYPE  *ws_QR2,  *ws_T, *ws_larfb;        /* Workspace for QR2,T, larfb     */
   void *vp=NULL;

   nb = clapack_ilaenv(LAIS_OPT_NB, LAormqr, MYOPT+LARight+LAUpper, M, N, K,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      if(SIDE == CblasLeft)
      {
         *WORK = ( N*nb + nb*nb + maxMN )  ;
      }
      else
      {
         *WORK = ( M*nb + nb*nb + maxMN )  ;
      }
      return(0);
   }
   else if (M < 1 || N < 1)                 /* quick return if no work to do  */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else
   {
      if(SIDE == CblasLeft)
      {
         if (LWORK < (N*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      }
      else
      {
         if (LWORK < (M*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(M*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      } /* if CblasRight */
   }

/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 */

   ws_T = WORK;                             /* T at begining of work          */
   ws_QR2 = WORK +(nb SHIFT)*nb;            /* After T Work space             */
   ws_larfb = ws_QR2 + (maxMN SHIFT);       /* After workspace for T and QR2  */


   if (SIDE == CblasLeft)
   {
      if ( TRANS == CblasNoTrans )
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {
            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
          }                                 /* for                            */
      }                                     /* CblasNoTrans                   */
      else
      {
         for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
         }                                  /* for                            */
      }                                     /* CblasNoTran                    */
   }                                        /* cblasLeft                      */
   else
   {
      if ( TRANS == CblasNoTrans )
      {
 	 for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc), ldc, ws_larfb, M);
          }                                 /* for                            */
      }
      else
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {

            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */

            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j , ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc) , ldc, ws_larfb, M);
         }                                  /* for                            */

      }                                     /* Cblas Tran on Right            */
   }

   if (vp)
      free(vp);
   return(0);
}                                           /* END ATL_ormqr                  */
コード例 #19
0
ファイル: ATL_cmmJKI.c プロジェクト: AIDman/Kaldi
int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
/*
 * This gemm is for small K, so we build gemm out of AXPY (outer product)
 * rather than dot (inner product).
 */
{
   int Mp, mp, m, k, ldaa=lda;
   void *vA=NULL;
   TYPE *pA;
   const TYPE CONE[2]={ATL_rone, ATL_rzero}, CNONE[2]={ATL_rnone, ATL_rzero};
   const SCALAR alp=alpha;

/*
 * Compute M partition necessary to promote reuse in the L1 cache.  Check
 * NB^2 in addition to L1elts, to catch machines where L1 is not used by FPU.
 * If this gives a small Mp, use CacheEdge instead (reuse in L2 instead of L1).
 */
   Mp = NB*NB;
   m = ATL_L1elts >> 1;
   Mp = (m > Mp) ? m : Mp;
   Mp /= ((K+2)<<1);
   if (Mp < 128)
   {
      #if !defined(CacheEdge) || CacheEdge == 0
         Mp = M;
      #else
         Mp = (CacheEdge) / ((K+2)*ATL_sizeof);
         if (Mp < 128)
            Mp = M;
      #endif
   }
   if (Mp > M)
      Mp = M;
/*
 * Change Mp if remainder is very small
 */
   else
   {
      Mp -= 16;      /* small safety margin on filling cache */
      mp = M / Mp;
      m = M - mp*Mp;
      if (m && m < 32)
         Mp += (m+mp-1)/mp;
   }
/*
 * If A not in NoTrans format, need to copy so it can use axpy wt stride=1.
 * NOTE: this routine should not be called when you can't afford this copy
 */
   if (TA != AtlasNoTrans)
   {
      vA = malloc(ATL_Cachelen + Mp*ATL_MulBySize(K));
      if (!vA) return(-1);
      pA = ATL_AlignPtr(vA);
      alp = CONE;
      ldaa = Mp;
      pA += Mp+Mp;
   }
   else
      pA = (TYPE *) A;
   for (m=0; m < M; m += Mp)
   {
      mp = M - m;
      if (mp > Mp)
         mp = Mp;
/*
 *    If the thing is in Trans format, copy to NoTrans
 */
      if (vA)
      {
         pA -= (Mp+Mp);
         if (TA == AtlasConjTrans)
         {
            for (k=0; k < K; k++)
            {
               Mjoin(PATL,copy)(mp, A+k+k, lda, pA+((k*ldaa)<<1), 1);
               Mjoin(PATLU,scal)(mp, ATL_rnone, pA+1+((k*ldaa)<<1), 2);
               if (!SCALAR_IS_ONE(alpha))
                  Mjoin(PATL,scal)(mp, alpha, pA+((k*ldaa)<<1), 1);
            }
         }
         else
         {
            for (k=0; k < K; k++)
               Mjoin(PATL,cpsc)(mp, alpha, A+k+k, lda, pA+((k*ldaa)<<1), 1);
         }
         A += mp*(lda+lda);
      }
      Mjoin(PATL,mm_axpy)(AtlasNoTrans, TB, mp, N, K, alp, pA, ldaa, B, ldb,
                          beta, C, ldc);
      pA += mp+mp;
      C += mp+mp;
   }
   if (vA) free(vA);
   return(0);
}
コード例 #20
0
ファイル: ATL_her.c プロジェクト: AIDman/Kaldi
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
               const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};
   void *vp=NULL;
   TYPE *x, *xt;
   ATL_r1kern_t gerk;
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;
   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu,
                        &minM, &minN, &alignX, &ALIGNX2A, &alignXt,
                        &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   vp = malloc((ATL_Cachelen+ATL_MulBySize(N))*(1+COPYX));
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   if (vp)
     free(vp);
}
コード例 #21
0
ファイル: ATL_mmJITcp.c プロジェクト: AIDman/Kaldi
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                        const int M0, const int N, const int K,
                        const SCALAR alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const SCALAR beta,
                        TYPE *C, const int ldc)
/*
 * Copy matmul algorithm, copies A and B on-the-fly
 * If M < 0, allocates only (MB+NB)*KB workspace
 */
{
   void *v=NULL;
   const TYPE *a=A;
   TYPE *pA, *pB, *pB0;
   MAT2BLK2 A2blk, B2blk;
   NBMM0 NBmm0, NBmm1, pNBmm0;
   const int M = (M0 >= 0) ? M0 : -M0;
   int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
   size_t incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;

/*
 * If both M and N <= NB, and one of them is not full, call BPP, which
 * can sometimes avoid doing cleanup forall cases
 */
   if (M <= MB && N <= NB && (M != MB || N != NB))
      return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                               beta, C, ldc));
/*
 * If these workspace increments are 0, we do JIT NBxNB copies instead of
 * copying entire array/panel.  Don't copy mat if you can't reuse it.
 */
   if (M0 > 0)
   {
      incAW = (N > NB) ? KB*MB : 0;
      incBW = (M > NB) ? KB*NB : 0;
   }
   else /* allocate in minimal space */
      incAW = incBW = 0;
   nmblks = M/MB;
   nnblks = N/NB;
   nkblks = K/KB;
   mr = M - nmblks*MB;
   nr = N - nnblks*NB;
   kr = K - nkblks*KB;
/*
 * K-loop is special, in that we don't call user cleanup, must explicitly zero,
 * and K-cleanup is typically slower even for generated kernels.  Therefore,
 * allow extra leaway for doing extra flops.  Note error is unaffected by
 * any of these extra flops: K-loop has elts zeroed, and multiplying zeros
 * and adding in zeros doesn't add to error
 */
   KR = (kr && kr+4 >= KB) ? KB : kr;
   bigK = nkblks*KB+KR;
   if (incAW)
   {
      i = MB*bigK;
      incAWp = KB*mr;
   }
   else
   {
      i = MB*KB;
      incAWp = 0;
   }
   if (incBW)
   {
      incBWp = KB*nr;
      incW = bigK*NB;
      i += N*bigK;
   }
   else
   {
      incBWp = incW = 0;
      i += NB*KB;
   }
   i *= sizeof(TYPE);
   if (i <= ATL_MaxMalloc || !(incAW | incBW))
      v = malloc(ATL_Cachelen+i);
   if (!v) return(-1);
   pA = ATL_AlignPtr(v);
   pB0 = pA + (incAW ? bigK*MB : KB*MB);
   if (TA == AtlasNoTrans)
   {
      A2blk = Mjoin(PATL,gemoveT);
      incAk = lda*KB;
      incAm = MB;
   }
   else
   {
      A2blk = Mjoin(PATL,gemove);
      incAk = KB;
      incAm = MB*lda;
   }
   if (TB == AtlasNoTrans)
   {
      B2blk = Mjoin(PATL,gemove);
      incBk = KB;
      incBn = NB*ldb;
   }
   else
   {
      B2blk = Mjoin(PATL,gemoveT);
      incBk = ldb*KB;
      incBn = NB;
   }
/*
 * See what kernel we're calling
 */
   if ( SCALAR_IS_ONE(beta) )
   {
      NBmm0 = NBmm_b1;
      pNBmm0 = Mjoin(PATL,pNBmm_b1);
   }
   else if ( SCALAR_IS_ZERO(beta) )
   {
      NBmm0 = NBmm_b0;
      pNBmm0 = Mjoin(PATL,pNBmm_b0);
   }
   else
   {
      NBmm0 = NBmm_bX;
      pNBmm0 = Mjoin(PATL,pNBmm_bX);
   }
   KR = (KR == KB) ? KB : 0;
   ZEROC = !KR && SCALAR_IS_ZERO(beta);

   for (i=0; i < nmblks; i++)
   {
      a = A+i*incAm;
      pB = pB0;       /* foreach row-panel of A, start at B's copy space */
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
         B += incBn;             /* copy next col panel of B */
         pB += incW;             /* to next col panel of pB  */
         a = (incAW ? NULL : a); /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if (ZEROC)
            Mjoin(PATL,gezero)(MB, nr, C, ldc);
         Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBWp,
                         C, ldc, A2blk, B2blk, pNBmm0, Mjoin(PATL,pNBmm_b1));
      }
      C += MB - nnblks*ldc*NB;
      if (incBW)
      {
         B = NULL;              /* finished copying B */
         incBn = 0;
      }
      else
         B -= nnblks*incBn;
   }
   if (mr)
   {
      a = A + nmblks*incAm;
      pB = pB0;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
      else NBmm0 = Mjoin(PATL,pMBmm_bX);
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, Mjoin(PATL,pMBmm_b1));
         B += incBn;              /* copy next col panel of B */
         pB += incW;              /* to next col panel of pB  */
         a = (incAW ? NULL : a);  /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if ( SCALAR_IS_ZERO(beta) )
            Mjoin(PATL,gezero)(mr, nr, C, ldc);
         Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0,
                         ATL_rone, alpha, beta, a, lda, incAk, pA, incAWp,
                         B, ldb, incBk, pB, incBWp, C, ldc, A2blk, B2blk,
                         Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm));
      }
   }
   free(v);
   return(0);
}
コード例 #22
0
ファイル: ATL_mmIJK.c プロジェクト: apollos/atlas
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
    size_t incA, incB, incC;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        NBmm0 = NBmm_b0;
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            i = NBNB;
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        if (vA) free(vA);
        if (vC) free(vC);
        return(0);
    }
    i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
    if (i <= ATL_MaxMalloc) vA = malloc(i);
    if (!vA)
    {
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            if (vC) free(vC);
            return(1);
        }
        if (jb) n = nNb + 1;
        else n = nNb;
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vA = malloc(h);
        }
        if (!vA)
        {
            if (vC) free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);
        jb2 = 0;
    }
    else
    {
        jb2 = jb;
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    pB = pA + incK;

    do
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)
        {
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;
    }
    while (N);

    if (vC) free(vC);
    free(vA);
    return(0);
}
コード例 #23
0
ファイル: ATL_gelqf.c プロジェクト: apollos/atlas
int ATL_gelqf(ATL_CINT M, ATL_CINT N, TYPE  *A, ATL_CINT lda, TYPE *TAU,
               TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE gelqf( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
 * ATL_gelqf.c :
 * int ATL_gelqf(int M, int N, TYPE  *A, int LDA, TYPE  *TAU,
 *              TYPE *WORK, int LWORK)
 *
 *  Purpose
 *  =======
 *
 *  ATL_gelqf  computes an LQ factorization of a real/complex M-by-N matrix A:
 *  A = L * Q.
 *
 *  Compared to LAPACK, here, a recursive panel factorization is implemented.
 *  Refer to ATL_gelqr.c andd ATL_larft.c for details.
 *
 *  Arguments
 *  =========
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input/output) array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *          On exit, the elements on and above the diagonal of the array
 *          contain the min(M,N)-by-N upper trapezoidal matrix R (R is
 *          upper triangular if m >= n); the elements below the diagonal,
 *          with the array TAU, represent the orthogonal matrix Q as a
 *          product of min(m,n) elementary reflectors (see Further
 *          Details).
 *
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  TAU     (output) array, dimension (min(M,N))
 *          The scalar factors of the elementary reflectors (see Further
 *          Details).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.  LWORK >= max(1,N).
 *          For optimum performance LWORK >= N*NB, where NB is
 *          the optimal blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued .
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 *  Further Details
 *  ===============
 *
 *  The matrix Q is represented as a product of elementary reflectors
 *
 *     Q = H(1) H(2) . . . H(k), where k = min(m,n).
 *
 *  Each H(i) has the form
 *
 *     H(i) = I - tau * v * v'                  (For Real precision)
 *     H(i) = I - tau * v * conjugate(v)'       (For Complex precision)
 *
 *  where tau is a real/complex scalar, and v is a real/complex vector with
 *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
 *  and tau in TAU(i).
 *
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j;
   TYPE  *ws_LQ2,  *ws_T, *ws_larfb;        /* Workspace level 2, T, larfb.   */
   void *vp=NULL;

   /* For transpose function, may need type-appropriate 'ONE' for alpha. */
   #ifdef TREAL
      const TYPE ONE = ATL_rone;
   #else
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif
   TYPE *ws_CP=NULL, *ws_CPRaw=NULL;
   ATL_INT ldCP;

   #if defined(ATL_TUNING)
   /*-------------------------------------------------------------------------*/
   /* For tuning recursion crossover points, the blocking factor is set by    */
   /* la2xover, the tuning program for that purpose.                          */
   /*-------------------------------------------------------------------------*/
   if (ATL_PanelTune) nb=ATL_PanelTune; else
   #endif /* ATL_TUNING */

   nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LALeft+LALower, M, N,-1,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_gelq2
 */
   if (LWORK < 0)
   {
      *WORK = ( maxMN*nb + nb*nb + maxMN )  ;
      return(0);
   }
   else if (M < 1 || N < 1)  /* quick return if no work to do */
      return(0);

/*
 * LQ is the transpose of QR: We use this to go from row-major LQ to
 * col-major QR, typically faster. Here, if we are square and large,
 * we transpose the whole matrix in-place and then transpose it back.
 * This should be a tunable parameter; perhaps if the matrix fits in
 * L1 or L2? (Note by Tony C, short on time to conduct tuning).
 */
   if (M == N && N >= 128)
   {
      Mjoin(PATL,sqtrans)(N, A, lda);
      n = ATL_geqrf(M, N, A, lda, TAU, WORK, LWORK);
      Mjoin(PATL,sqtrans)(N, A, lda);

      /* Take the conjugate for Complex TAU. */
      #ifdef TCPLX
      ATL_INT i;
      for (i=1; i<(minMN<<1); i+=2)
         *(TAU+i) = 0.-*(TAU+i);          /* Negate imaginary part. */
      #endif
      return(n);
   }
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else if (LWORK < (maxMN*nb + nb*nb + maxMN))
   {
      vp = malloc(ATL_MulBySize(maxMN*nb + nb*nb + maxMN) + ATL_Cachelen);
      if (!vp)
         return(-7);
       WORK = ATL_AlignPtr(vp);
   }

/*
 * Assign workspace areas for ATL_larft, ATL_gelq2, ATL_larfb
 */
   ws_T = WORK;                         /* T at begining of work */
   ws_LQ2 = WORK +(nb SHIFT)*nb;        /* After T Work space             */
   ws_larfb = ws_LQ2 + (maxMN SHIFT);   /* After workspace for T and LQ2  */

/*
 * Leave one iteration to be done outside loop, so we don't build T
 * Any loop iterations are therefore known to be of size nb (no partial blocks)
 */
   n = (minMN / nb) * nb;
   if (n == minMN)
      n -= Mmin(nb, minMN);       /* when n is a multiple of nb, reduce by nb */
   #if !defined(ATL_USEPTHREADS)        /* If no PCA, try to copy up front. */
      j = M - n;
      j = Mmax(nb, j);
      ldCP = (N&7) ? (((N+7)>>3)<<3) : N;
      ws_CPRaw = malloc(ATL_MulBySize(ldCP)*j + ATL_Cachelen);
      if (ws_CPRaw) ws_CP=ATL_AlignPtr(ws_CPRaw);  /* Align if malloced. */
   #endif /* Serial Mode */


   for (j=0; j < n; j += nb)
   {
      #if !defined(ATL_USEPTHREADS) /* If no PCA it won't copy. Try it here. */
      /* If we got our copy workspace, transpose panel before recursion. */
      if (ws_CP)                             /* If workspace exists. */
      {
         int ci, cj;                         /* for conjugation.     */
         ldCP = N-j;
         if (ldCP&7)
            ldCP = ((ldCP+7)>>3)<<3;
         Mjoin(PATL,gemoveT)(N-j, nb, ONE, A+(j SHIFT)*(lda+1),
                             lda, ws_CP, ldCP);

         ATL_assert(!ATL_geqrr(N-j, nb, ws_CP, ldCP, TAU+(j SHIFT),
                               ws_LQ2, ws_T, nb, ws_larfb, 1));

         Mjoin(PATL,gemoveT)(nb, N-j, ONE, ws_CP, ldCP,
                             A+(j SHIFT)*(lda+1), lda);

         #if defined(TCPLX)               /* conj upTri T, TAU. */
         for (cj=0; cj<nb; cj++)          /* column loop... */
         {
            TAU[((j+cj) SHIFT)+1] = 0.-TAU[((j+cj) SHIFT)+1];
            for (ci=0; ci<=cj; ci++)      /* row loop... */
               ws_T[((ci+cj*nb) SHIFT)+1] = 0.-ws_T[((ci+cj*nb) SHIFT)+1];
         }
         #endif /* defined(TCPLX) */
      } else /* copy workspace was not allocated, use native. */
      #endif /* Serial Mode (No PCA) */
      {
         ATL_assert(!ATL_gelqr(nb, N-j,  A+(j SHIFT)*(lda+1), lda,
                               TAU+(j SHIFT), ws_LQ2, ws_T, nb, ws_larfb, 1));
      }

      if (j+nb < M)  /* if there are more cols left to bottom, update them */
      {
/*
 *       ======================================================================
 *       Form the triangular factor of the block reflector
 *       After gelqr, ws_T contains 'T', the nb x nb triangular factor 'T'
 *       of the block reflector. It is an output used in the next call, dlarfb.
 *          H = Id - Y'*T*Y, with Id=(N-j)x(N-j), Y=(N-j)xNB.
 *
 *       The ws_T array used above is an input to dlarfb; it is 'T' in
 *       that routine, and LDT x K (translates here to LDWORK x NB).
 *       WORK is an LDWORK x NB workspace (not input or output).
 *       ======================================================================
 */
         ATL_larfb(CblasRight, CblasNoTrans, LAForward, LARowStore,
                   M-j-nb, N-j, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb,
                   A+((j SHIFT)*(lda+1))+(nb SHIFT), lda, ws_larfb, M);
      }
   }
コード例 #24
0
ファイル: ATL_tbsv.c プロジェクト: AIDman/Kaldi
void Mjoin( PATL, tbsv )
(
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const enum ATLAS_DIAG      DIAG,
   const int                  N,
   const int                  K,
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * X,
   const int                  INCX
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, tbsv ) solves one of the systems of equations
 *
 *    A * x = b,   or   conjg( A  ) * x = b,   or
 *
 *    A'* x = b,   or   conjg( A' ) * x = b,
 *
 * where b and x are n-element vectors and  A is an n by n unit, or non-
 * unit, upper or lower triangular band matrix, with (k+1) diagonals.
 *
 * No test for  singularity  or  near-singularity  is included  in  this
 * routine. Such tests must be performed before calling this routine.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       * vx = NULL;
   TYPE                       * x;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;
   Mjoin(PATL,reftbsv)(UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX);
   return;

   if( INCX == 1 ) { x = X; }
   else
   {
      vx = (TYPE *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, copy )( N, X, INCX, x, 1 );
   }

#ifdef TREAL
   if( ( TRANS == AtlasNoTrans ) || ( TRANS == AtlasConj ) )
#else
   if( TRANS == AtlasNoTrans )
#endif
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, tbsvUN )( DIAG, N, K, A, LDA, x );
      else                     Mjoin( PATL, tbsvLN )( DIAG, N, K, A, LDA, x );
   }
#ifdef TCPLX
   else if( TRANS == AtlasConj )
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, tbsvUC )( DIAG, N, K, A, LDA, x );
      else                     Mjoin( PATL, tbsvLC )( DIAG, N, K, A, LDA, x );
   }
#endif
#ifdef TREAL
   else
#else
   else if( TRANS == AtlasTrans )
#endif
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, tbsvUT )( DIAG, N, K, A, LDA, x );
      else                     Mjoin( PATL, tbsvLT )( DIAG, N, K, A, LDA, x );
   }
#ifdef TCPLX
   else
   {
      if( UPLO == AtlasUpper ) Mjoin( PATL, tbsvUH )( DIAG, N, K, A, LDA, x );
      else                     Mjoin( PATL, tbsvLH )( DIAG, N, K, A, LDA, x );
   }
#endif
   if( vx ) { Mjoin( PATL, copy )( N, x, 1, X, INCX ); free( vx ); }
/*
 * End of Mjoin( PATL, tbsv )
 */
}
コード例 #25
0
ファイル: cblas_zhbmv.c プロジェクト: apollos/atlas
void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const int K, const void *alpha, const void *A,
                 const int lda, const void *X, const int incX,
                 const void *beta, void *Y, const int incY)
{
   int info = 2000;
   const enum CBLAS_UPLO ruplo = (Uplo == CblasUpper) ? CblasLower : CblasUpper;
   void *vx;
   double *X0, *x = (double*) X;
   double *y = Y;
   const double *alp=alpha;
   const double *bet=beta;
   double calpha[2], cbeta[2];
   const double one[2] = {ATL_rone, ATL_rzero};
   calpha[0] = *alp;
   calpha[1] = -alp[1];
   cbeta[0] = *bet;
   cbeta[1] = -bet[1];

#ifndef NoCblasErrorChecks
   if (Order != CblasColMajor && Order != CblasRowMajor)
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (Uplo != CblasUpper && Uplo != CblasLower)
      info = cblas_errprn(2, info,
                          "Uplo must be %d or %d, but is set to %d",
                          CblasUpper, CblasLower, Uplo);

   if (N < 0) info = cblas_errprn(3, info,
                        "N cannot be less than zero; is set to %d.", N);
   if (K < 0)
      info = cblas_errprn(4, info, "Valid K: 0 < K < N; K=%d, N=%d.", K, N);
   if (lda < K+1) info = cblas_errprn(7, info,
      "lda cannot be less than K+1;  K=%d, lda=%d\n", K, lda);
   if (!incX) info = cblas_errprn(9, info,
                                  "incX cannot be zero; is set to %d.", incX);
   if (!incY) info = cblas_errprn(12, info,
                                  "incY cannot be zero; is set to %d.", incY);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_zhbmv", "");
      return;
   }
#endif

   if (incX < 0) x += (1-N)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;
   if (Order == CblasColMajor)
      ATL_zhbmv(Uplo, N, K, alpha, A, lda, x, incX, beta, y, incY);
   else
   {
      vx = malloc(ATL_Cachelen + 2*N*sizeof(double));
      ATL_assert(vx);
      X0 = x;
      x = ATL_AlignPtr(vx);
      ATL_zmoveConj(N, calpha, X0, incX, x, 1);
      if (*bet != ATL_rzero || bet[1] != ATL_rzero)
      {
         ATL_zscalConj(N, cbeta, y, incY);
         ATL_zhbmv(ruplo, N, K, one, A, lda, x, 1, one, y, incY);
      }
      else ATL_zhbmv(ruplo, N, K, one, A, lda, x, 1, beta, y, incY);
      free(vx);
      ATL_zscalConj(N, one, y, incY);
   }
}
コード例 #26
0
ファイル: ATL_hpmv.c プロジェクト: certik/vendor
void Mjoin( PATL, hpmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*gpmv0)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmv1)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE                       alphaY, beta0;
#define one                   ATL_rone
#define zero                  ATL_rzero
#else
   const TYPE                 * alphaY, * beta0;
   const TYPE                 one [2] = { ATL_rone,  ATL_rzero },
                              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void                       * vx = NULL, * vy = NULL;
   TYPE                       * A0, * A1, * x, * x0, * x1, * y, * y00, * y0,
                              * y1;
   int                        incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1,
                              n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }

   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }

   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );

   mb1 = N - ( ( N - 1 ) / mb ) * mb; incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );
      incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY;

      for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY,
           y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1; lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda );
      incXY = (mb SHIFT); x0 = x; y0 = y;

      for( n  = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
   if( vy )
   { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
コード例 #27
0
int Mjoin(PATL,NCmmJIK_c)
   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
    const int M, const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int lda, const TYPE *B, const int ldb,
    const SCALAR beta, TYPE *C, const int ldc)
/*
 * JIK loop-ordered matmul with no matrix copy
 */
{
   const int Mb = M / MB, Nb = N / NB, Kb = K / KB;
   const int mr = M - Mb*MB, nr = N - Nb*NB, kr = K - Kb*KB;
   int incAk, incAm, incAn, incBk, incBm, incBn;
   #define incCm MB
   const int incCn = ldc*NB - M + mr;
   int i, j, k;
   const TYPE *a=A, *b=B;
   TYPE *c=C;
   TYPE btmp;
   void *vp;
   TYPE *cp;
   void (*geadd)(const int M, const int N, const SCALAR scalar, const TYPE *A,
                  const int lda, const SCALAR beta, TYPE *C, const int ldc);
   void (*mm_bX)(const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mm_b1)(const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mmcu) (const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mm_fixedKcu)(const int M, const int N, const int K,
                       const SCALAR alpha, const TYPE *A, const int lda,
                       const TYPE *B, const int ldb, const
                       SCALAR beta, TYPE *C, const int ldc);

   if (TA == AtlasNoTrans)
   {
      if (TB == AtlasNoTrans)
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NN),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NN),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NN),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NN),0x0x0_aX_bX);
      }
      else
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NT),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NT),0x0x0_aX_bX);
      }
      incAk = lda * KB;
      incAm = MB - Kb * incAk;
      incAn = -Mb * MB;
   }
   else
   {
      if (TB == AtlasNoTrans)
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TN),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TN),0x0x0_aX_bX);
      }
      else
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX);
      }
      incAk = KB;
      incAm = lda*MB - Kb*KB;
      incAn = -lda*MB*Mb;
   }
   if (TB == AtlasNoTrans)
   {
      incBk = KB;
      incBm = -KB*Kb;
      incBn = ldb*NB;
   }
   else
   {
      incBk = KB*ldb;
      incBm = -Kb * incBk;
      incBn = NB;
   }

   if (alpha == ATL_rone)
   {
      if (beta == ATL_rzero) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b0);
      else if (beta == ATL_rone)
         geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b1);
      else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_bX);
   }
   else if (beta == ATL_rzero)
      geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b0);
   else if (beta == ATL_rone)
      geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b1);
   else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_bX);
   vp = malloc(ATL_Cachelen + ATL_MulBySize(MB * NB));
   ATL_assert(vp);
   cp = ATL_AlignPtr(vp);
   if (mr || nr || kr) for (j=MB*NB, i=0; i != j; i++) cp[i] = ATL_rzero;

   for (j=Nb; j; j--, a += incAn, b += incBn, c += incCn)
   {
      for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm)
      {
         if (Kb)
         {
            mm_bX(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
            a += incAk;  b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_b1(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
            if (kr)
               mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
         {
            Mjoin(PATL,zero)(MB*NB, cp, 1); /* kill NaN/INF from last time */
            mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         }
         geadd(MB, NB, alpha, cp, MB, beta, c, ldc);
      }
   }
   if (mr && N != nr)
      ATL_assert(Mjoin(PATL,NCmmIJK)(TA, TB, mr, N-nr, K, alpha,
                                     A+Mb*(incAm+Kb*incAk), lda, B, ldb,
                                     beta, C+Mb*MB, ldc) ==0);
   if (nr)
   {
      for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm)
      {
      Mjoin(PATL,zero)(MB*nr, cp, 1); /* kill NaN and INF from last time */
         if (Kb)
         {
            mm_fixedKcu(MB, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rzero,
                        cp, MB);
            a += incAk;  b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_fixedKcu(MB, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rone,
                           cp, MB);
            if (kr)
               mmcu(MB, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
            mmcu(MB, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         geadd(MB, nr, alpha, cp, MB, beta, c, ldc);
      }
      if (mr)  /* cleanup small mr x nr block of C */
      {
         c = C + Mb*MB + ldc*Nb*NB;
         a = A + Mb*(incAm+Kb*incAk);
         b = B + Nb*( incBn+(Mb*(incBm+Kb*incBk)) );
         Mjoin(PATL,zero)(MB*nr, cp, 1); /* kill NaN and INF from last time */
         if (Kb)
         {
            mm_fixedKcu(mr, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rzero,
                        cp, MB);
            a += incAk;  b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_fixedKcu(mr, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rone,
                           cp, MB);
            if (kr)
               mmcu(mr, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
            mmcu(mr, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         geadd(mr, nr, alpha, cp, MB, beta, c, ldc);
      }
   }
   free(vp);
   return(0);
}
コード例 #28
0
void Mjoin(PATL,CtrsmKL)
   (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag,
    const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    TYPE *B, const int ldb)
#endif
{
   TYPE tmp[2], ra, ia;
   void *vp;
   TYPE *a;

   if (N > 0)
   {
      if (M > 1)
      {
         vp = malloc(ATL_Cachelen + ATL_MulBySize(M)*M);
         ATL_assert(vp);
         a = ATL_AlignPtr(vp);
         Diag = trsmcopy(Uplo, Trans, Diag, M, alpha, A, lda, a);
         if (Trans != AtlasNoTrans)
         {
            if (Uplo == AtlasLower) Uplo = AtlasUpper;
            else Uplo = AtlasLower;
         }
         switch(M)
         {
         case 2:
            if (Uplo == AtlasLower) trsmLL_2(N, a, B, ldb);
            else trsmLU_2(N, a, B, ldb);
            break;
         case 3:
            if (Uplo == AtlasLower) trsmLL_3(N, a, B, ldb);
            else trsmLU_3(N, a, B, ldb);
            break;
         case 4:
            if (Uplo == AtlasLower) trsmLL_4(N, a, B, ldb);
            else trsmLU_4(N, a, B, ldb);
            break;
         default: /* this crap should never be used */
               tmp[0] = ATL_rone; tmp[1] = ATL_rzero;
               Mjoin(PATL,cplxinvert)(M, a, M+M+2, a, M+M+2);
               Mjoin(PATL,reftrsm)(AtlasLeft, Uplo, AtlasNoTrans, Diag, M, N,
                                   tmp, a, M, B, ldb);
         }
         free(vp);
      }
      else if (M == 1)
      {
         if (Diag == AtlasUnit)
         #ifdef Right_
            Mjoin(PATL,scal)(N, alpha, B, 1);
         #else
            Mjoin(PATL,scal)(N, alpha, B, ldb);
         #endif
         else
         {
            tmp[0] = A[0];
            if (Trans != AtlasConjTrans) tmp[1] = A[1];
            else tmp[1] = -A[1];
            Mjoin(PATL,cplxinvert)(1, tmp, 2, tmp, 2); /* safe cplx invers */
            ra = tmp[0]; ia = tmp[1];
            tmp[0] = *alpha * ra - alpha[1] * ia;
            tmp[1] = *alpha * ia + alpha[1] * ra;
            #ifdef Right_
               Mjoin(PATL,scal)(N, tmp, B, 1);
            #else
               Mjoin(PATL,scal)(N, tmp, B, ldb);
            #endif
         }
      }
   }
コード例 #29
0
ファイル: gemmtst.c プロジェクト: certik/vendor
int mmcase0(int MFLOP, int CACHESIZE, char TA, char TB, int M, int N, int K,
	    SCALAR alpha, int lda, int ldb, SCALAR beta, int ldc)
{
   char *pc;
#ifdef TREAL
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f  %5.1f  %6.2f %5.1f %5.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
   TYPE betinv, bet=beta;
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f %4.2f   %3s\n";
   TYPE betinv[2], *bet=beta;
#endif
   int nreps, incA, incB, incC, inc, nmat, k;
   TYPE *c, *C, *a, *A, *b, *B, *st;
   int ii, jj, i, j=0, PASSED, nerrs;
   double t0, t1, t2, t3, mflop, mf, mops;
   TYPE maxval, f1, ferr;
   static TYPE feps=0.0;
   static int itst=1;
   enum ATLAS_TRANS TAc, TBc;
   void *vp;

   #ifdef TCPLX
      if (*beta == 0.0 && beta[1] == 0.0) betinv[0] = betinv[1] = 0.0;
      else if (beta[1] == 0.0) { betinv[0] = 1 / *beta;  betinv[1] = 0.0; }
      else
      {
         t0 = *beta;
         t1 = beta[1];
         if (Mabs(t1) <= Mabs(t0))
         {
            t2 = t1 / t0;
            betinv[0] = t0 = 1.0 / (t0 + t1*t2);
            betinv[1] = -t0 * t2;
         }
         else
         {
            t2 = t0 / t1;
            betinv[1] = t0 = -1.0 / (t1 + t0*t2);
            betinv[0] = -t2 * t0;
         }
      }
      mops = ( ((8.0*M)*N)*K ) / 1000000.0;
   #else
      if (beta != 0.0) betinv = 1.0 / beta;
      else betinv = beta;
      mops = ( ((2.0*M)*N)*K ) / 1000000.0;
   #endif
   nreps = MFLOP / mops;
   if (nreps < 1) nreps = 1;
   if (TA == 'n' || TA == 'N')
   {
      TAc = AtlasNoTrans;
      incA = lda * K;
   }
   else
   {
      if (TA == 'c' || TA == 'C') TAc = AtlasConjTrans;
      else TAc = AtlasTrans;
      incA = lda * M;
   }
   if (TB == 'n' || TB == 'N')
   {
      incB = ldb * N;
      TBc = AtlasNoTrans;
   }
   else
   {
      incB = ldb * K;
      if (TB == 'c' || TB == 'C') TBc = AtlasConjTrans;
      else TBc = AtlasTrans;
   }
   incC = ldc*N;
   inc = incA + incB + incC;
   i = M*K + K*N + M*N;  /* amount of inc actually referenced */
   /* This is a hack; change to use of flushcache instead. */
   nmat = ((CACHESIZE/ATL_sizeof) + i)/i;
   vp = malloc(ATL_MulBySize(nmat*inc)+ATL_Cachelen);
   ATL_assert(vp);
   C = c = ATL_AlignPtr(vp);
   a = A = C + incC;
   b = B = A + incA;
   st = C + nmat*inc;
   matgen(inc, nmat, C, inc, M*N);

#ifdef DEBUG
   printmat("A0", M, K, A, lda);
   printmat("B0", K, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif

   t0 = time00();
   for (k=nreps; k; k--)
   {
      trusted_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }
   t1 = time00() - t0;
   t1 /= nreps;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else   /* flop rates actually 8MNK+12MN & 2MNK + 2MN, resp */
      mflop = mops / t1;
   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif

   matgen(inc, nmat, C, inc, M*N);
   t0 = time00();
   for (k=nreps; k; k--)
   {
      test_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }

   t2 = time00() - t0;
   t2 /= nreps;
   if (t2 <= 0.0) t2 = mflop = 0.0;
   else mflop = mops / t2;

   pc = "---";
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3, pc);
   free(vp);
   return(1);
}
コード例 #30
0
ファイル: ATL_trmv.c プロジェクト: AIDman/Kaldi
static int ATL_trmvLT
(
   const enum ATLAS_DIAG  Diag,
   const int nb,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   TYPE *X,
   ATL_CINT incX
)
/*
 * RETURNS: 0 if TRMV was performed, non-zero if nothing done
 */
{
   static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*);
   void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void *vp;
   TYPE *x, *y;
   const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT;
   size_t t0;
   #ifdef TCPLX
      size_t N2=N+N, lda2 = lda+lda;
      TYPE one[2] = {ATL_rone, ATL_rzero};
   #else
      #define N2 N
      #define lda2 lda
      #define one ATL_rone
   #endif
   const size_t incA = ((size_t)lda+1)*(nb SHIFT);
   ATL_CINT Nnb = ((N-1)/nb)*nb, Nr = N-Nnb;
   ATL_INT j;

   if (N < nb+nb)
      return(1);
   if (opsize > MY_CE)
      gemv = Mjoin(PATL,gemvT);
   else
      gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) :
             Mjoin(PATL,gemvT_L2);
   trmvK = (Diag == AtlasNonUnit) ? ATL_trmvLTNk : ATL_trmvLTUk;
/*
 * If X is aligned to Cachelen wt inc=1, use it as y
 */
   t0 = (size_t) X;
   if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0))
   {
      ATL_INT i;
      vp = malloc(ATL_Cachelen+ATL_MulBySize(N));
      if (!vp)
         return(2);
      x = ATL_AlignPtr(vp);
      y = X;
      for (i=0; i < N2; i++)
      {
         x[i] = X[i];
         X[i] = ATL_rzero;
      }

   }
   else  /* allocate both X and Y */
   {
      vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1);
      if (!vp)
         return(3);
      x = ATL_AlignPtr(vp);
      y = x + N2;
      y = ATL_AlignPtr(y);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
      Mjoin(PATL,zero)(N, y, 1);
   }
   for (j=0; j < Nnb; j += nb, A += incA)
   {
      #ifdef TCPLX
         const register size_t j2=j+j, nb2=nb+nb;
      #else
         #define j2 j
         #define nb2 nb
      #endif
      trmvK(nb, A, lda, x+j2, y+j2);
      gemv(N-j-nb, nb, one, A+nb2, lda, x+j2+nb2, 1, one, y+j2, 1);
      #ifndef TCPLX
         #undef j2
         #undef nb2
      #endif
   }
   #ifdef TCPLX
      j += j;
   #endif
   trmvK(Nr, A, lda, x+j, y+j);
   if (y != X)
      Mjoin(PATL,copy)(N, y, 1, X, incX);
   free(vp);
   return(0);
}