Example #1
File: ATL_gpmm.c  Project: apollos/atlas
void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB, const enum PACK_UPLO UC,
    const int M, const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const TYPE *B, const int IB, const int JB, const int ldb,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   int j;
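/*
 * CE_K: K-dimension blocking chosen so an NBxNB block plus two CE_K x NB
 * panels (of A and B) fit within CacheEdge bytes; when CacheEdge is not
 * defined, the full K is used instead.
 */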
   #ifdef CacheEdge
      static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K
   #endif
   if (!M || !N) return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm not yet implemented for complex,
 * so die if not really a dense gemm
 */
   #ifdef TCPLX
      ATL_assert (UA == PackGen && UB == PackGen && UC == PackGen);
      Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   #else
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
          A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda), B+MindexP(UB,IB,JB,ldb),
          Mpld(UB,JB,ldb), beta, UC, C+MindexP(UC,IC,JC,ldc), Mpld(UC,JC,ldc));
   #endif
}
Example #2
File: ATL_sprk.c  Project: certik/vendor
void Mjoin(PATL,sprk)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
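   /* when C is not stored packed (CP == 0), index it as a general dense matrix */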
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int j;
   #ifdef CacheEdge
      static const int CE_K = ((ATL_DivBySize(CacheEdge SHIFT)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K
   #endif

   if ((!N) || ((SCALAR_IS_ZERO(alpha) || (!K)) && (SCALAR_IS_ONE(beta))))
      return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      if (UC == CblasLower)
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(N-j, beta, C+MindexP(UC2,IC+j,JC+j,ldc), 1);
      }
      else /* UC == CblasUpper */
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(j+1, beta, C+MindexP(UC2,IC,JC+j,ldc), 1);
      }
      return;
   }
   Mjoin(PATL,sprk_rK)(UA, TA, UC, CP, N, K, CE_K, alpha, A, lda, beta, C, ldc);
}
Example #3
File: invtrsm.c  Project: certik/vendor
static double RunTiming
   (enum CBLAS_ORDER Order, enum TEST_UPLO Uplo, int N, int lda,
    int CacheSize, int nreps)
{
   TYPE *A, *a;
   const int incA = N*lda;
   int i, k;
   double t0, t1=0.0;

   if (nreps < 1) nreps = 1;
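/*
 * Allocate enough copies of A to cover at least 2*CacheSize bytes (and at
 * least nreps copies), so repeated inversions don't reuse cached data.
 */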
   i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
   k = i = (i + N*N-1) / (N*N);
   if (nreps > i) k = i = nreps;
   a = A = malloc(i * ATL_MulBySize(incA));
   if (A)
   {
      if (Uplo == TestGE)
         for (i=0; i < k; i++) Mjoin(PATL,gegen)(N, N, A+i*incA, lda, N+lda);
      else for (i=0; i < k; i++) hegen(Order, Uplo, N, A+i*incA, lda);

      t0 = time00();
      for (i=nreps; i; i--, a += incA) test_inv(Order, Uplo, N, a, lda);
      t1 = time00() - t0;
      free(A);
   }
   else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   return(t1/nreps);
}
Example #4
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            enum ATLAS_UPLO Uplo, int N, int lda)
{
   char *Ups, *Ord;
   TYPE resid = 0.0;
   double mflop, mflops, t0, tim=0.0;
   int nreps=1, passed, i, imem;
   const int incA = lda*N;
   TYPE *a, *A;

   mflops = N;
   mflops = (mflops*mflops*mflops) / 4.0;
   #ifdef TCPLX
      mflops *= 4.0;
   #endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero) resid =
      uumtest(Order, Uplo, CacheSize, N, lda, &tim);
   else resid = -1.0;

   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
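      /* repeat enough times that roughly MFLOP million flops are performed */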
      nreps = (mflops * 1000000);
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1) nreps = 1;
      imem = ATL_DivBySize(CacheSize) ATL_PTCACHEMUL;
      imem = (imem + 2*N*N-1) / (N*N);
      if (imem < nreps) imem = nreps;
      a = A = malloc(imem * ATL_MulBySize(incA));
      if (A != NULL)
      {
         for (i=0; i < imem; i++) lltgen(Uplo, N, A+i*incA, lda, N*1029+lda);
         t0 = time00();
         for (i=nreps; i; i--, a += incA)
            test_lauum(Order, Uplo, N, a, lda);
         tim = time00() - t0;
         tim /= nreps;
         free(A);
      }
      else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0) mflop = mflops / tim;
   else mflop = 0.0;
   if (Uplo == AtlasUpper) Ups = "Upper";
   else Ups = "Lower";
   if (Order == CblasColMajor) Ord = "Col";
   else Ord = "Row";
   fprintf(stdout, "%5d  %3s  %5s %6d %6d  %12.5f  %12.3f  %12e\n",
           nreps, Ord, Ups, N, lda, tim, mflop, resid);
   if (resid > thresh || resid != resid) passed = 0;
   else if (resid < 0.0) passed = -1;
   else passed = 1;
   return(passed);
}
Example #5
File: ATL_tger.c  Project: Leobin7/Kaldi
void Mjoin(Mjoin(PATL,t),MY_GER)
(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X,
 ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
    ATL_INT mb, nb, mu, nu, nblks, nrblks, ncblks, ldaP;
    ATL_TGER_t pd;
    int P;
    static TYPE *A0=NULL, *A0e=NULL;

    if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))  /* quick return if no-op */
        return;

    pd.M = M;
    pd.N = N;
    pd.incX = incX;
    pd.incY = incY;
    pd.lda = lda;
    pd.alpha = alpha;
    pd.X = X;
    pd.Y = Y;
    pd.A = A;
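    /* flg==1 when A appears to be the same matrix as on the previous call
     * (same start address, or same end of first column), else flg==2 */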
    pd.flg = (A0 == A || A0e == A+(M SHIFT)) ? 1 : 2;
    A0 = A;
    A0e = A+(M SHIFT);

    P = ATL_DivBySize(CacheEdge);
    P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
    P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P, since it hurts alignment */
//   printf("TGER, P=%d\n", P);
    P = Mmin(ATL_NTHREADS, P);
    /*
     * Make sure we don't overflow 32-bit integer lda
     */
    ldaP = P * lda;
    while ((size_t)ldaP != ((size_t)lda)*P)
    {
        P--;
        ldaP = P * lda;
    }
    if (P > 1)
        ATL_goparallel(P, MY_DOWORK_cols, &pd, NULL);
    else
        MY_GER1(M, N, alpha, X, incX, Y, incY, A, lda);
}
Example #6
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
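   /* flg bit 0 set iff A falls within the address range used on the last call */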
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
   #ifdef TREAL
      pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
   #else
      if (TA != AtlasNoTrans)
      {
         if (TA == AtlasConj)
            pd.flg |= 4;
         else if (TA == AtlasTrans)
            pd.flg |= 2;
         else /* if (TA == AtlasConjTrans) */
            pd.flg |= (2|4);
      }
   #endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
   P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
   if (TA == AtlasNoTrans || TA == AtlasConj)
      P = 1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
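      /* each of the P workers handles n columns; nr columns are left over */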
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, every worker needs a private M-length y.  Don't do this
 *    unless we are fairly sure the parallel gain dominates the cost of
 *    combining the private results.
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif

         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL
            pd.alpha = ATL_rone;
            pd.beta  = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta  = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all nodes' Ys into node 0's.  Extract node 0's y
 *       from the work array and combine it with the input array, applying
 *       both alpha and beta in the process.
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
Example #7
File: lutst.c  Project: apollos/atlas
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            int M, int N, int lda)
{
   char *cord = (Order == AtlasColMajor ? "Col" : "Row");
   const double maxMN = Mmax(M,N), minMN = Mmin(M,N);
   unsigned long nreps=0;
   int npiv=(-1), *ipiv;
   const int incA = (Order == AtlasColMajor ? N*lda : M*lda);
   double mflops, mflop, resid, tim=(-1.0), t0;
   TYPE *A, *a;
   int i;

   #ifdef TREAL
      mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) -
               (minMN*minMN) / 2.0;
   #else
      mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) +
                (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN;
   #endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
   {
      if (Order == AtlasColMajor)
         resid = lutestC(CacheSize, M, N, lda, &npiv, &tim);
      else resid = lutestR(CacheSize, M, N, lda, &npiv, &tim);
   }
   else resid = -1.0;
   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
      nreps = (mflops*1000000);
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1) nreps = 1;
      i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
      i = (i + M*N) / (M*N);
      if (i < nreps) i = nreps;  /* don't reuse mem or no pivoting */
      a = A = malloc(i * ATL_MulBySize(incA));
      if (A != NULL)
      {
         ipiv = malloc(Mmin(M,N)*sizeof(int));  /* what the hell - reuse ipiv */
         if (ipiv)
         {
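            /* fill all i copies of A with one gegen call by treating the
             * whole buffer as a single (i*incA) x 1 column */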
            Mjoin(PATL,gegen)(i*incA, 1, A, i*incA, incA+M+3012);
            t0 = time00();
            for (i=nreps; i; i--, a += incA)
               test_getrf(Order, M, N, a, lda, ipiv);
            tim = time00() - t0;
            tim /= nreps;
            if (npiv == 0) npiv = findnpvt(Mmin(M,N), ipiv);
            free(ipiv);
         }
         else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
         free(A);
      }
      else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   }
   if (tim > 0.0) mflop = mflops / tim;
   else mflop = 0.0;
   fprintf(stdout, "%5d  %3s   %6d %6d %6d %6d %9.3f %9.3f %9.3e\n",
           nreps, cord, M, N, lda, npiv, tim, mflop, resid);
   return(resid <= thresh);
}
Example #8
double gemvcase(const int MFLOP, const enum ATLAS_TRANS TA, const int l2size,
                const int M, const int N, const SCALAR alpha, const int lda,
                const SCALAR beta)
{
   unsigned long reps;
   int i, lx, ly, la;
   #ifdef TREAL
      const double flops = 2.0 * M * N;
   #else
      const double flops = 8.0 * M * N;
   #endif
   double ttest, mftest, t0;
   const int aincY=1, aincX=1, incY=1, incX=1;
   const int inca = (TA == AtlasNoTrans) ? lda * (N SHIFT) : lda * (M SHIFT);
   const int incx = N*incX SHIFT, incy = M*incY SHIFT;
   TYPE *a, *A, *stA, *A0, *x, *X, *X0, *stX, *y, *Y, *Y0, *stY;
   #ifdef TREAL
      const TYPE nbeta = -beta;
      TYPE bet = beta;
   #else
      const TYPE *bet = beta;
      TYPE nbeta[2];
      nbeta[0] = -beta[0]; nbeta[1] = -beta[1];
   #endif

   i = (ATL_DivBySize(l2size)+N-1)/N;
   if (i < 1) i = 1;
   lx = i * N * aincX;
   X0 = X = x = malloc(ATL_MulBySize(lx));
   if (x == NULL) return(-1);

   i = (ATL_DivBySize(l2size)+M-1)/M;
   if (i < 1) i = 1;
   ly = i * M * aincY;
   Y0 = Y = y = malloc(ATL_MulBySize(ly));
   if (y == NULL)
   {
      free(x);
      return(-1);
   }
   i = (ATL_DivBySize(l2size)+M*N)/(M*N);
   la = i * inca;
   A0 = A = a = malloc(ATL_MulBySize(la));
   if (a == NULL)
   {
      free(x);
      free(y);
      return(-1);
   }
   if (incX < 1)
   {
      stX = x;
      x = X = x + (lx SHIFT);
   }
   else stX = x + (lx SHIFT);
   if (incY < 1)
   {
      stY = y;
      y = Y = y + (ly SHIFT);
   }
   else stY = y + (ly SHIFT);
   stA = a + (la SHIFT);
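/*
 * The timed loop below cycles x, y and a through buffers covering at least
 * l2size bytes each, so repeated calls don't benefit from unrealistic cache
 * reuse.
 */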

   reps = (MFLOP * 1000000.0) / flops;
   if (reps < 1) reps = 1;
   Mjoin(PATL,gegen)(ly, 1, Y0, ly, M*incY);
   Mjoin(PATL,gegen)(lx, 1, X0, lx, N*incY+127*50+77);
   Mjoin(PATL,gegen)(la, 1, A0, la, N*M+513*7+90);

   t0 = time00();
   for (i=reps; i; i--)
   {
      #ifdef SYMM_
         Mjoin(PATL,symv)(AtlasLower, N, alpha, a, lda, x, incX, beta, y, incY);
      #else
         Mjoin(PATL,gemv)(TA, M, N, alpha, a, lda, x, incX, beta, y, incY);
      #endif
      x += incx;
      y += incy;
      a += inca;
      if (x == stX) x = X;
      if (y == stY)
      {
         y = Y;
         if (bet == beta) bet = nbeta;
         else bet = beta;
      }
      if (a == stA) a = A;
   }
   ttest = time00() - t0;

   if (ttest > 0.0) mftest = (reps * flops) / (1000000.0 * ttest);
   else mftest = 0.0;

   free(A0);
   free(X0);
   free(Y0);
   return(mftest);
}
Example #9
/*
 * This routine handles the case where N <= maxNB && K <= maxKB, so B is
 * only one block.  It is particularly important for the panel factorizations
 * of both LU and QR.
 */
int Mjoin(PATL,tammm_tNK)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CINT M,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *B,
   ATL_CINT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   ATL_SZT nmblks;
   amminfo_t mminfo;
   unsigned int i, mb, nb, kb, mu, nu, ku, P, mr;
   ATL_tamm_tNK_t pd;  /* problem definition structure */
   void *vp;

/*
 * Special case for tiny N&K, and large M
 */
   if (N >= ATL_AMM_MAXNB || K >= ATL_AMM_MAXKB || M < ATL_AMM_MAXMB ||
       M < Mmin(8,ATL_NTHREADS)*ATL_AMM_MAXMB)
      return(1);
   Mjoin(PATL,GetRankKInfo)(&mminfo, TA, TB, M, N, K, alpha, beta);
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.blk2c = mminfo.Cblk2cm;
   pd.amm_b0 = mminfo.amm_b0;
   pd.TA = (TA == AtlasTrans);
   pd.TB = (TB == AtlasTrans);
   pd.N = N;
   pd.K = K;
   pd.A = A;
   pd.B = B;
   pd.C = C;
   pd.lda = lda;
   pd.ldb = ldb;
   pd.ldc = ldc;
   pd.alpha = &alpha;
   pd.beta  = &beta;
   mu = mminfo.mu;
   nu = mminfo.nu;
   ku = mminfo.ku;
   pd.mb = mb = mminfo.mb;
   pd.nmu = mb / mu;
   pd.nnu = (N+nu-1)/nu;
   nb = pd.nnu * nu;
   kb = mminfo.kb;
   nmblks = M / mb;
   mr = M - nmblks*mb;
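/*
 * Set up the last M-block: a full mb block if M divides evenly, otherwise the
 * remainder rounded up to a multiple of mu.
 */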
   if (!mr)
   {
      pd.mbL = mr = mb;
      pd.nmuL = pd.nmu;
   }
   else
   {
      nmblks++;
      pd.nmuL = (mr+mu-1)/mu;
      pd.mbL = pd.nmuL * mu;
   }
   pd.mr = mr;
   pd.nmblks = nmblks;
   pd.KB0 = K;
   #if ATL_MAXKMAJ_RKK > 1
      if (ATL_AMMFLG_KMAJOR(mminfo.flag))
         pd.KB0 = ((K+ku-1)/ku)*ku;
   #endif
/*
 * Maximum scale is limited by NTHREADS or max number of M-blocks
 */
   P = (ATL_NTHREADS <= nmblks) ? ATL_NTHREADS : nmblks;
/*
 * We have a common B workspace of size KB0*nb; in addition, each node needs
 * private workspace for its A (mb*K) and C (mb*N) blocks, laid out in memory
 * as A,C.  We then add a safety margin of mu*nu*ku so advance loads don't
 * seg fault, and space for aligning the pointers.
 */
   pd.bsz = pd.KB0*nb;
   pd.wsz = mb*(pd.nnu*nu + pd.bsz) + 2*ATL_DivBySize(ATL_Cachelen);

   vp = malloc(ATL_MulBySize(pd.wsz*P + pd.bsz+mu*nu*ku) + ATL_Cachelen);
   if (!vp)
      return(2);
   pd.w = ATL_AlignPtr(vp);
   pd.MbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(nmblks, P), nmblks, 0);
   pd.BassgCtr = ATL_SetAtomicCount(1);
   pd.BdoneCtr = ATL_SetAtomicCount(1);
   #ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_tamm_tNK)(&ls, &ts);
   }
   #else
      ATL_goparallel(P, Mjoin(PATL,DoWork_tamm_tNK), &pd, NULL);
   #endif

   ATL_FreeAtomicCount(pd.BdoneCtr);
   ATL_FreeAtomicCount(pd.BassgCtr);
   ATL_FreeGlobalAtomicCount(pd.MbCtr);

   free(vp);
   return(0);
}
Example #10
File: ATL_hemv.c  Project: AIDman/Kaldi
static void ATL_symvL
(
   ATL_symvK_t symvK,
   const int NB,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *x,
   TYPE  *y,
   const TYPE *xt,
   TYPE  *yt
)
{
   const TYPE one[2] = {ATL_rone, ATL_rzero};
   ATL_INT Mmb2;
   ATL_INT Mmb, mr, MB, j;
   const size_t incA = (NB SHIFT)*lda;
   const size_t opsize = ((size_t)(N+8)*(N+4))*(sizeof(TYPE)>>1)SHIFT;
   void (*gemvT)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void (*gemvN)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
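/*
 * Select gemv kernels by operand footprint: generic gemvT plus L2-blocked
 * gemvN when the problem exceeds MY_CE, L1-blocked kernels when it fits in
 * L1, and L2-blocked kernels otherwise.
 */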

   if (opsize > MY_CE)
   {
      gemvT = Mjoin(PATL,gemvT);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
   else if (opsize <= ATL_MulBySize(ATL_L1elts))
   {
      gemvT = Mjoin(PATL,gemvT_L1);
      gemvN = Mjoin(PATL,gemvN_L1);
   }
   else
   {
      gemvT = Mjoin(PATL,gemvT_L2);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
/*
 * Choose MB such that A is retained in the L2 cache for the second GEMV call.
 * If the partial block is tiny, absorb it into the last block, since the
 * cache is not precise anyway.
 */
   MB = ATL_DivBySize(MY_CE) / NB;
   MB = (MB > N || MB < 240) ? N : MB;
   Mmb2 = Mmb+Mmb;
   for (j=0; j < N; j += NB, A += incA)
   {
      const register size_t j2 = j+j;
      register int i, nb=N-j;
      nb = (nb >= NB) ? NB : nb;
      symvK(AtlasLower, nb, one, A+j2, lda, x+j2, 1, one, y+j2, 1);
      for (i=j+nb; i < N; i += MB)
      {
         const register size_t i2 = i+i;
         register int mb = N-i;
         mb = (mb >= MB) ? MB : mb;
         gemvT(mb, nb, one, A+i2, lda, xt+i2, 1, one, yt+j2, 1);
         gemvN(mb, nb, one, A+i2, lda, x+j2, 1, one, y+i2, 1);
      }
   }
}