Example 1
void Mjoin(PATL,DOMVNWORK_cols)(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   ATL_TGEMV_t *pd = lp->opstruct;
   ATL_CINT N = pd->N, lda = pd->lda;
   const int P = tp->P;
   ATL_CINT nr = pd->nr;
   const int vrank = (!nr || (pd->flg & 1)) ? tp->rank : (P + tp->rank+nr-1)%P;
   ATL_INT n = pd->n;
   TYPE *y = pd->Y + (tp->rank)*((pd->M SHIFT)+ATL_Cachelen/sizeof(TYPE));
   const TYPE *a = pd->A + (lda SHIFT)*vrank;
   #ifdef TCPLX
      TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero};
      const enum ATLAS_TRANS TA = (pd->flg & 4) ? AtlasConj : AtlasNoTrans;
   #else
      const enum ATLAS_TRANS TA = AtlasNoTrans;
      #define one ATL_rone
      #define zero ATL_rzero
   #endif

   y = ATL_Align2Ptr(y, a);
   if (vrank < nr)
      n++;

   Mjoin(PATL,gemv)(TA, pd->M, n, one, a, lda*P,
                    pd->X+vrank*((pd->incX)SHIFT), P*pd->incX, zero, y, 1);
}
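
The worker above pulls out every P-th column of A starting at its virtual rank, so thread vrank touches columns vrank, vrank+P, vrank+2P, ...; when nr = N%P is nonzero, the first nr virtual ranks process one extra column (the n++ above). A minimal standalone sketch of that cyclic distribution, with hypothetical P and N and none of the ATLAS types:

#include <stdio.h>

int main(void)
{
   const int P = 4, N = 10;             /* hypothetical threads & columns */
   const int n = N / P, nr = N % P;
   for (int rank = 0; rank < P; rank++)
   {
      /* the remapped rank used when nr != 0 (mirrors the flg-based pick) */
      const int vrank = nr ? (P + rank + nr - 1) % P : rank;
      const int mycols = n + (vrank < nr); /* extra column if vrank < nr  */
      printf("rank %d -> vrank %d: cols %d, %d, ... (%d cols, stride %d)\n",
             rank, vrank, vrank, vrank + P, mycols, P);
   }
   return 0;
}
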
Example 2
void Mjoin(PATL,CombineMVN)
(
   void *vp,          /* void ptr to ATL_TGEMV_t struct given to threads */
   const int myrank,  /* my entry in MMNODE_t array */
   const int hisrank  /* array entry to be combined into mine */
)
{
   ATL_TGEMV_t *pd = vp;
   ATL_CINT M = pd->M;
   ATL_INT i;
   const int P = pd->P;
   const int vrank = (!pd->nr || (pd->flg & 1)) ?  myrank :
                     (P + myrank+pd->nr-1)%P;
   const int hvrank = (!pd->nr || (pd->flg & 1)) ?  hisrank :
                      (P + hisrank+pd->nr-1)%P;
   const TYPE *a = pd->A + (pd->lda SHIFT)*vrank;
   const TYPE *ha = pd->A + (pd->lda SHIFT)*hvrank;
   TYPE *y = pd->Y + (myrank)*((M SHIFT)+ATL_Cachelen/sizeof(TYPE));
   TYPE *hy = pd->Y + (hisrank)*((M SHIFT)+ATL_Cachelen/sizeof(TYPE));
   #ifdef TCPLX
      ATL_CINT M2 = M+M;
      const TYPE one[2] = {ATL_rone, ATL_rzero};
   #else
      #define one ATL_rone
   #endif

   y = ATL_Align2Ptr(y, a);
   hy = ATL_Align2Ptr(hy, ha);

#ifdef TCPLX
   for (i=0; i < M2; i++)
#else
   for (i=0; i < M; i++)
#endif
      y[i] += hy[i];
}
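
Both the worker and this combiner re-derive each thread's aligned y with ATL_Align2Ptr, which (as used here) makes the workspace pointer share A's offset within a cacheline; that is why each per-thread slot is padded by ATL_Cachelen/sizeof(TYPE) extra elements. A standalone sketch of that match-the-offset alignment, assuming 64-byte cachelines and a hypothetical align2ptr helper (not the ATLAS implementation):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CL 64                     /* assumed cacheline length in bytes */

/* Return the first address >= p whose offset within a CL-byte line
 * equals q's offset, so p gets the same relative alignment as q. */
static void *align2ptr(void *p, const void *q)
{
   const uintptr_t rp = (uintptr_t)p % CL, rq = (uintptr_t)q % CL;
   return (char*)p + (rq >= rp ? rq - rp : CL - rp + rq);
}

int main(void)
{
   double a[8];                   /* stands in for a thread's slice of A */
   void *buf = malloc(8*sizeof(double) + CL);  /* padded workspace slot  */
   if (!buf)
      return 1;
   double *y = align2ptr(buf, a);
   printf("a mod %d = %lu, y mod %d = %lu\n", CL,
          (unsigned long)((uintptr_t)a % CL), CL,
          (unsigned long)((uintptr_t)y % CL));
   free(buf);
   return 0;
}
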
Example 3
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
   #ifdef TREAL
      pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
   #else
      if (TA != AtlasNoTrans)
      {
         if (TA == AtlasConj)
            pd.flg |= 4;
         else if (TA == AtlasTrans)
            pd.flg |= 2;
         else /* if (TA == AtlasConjTrans) */
            pd.flg |= (2|4);
      }
   #endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
   P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
   if (TA == AtlasNoTrans || TA == AtlasConj)
      P = 1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif

         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL
            pd.alpha = ATL_rone;
            pd.beta  = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta  = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all nodes' Ys into node 0's.  Extract node 0's y
 *       from the work array, and combine it with the input array, applying
 *       both alpha and beta in the process
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serially
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
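
The driver's P heuristic is worth a note: it grows the thread count by one per CacheEdge-worth of matrix, rounds odd P up to even for alignment, caps it at ATL_NTHREADS, and then shrinks P until the column stride lda*P fits the integer type. A standalone sketch with hypothetical constants (the real code compares the truncated product rather than testing against INT32_MAX):

#include <stdint.h>
#include <stdio.h>

#define NTHREADS   8
#define CACHE_ELTS (524288/8)  /* hypothetical 512KB CacheEdge, in doubles */

static int pick_P(size_t M, size_t N, int32_t lda)
{
   size_t P = (M*N + CACHE_ELTS-1) / CACHE_ELTS; /* one proc per cache-full */
   if ((P & 1) && P > 1)
      P++;                                       /* odd P hurts alignment   */
   if (P > NTHREADS)
      P = NTHREADS;
   while (P > 1 && (int64_t)lda*(int64_t)P > INT32_MAX)
      P--;                      /* keep the 32-bit stride lda*P in range    */
   return (int)P;
}

int main(void)
{
   printf("P(500x500)  = %d\n", pick_P(500, 500, 500));    /* expect 4 */
   printf("P(4000x4000)= %d\n", pick_P(4000, 4000, 4000)); /* expect 8 */
   return 0;
}
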
Example 4
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
               const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};
   void *vp=NULL;
   TYPE *x, *xt;
   ATL_r1kern_t gerk;
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;
   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu,
                        &minM, &minN, &alignX, &ALIGNX2A, &alignXt,
                        &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's address modulo Cachelen match A's if you want A & X to have
 *    the same relative alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   vp = malloc((ATL_Cachelen+ATL_MulBySize(N))*(1+COPYX));
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   if (vp)
     free(vp);
}
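
The COPYX logic above is the recurring pattern in these routines: a unit-stride vector may still need copying, either because a kernel wants X to share A's cacheline offset (ALIGNX2A) or because it wants an absolute alignment (alignX). A standalone sketch of both tests, assuming 64-byte cachelines and a hypothetical need_copy helper:

#include <stdint.h>
#include <stdio.h>

#define CL 64   /* assumed ATL_Cachelen */

static int need_copy(const void *A, const void *X, int align2A, int alignX)
{
   if (align2A)  /* X must match A's offset within a cacheline */
      return ((uintptr_t)A % CL) != ((uintptr_t)X % CL);
   if (alignX)   /* X must sit on an alignX-byte boundary */
      return ((uintptr_t)X % (uintptr_t)alignX) != 0;
   return 0;     /* no alignment demands: use X in place */
}

int main(void)
{
   double A[4], X[4];
   printf("copy to match A: %d\n", need_copy(A, X, 1, 0));
   printf("copy for 16B:    %d\n", need_copy(A, X, 0, 16));
   return 0;
}
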
Example 5
void ATL_ger
   (const int M, const int N, const SCALAR alpha0, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
#ifdef ATL_CHOICE
   const size_t opsize = (M*N+M+N)*sizeof(TYPE)SHIFT;

   if (opsize <= ATL_MulBySize(ATL_L1elts))
      Mjoin(MY_GER,_L1)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else if (opsize <= MY_CE)
      Mjoin(MY_GER,_L2)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else
      Mjoin(MY_GER,_OOC)(M, N, alpha0, X, incX, Y, incY, A, lda);
#else
   void (*getX)(const int N, const SCALAR alpha, const TYPE *X,
                const int incX, TYPE *Y, const int incY);
   ATL_r1kern_t gerk;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   #ifdef TREAL
      #define one ATL_rone
      TYPE alpha = alpha0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * Get gerk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX,
                         &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call AXPY-based routine
 */
   if (N < 4 || Nm < 1)
   {
      MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * ATLAS's GER kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16 || M < minM)
   {
      MY_GERK_MLT16(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 *****************************************************************************
 Figure out whether vecs need to be copied, and which will be scaled by alpha
 *****************************************************************************
 */
   #ifdef Conj_
      COPYY = 1;
   #else
      COPYY = (incY != 1);
      if (!COPYY && alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   #endif
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's address modulo Cachelen match A's if you want A & X to have
 *    the same relative alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */
      if (!ALPHA_IS_ONE)       /* force copy if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYX*mb+COPYY*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYY)
      {
         y = ATL_AlignPtr(vp);
         x = y + (N SHIFT);
         x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
         if (!APPLYALPHAX && !ALPHA_IS_ONE)  /* need to apply alpha to Y */
         {
            #ifdef Conj_
               Mjoin(PATL,moveConj)(N, alpha, Y, incY, y, 1);
            #else
               Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
            #endif
            alpha = one;
         }
         else  /* do not apply alpha */
         #ifdef Conj_
            Mjoin(PATL,copyConj)(N, Y, incY, y, 1);
         #else
            Mjoin(PATL,copy)(N, Y, incY, y, 1);
         #endif
      }
      else if (ALIGNX2A)
         x = ATL_Align2Ptr(vp, A);
      else
         x = ATL_AlignPtr(vp);
   }
   getX = (COPYX) ? Mjoin(PATL,cpsc) : NULL;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      if (getX)    /* copy X if necessary */
         getX(imb, alpha, X, incX, x, 1);
      else
         x = (TYPE*) X;
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb > minM)
         gerk(imb, Nm, x, y, A, lda);
      else
         Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda);
/*
 *    Some kernels require N%NU=0; if so, nr holds the remainder; do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,gerk_axpy)(imb, nr, one, x, 1, y+(Nm SHIFT), 1,
                               A+((size_t)lda)*(Nm SHIFT), lda);
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);
   if (vp)
      free(vp);
#endif
}
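
Worth spelling out is the blocking factor shared by this routine and the two GEMV drivers below: CacheElts is the kernel's cache budget in elements, mb is carved out of it, rounded down to a multiple of the M unroll mu (or left at M when blocking isn't worthwhile), and the update then walks M in mb-high panels. A sketch with hypothetical numbers:

#include <stdio.h>

int main(void)
{
   const int CacheElts = 8192, mu = 8, nu = 4, M = 10000;
   int mb = (CacheElts - 2*nu) / (2*(nu+1)); /* budget per buffered column  */
   mb = (mb > mu) ? (mb/mu)*mu : M;          /* multiple of mu, else no block */
   mb = (mb > M) ? M : mb;
   printf("mb = %d\n", mb);                  /* expect 816 */

   int m = M, npanels = 0;
   do                                        /* panel loop over M */
   {
      const int imb = (mb < m) ? mb : m;
      /* ...copy imb elements of x, run the kernel on an imb x N panel... */
      m -= imb;
      npanels++;
   }
   while (m);
   printf("panels = %d\n", npanels);         /* expect ceil(10000/816) = 13 */
   return 0;
}
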
Example 6
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
   #ifdef TREAL
      #define one ATL_rone
      #define Zero ATL_rzero
      TYPE alpha = alpha0, beta = beta0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      TYPE Zero[2] = {ATL_rzero, ATL_rzero};
      TYPE *beta = (TYPE*) beta0;
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1)          /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether vecs need to be copied, and which will be scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make Y's address modulo Cachelen match A's if you want A & Y to have
 *    the same relative alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to vector requiring least */
      if (!ALPHA_IS_ONE)       /* workspace if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = 0;         /* apply alpha during update of Y */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb+COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else /* if (COPYY)  known true by surrounding if */
         y = vp;
      if (COPYY)
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so, nr holds the remainder; do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha, by using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   }
   while(m);
   if (vp)
      free(vp);
}
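
The copy-Y path above never touches the user's Y inside the kernel loop: each panel's A*x lands in the aligned workspace with alpha=1 and beta=0, and a single axpby pass folds in the original scalars. A minimal sketch of that fold (a plain loop standing in for the PATL axpby routine):

#include <stdio.h>

/* y := beta*y + alpha*x, the role axpby plays in the driver above */
static void axpby(int n, double alpha, const double *x, double beta, double *y)
{
   for (int i = 0; i < n; i++)
      y[i] = beta*y[i] + alpha*x[i];
}

int main(void)
{
   double Y[3] = {1, 1, 1};          /* user's Y                  */
   double y[3] = {10, 20, 30};       /* workspace holding A*x     */
   axpby(3, 0.5, y, 2.0, Y);         /* Y = beta0*Y + alpha0*y    */
   printf("%g %g %g\n", Y[0], Y[1], Y[2]);  /* expect 7 12 17     */
   return 0;
}
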
Example 7
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * Y = alpha * A^H * X + beta*Y
 * For conjugate transpose, first form x = conj(X), y = A^T * conj(X),
 * then use axpbyConj to add this to the original Y in the operation
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is
 * Y = beta*Y + alpha * A^H * X.
 */
{
   ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);

   if (M < 1 || N < 1)          /* F77 BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha0))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta0))
      {
         if (SCALAR_IS_ZERO(beta0))
            Mjoin(PATL,zero)(N, Y, incY);
         else
            Mjoin(PATL,scal)(N, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's MVT kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Get mvtk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu,
                            &minM, &minN, &alignX, &ALIGNX2A, &alignY,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
   vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen);
/*
 * If we cannot allocate enough space to copy the vectors, give up and
 * call the simple loop-based implementation
 */
   if (!vp)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   y = ATL_AlignPtr(vp);
   x = y + (N SHIFT);
   x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   beta = Zero;
/*
 * In this step, we form y = A^T * conj(X)
 */
   mvtk = mvtk_b0;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      Mjoin(PATL,copyConj)(imb, X, incX, x, 1);  /* x = conj(X) */
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvtk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so, nr holds the remainder; do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x, 1, beta, y+(Nm SHIFT), 1);
      beta = one;
      mvtk = mvtk_b1;
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);

/*
 * Given y = A^T * conj(X) from above, now do:
 *    Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * x), which is
 *    Y = beta*Y + alpha * A^H * x.
 */
   Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY);
   free(vp);
}
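
The whole routine rests on the identity A^H * X = conj(A^T * conj(X)), which lets the transpose kernels do the heavy lifting and confines conjugation to the vector copies and the final axpbyConj. A standalone check of the identity on a hypothetical 2x2 complex matrix:

#include <complex.h>
#include <stdio.h>

int main(void)
{
   double complex A[2][2] = {{1+2*I, 3-1*I}, {0+1*I, 2+0*I}};
   double complex X[2] = {1-1*I, 2+3*I}, y[2], d[2];
   for (int j = 0; j < 2; j++)
   {
      y[j] = d[j] = 0;
      for (int i = 0; i < 2; i++)
      {
         y[j] += A[i][j]*conj(X[i]);  /* y = A^T * conj(X): kernel's job */
         d[j] += conj(A[i][j])*X[i];  /* d = A^H * X computed directly   */
      }
   }
   for (int j = 0; j < 2; j++)        /* conj(y[j]) must equal d[j]      */
      printf("conj(y[%d]) = %+.1f%+.1fi   d[%d] = %+.1f%+.1fi\n",
             j, creal(conj(y[j])), cimag(conj(y[j])),
             j, creal(d[j]), cimag(d[j]));
   return 0;
}
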