Example #1
0
static int ATL_trmvLT
(
   const enum ATLAS_DIAG  Diag,
   const int nb,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   TYPE *X,
   ATL_CINT incX
)
/*
 * RETURNS: 0 if TRMV was performed, non-zero if nothing done
 */
{
   static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*);
   void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void *vp;
   TYPE *x, *y;
   const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT;
   size_t t0;
   #ifdef TCPLX
      size_t N2=N+N, lda2 = lda+lda;
      TYPE one[2] = {ATL_rone, ATL_rzero};
   #else
      #define N2 N
      #define lda2 lda
      #define one ATL_rone
   #endif
   const size_t incA = ((size_t)lda+1)*(nb SHIFT);
   ATL_CINT Nnb = ((N-1)/nb)*nb, Nr = N-Nnb;
   ATL_INT j;

   if (N < nb+nb)
      return(1);
   if (opsize > MY_CE)
      gemv = Mjoin(PATL,gemvT);
   else
      gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) :
             Mjoin(PATL,gemvT_L2);
   trmvK = (Diag == AtlasNonUnit) ? ATL_trmvLTNk : ATL_trmvLTUk;
/*
 * If X is aligned to Cachelen wt inc=1, use it as y
 */
   t0 = (size_t) X;
   if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0))
   {
      ATL_INT i;
      vp = malloc(ATL_Cachelen+ATL_MulBySize(N));
      if (!vp)
         return(2);
      x = ATL_AlignPtr(vp);
      y = X;
      for (i=0; i < N2; i++)
      {
         x[i] = X[i];
         X[i] = ATL_rzero;
      }

   }
   else  /* allocate both X and Y */
   {
      vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1);
      if (!vp)
         return(3);
      x = ATL_AlignPtr(vp);
      y = x + N2;
      y = ATL_AlignPtr(y);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
      Mjoin(PATL,zero)(N, y, 1);
   }
   for (j=0; j < Nnb; j += nb, A += incA)
   {
      #ifdef TCPLX
         const register size_t j2=j+j, nb2=nb+nb;
      #else
         #define j2 j
         #define nb2 nb
      #endif
      trmvK(nb, A, lda, x+j2, y+j2);
      gemv(N-j-nb, nb, one, A+nb2, lda, x+j2+nb2, 1, one, y+j2, 1);
      #ifndef TCPLX
         #undef j2
         #undef nb2
      #endif
   }
   #ifdef TCPLX
      j += j;
   #endif
   trmvK(Nr, A, lda, x+j, y+j);
   if (y != X)
      Mjoin(PATL,copy)(N, y, 1, X, incX);
   free(vp);
   return(0);
}
Example #2
0
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
               const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};
   void *vp=NULL;
   TYPE *x, *xt;
   ATL_r1kern_t gerk;
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;
   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu,
                        &minM, &minN, &alignX, &ALIGNX2A, &alignXt,
                        &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   vp = malloc((ATL_Cachelen+ATL_MulBySize(N))*(1+COPYX));
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   if (vp)
     free(vp);
}
Example #3
0
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
   #ifdef TREAL
      #define one ATL_rone
      #define Zero ATL_rzero
      TYPE alpha = alpha0, beta = beta0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      TYPE Zero[2] = {ATL_rzero, ATL_rzero};
      TYPE *beta = (TYPE*) beta0;
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1)          /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple if NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to vector requiring least */
      if (!ALPHA_IS_ONE)       /* workspace if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = 0;         /* apply alpha during update of Y */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb+COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else /* if (COPYY)  known true by surrounding if */
         y = vp;
      if (COPYY)
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha, by using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   }
   while(m);
   if (vp)
      free(vp);
}
Example #4
0
void ATL_ger
   (const int M, const int N, const SCALAR alpha0, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
#ifdef ATL_CHOICE
   const size_t opsize = (M*N+M+N)*sizeof(TYPE)SHIFT;

   if (opsize <= ATL_MulBySize(ATL_L1elts))
      Mjoin(MY_GER,_L1)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else if (opsize <= MY_CE)
      Mjoin(MY_GER,_L2)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else
      Mjoin(MY_GER,_OOC)(M, N, alpha0, X, incX, Y, incY, A, lda);
#else
   void (*getX)(const int N, const SCALAR alpha, const TYPE *X,
                const int incX, TYPE *Y, const int incY);
   ATL_r1kern_t gerk;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   #ifdef TREAL
      #define one ATL_rone
      TYPE alpha = alpha0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * Get gerk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX,
                         &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 * Set up to handle case where kernel requres N to be a multiple if NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call AXPY-based routine
 */
   if (N < 4 || Nm < 1)
   {
      MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * ATLAS's GER kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of A & B for these degenerate
 * cases
 */
   if (M < 16 || M < minM)
   {
      MY_GERK_MLT16(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   #ifdef Conj_
      COPYY = 1;
   #else
      COPYY = (incY != 1);
      if (!COPYY && alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   #endif
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */
      if (!ALPHA_IS_ONE)       /* force copy if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYX*mb+COPYY*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYY)
      {
         y = ATL_AlignPtr(vp);
         x = y + (N SHIFT);
         x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
         if (!APPLYALPHAX && !ALPHA_IS_ONE)  /* need to apply alpha to Y */
         {
            #ifdef Conj_
               Mjoin(PATL,moveConj)(N, alpha, Y, incY, y, 1);
            #else
               Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
            #endif
            alpha = one;
         }
         else  /* do not apply alpha */
         #ifdef Conj_
            Mjoin(PATL,copyConj)(N, Y, incY, y, 1);
         #else
            Mjoin(PATL,copy)(N, Y, incY, y, 1);
         #endif
      }
      else if (ALIGNX2A)
         x = ATL_Align2Ptr(vp, A);
      else
         x = ATL_AlignPtr(vp);
   }
   getX = (COPYX) ? Mjoin(PATL,cpsc) : NULL;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      if (getX)    /* copy X if necessary */
         getX(imb, alpha, X, incX, x, 1);
      else
         x = (TYPE*) X;
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb > minM)
         gerk(imb, Nm, x, y, A, lda);
      else
         Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,gerk_axpy)(imb, nr, one, x, 1, y+(Nm SHIFT), 1,
                               A+((size_t)lda)*(Nm SHIFT), lda);
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);
   if (vp)
      free(vp);
#endif
}
Example #5
0
void Mjoin(PATL,hemv)
(
   const enum ATLAS_UPLO Uplo,
   const int             N,
   const SCALAR          alpha,
   const TYPE            *A,
   const int             lda,
   const TYPE            *X,
   const int             incX,
   const SCALAR          beta,
   TYPE                  *Y,
   const int             incY
)
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hemv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
{
   const int BETA0 = (*beta == ATL_rzero && beta[1] == ATL_rzero);
   const int BETA1 = (*beta == ATL_rone && beta[1] == ATL_rzero);
   const int ALPHA1 = (*alpha == ATL_rone && alpha[1] == ATL_rzero);
   const int ALPHA0 = (*alpha == ATL_rzero && alpha[1] == ATL_rzero);
   if (N <= 0 || (ALPHA0 && BETA1))
      return;
   if (ALPHA0)
   {
      if (BETA0)
         Mjoin(PATL,zero)(N, Y, incY);
      else
         Mjoin(PATL,scal)(N, beta, Y, incY);
      return;
   }
#ifdef USE_GEMV_BASED
   if (N >= 240)
   {
      void *vp=NULL;
      TYPE *x=(TYPE*)X, *y=Y, *xh, *yh;
      const size_t tX = (size_t)X, tY = (size_t)Y, N2 = N+N;
      const int COPYY = !(incY == 1 &&
                          (ATL_MulByCachelen(ATL_DivByCachelen(tY)) == tY));
      const int COPYX = !(incX == 1 && (COPYY || ALPHA1) &&
                          (ATL_MulByCachelen(ATL_DivByCachelen(tX)) == tX));
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE *calp=one, *cbet=one;
      TYPE *tp;
      tp = vp = malloc((COPYX+COPYY+2)*(ATL_Cachelen+ATL_MulBySize(N)));
      if (!vp)
      {
         Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
         return;
      }
      yh = ATL_AlignPtr(tp);
      Mjoin(PATL,zero)(N, yh, 1);
      tp = yh + N2;
      xh = ATL_AlignPtr(tp);
      tp = xh + N2;
      if (COPYX)
      {
         x = ATL_AlignPtr(tp);
         if (COPYY || ALPHA1)
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            const TYPE *xx=X;
            for (i=0; i < N2; i += 2, xx += incX2)
            {
               xh[i] = x[i] = *xx;
               xh[i+1] = -(x[i+1] = xx[1]);
            }
         }
         else if (alpha[1] == ATL_rzero)
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            register const TYPE ra=(*alpha), ia=alpha[1];
            const TYPE *xx=X;
            for (i=0; i < N2; i += 2, xx += incX2)
            {
               register TYPE rx = *xx, ix = xx[1];
               x[i] = rx*ra;
               x[i+1] = ix*ra;
               xh[i] = rx;
               xh[i+1] = -ix;
            }
         }
         else
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            register const TYPE ra=(*alpha), ia=alpha[1];
            const TYPE *xx=X;
            for (i=0; i < N2; i += 2, xx += incX2)
            {
               register TYPE rx = *xx, ix = xx[1];
               x[i] = rx*ra - ix*ia;
               x[i+1] = rx*ia + ix*ra;
               xh[i] = rx;
               xh[i+1] = -ix;
            }
         }
         tp = x + N2;
      }
      else
         Mjoin(PATL,copyConj)(N, X, incX, xh, 1);
      if (COPYY)
      {
         calp = alpha;
         cbet = beta;
         y = ATL_AlignPtr(tp);
         Mjoin(PATL,zero)(N, y, 1);
      }
      else if (BETA0)
         Mjoin(PATL,zero)(N, y, 1);
      else if (!BETA1)
         Mjoin(PATL,scal)(N, beta, y, 1);
      if (Uplo == AtlasLower)
         ATL_symvL(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
      else
         ATL_symvU(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
      if (COPYY)
      {
         Mjoin(PATL,axpbyConj)(N, alpha, yh, 1, calp, y, 1);
         Mjoin(PATL,axpby)(N, one, y, 1, cbet, Y, incY);
      }
      else
         Mjoin(PATL,axpyConj)(N, alpha, yh, 1, Y, incY);
      free(vp);
      return;
   }
#endif
   Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
}