static int ATL_trmvLT
   (const enum ATLAS_DIAG Diag, const int nb, ATL_CINT N, const TYPE *A,
    ATL_CINT lda, TYPE *X, ATL_CINT incX)
/*
 * RETURNS: 0 if TRMV was performed, non-zero if nothing done
 */
{
   static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*);
   void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void *vp;
   TYPE *x, *y;
   const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT;
   size_t t0;
#ifdef TCPLX
   size_t N2=N+N, lda2 = lda+lda;
   TYPE one[2] = {ATL_rone, ATL_rzero};
#else
   #define N2 N
   #define lda2 lda
   #define one ATL_rone
#endif
   const size_t incA = ((size_t)lda+1)*(nb SHIFT);
   ATL_CINT Nnb = ((N-1)/nb)*nb, Nr = N-Nnb;
   ATL_INT j;

   if (N < nb+nb)
      return(1);
   if (opsize > MY_CE)
      gemv = Mjoin(PATL,gemvT);
   else
      gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ?
             Mjoin(PATL,gemvT_L1) : Mjoin(PATL,gemvT_L2);
   trmvK = (Diag == AtlasNonUnit) ? ATL_trmvLTNk : ATL_trmvLTUk;
/*
 * If X is aligned to Cachelen with inc=1, use it as y
 */
   t0 = (size_t) X;
   if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0))
   {
      ATL_INT i;

      vp = malloc(ATL_Cachelen + ATL_MulBySize(N));
      if (!vp)
         return(2);
      x = ATL_AlignPtr(vp);
      y = X;
      for (i=0; i < N2; i++)
      {
         x[i] = X[i];
         X[i] = ATL_rzero;
      }
   }
   else   /* allocate both X and Y */
   {
      vp = malloc((ATL_Cachelen + ATL_MulBySize(N))<<1);
      if (!vp)
         return(3);
      x = ATL_AlignPtr(vp);
      y = x + N2;
      y = ATL_AlignPtr(y);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
      Mjoin(PATL,zero)(N, y, 1);
   }
   for (j=0; j < Nnb; j += nb, A += incA)
   {
      #ifdef TCPLX
         const register size_t j2=j+j, nb2=nb+nb;
      #else
         #define j2 j
         #define nb2 nb
      #endif
      trmvK(nb, A, lda, x+j2, y+j2);
      gemv(N-j-nb, nb, one, A+nb2, lda, x+j2+nb2, 1, one, y+j2, 1);
      #ifndef TCPLX
         #undef j2
         #undef nb2
      #endif
   }
#ifdef TCPLX
   j += j;
#endif
   trmvK(Nr, A, lda, x+j, y+j);
   if (y != X)
      Mjoin(PATL,copy)(N, y, 1, X, incX);
   free(vp);
   return(0);
}
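/*
 * For reference, a minimal plain-C sketch of the blocked decomposition the
 * driver above performs (illustrative names, double precision, no ATLAS
 * macros or tuned kernels; non-unit diagonal, inc=1).  Each nb-wide column
 * panel of the lower-triangular L contributes one small triangular TRMV for
 * its diagonal block plus one transposed GEMV for the panel below it:
 */
#include <stddef.h>

/* y = L^T * x, L a column-major N x N non-unit lower-triangular matrix */
static void trmvLT_sketch(int N, int nb, const double *L, int ldl,
                          const double *x, double *y)
{
   int c, r, j;

   for (j=0; j < N; j++)
      y[j] = 0.0;
   for (j=0; j < N; j += nb)
   {
      const int jb = (N-j < nb) ? N-j : nb;      /* width of this panel */
      const double *P = L + (size_t)j*ldl + j;   /* diagonal block L(j,j) */

/*    y[j:j+jb] += L(j:j+jb, j:j+jb)^T * x[j:j+jb]   (trmvK above) */
      for (c=0; c < jb; c++)
         for (r=c; r < jb; r++)
            y[j+c] += P[(size_t)c*ldl + r] * x[j+r];
/*    y[j:j+jb] += L(j+jb:N, j:j+jb)^T * x[j+jb:N]   (gemvT above) */
      for (c=0; c < jb; c++)
         for (r=jb; r < N-j; r++)
            y[j+c] += P[(size_t)c*ldl + r] * x[j+r];
   }
}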
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
             const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};
   void *vp=NULL;
   TYPE *x, *xt;
   ATL_r1kern_t gerk;
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;

   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu,
                        &minM, &minN, &alignX, &ALIGNX2A, &alignXt,
                        &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's modulus with Cachelen match A's if X & A must share the
 *    same alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   vp = malloc((ATL_Cachelen + ATL_MulBySize(N))*(1+COPYX));
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   if (vp)
      free(vp);
}
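/*
 * The essential trick above: HER's A := alpha*x*conj(x)^T + A (alpha real)
 * is fed to a general rank-1 (GER) kernel by precomputing the second vector
 * xt[i] = alpha*conj(x[i]), so the kernel only ever sees a plain
 * A += x*xt^T on the stored triangle.  A hedged reference sketch of the
 * Lower case, using C99 complex for clarity although ATLAS stores
 * interleaved real pairs:
 */
#include <complex.h>
#include <stddef.h>

static void her_lower_sketch(int N, double alpha, const double complex *x,
                             double complex *A, int lda)
{
   int i, j;

   for (j=0; j < N; j++)
   {
      const double complex xt = alpha * conj(x[j]); /* scaled conj column */

      for (i=j; i < N; i++)
         A[(size_t)j*lda + i] += x[i] * xt;
/*    keep the diagonal exactly real, as the Hermitian format requires */
      A[(size_t)j*lda + j] = creal(A[(size_t)j*lda + j]);
   }
}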
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
#ifdef TREAL
   #define one ATL_rone
   #define Zero ATL_rzero
   TYPE alpha = alpha0, beta = beta0;
   const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);
#endif

   if (M < 1 || N < 1)   /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))   /* no contribution from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in the inner loop, which is bad news if M
 * is very small.  Call code that requires no copy of X & Y for these
 * degenerate cases.
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor.
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether the vectors need copying, and which one is scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make Y's modulus with Cachelen match A's if Y & A must share the
 *    same alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)        /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;    /* apply alpha to that one */
   else if (!COPYY && !COPYX) /* nobody currently being copied means */
   {                          /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);  /* apply alpha to vec needing least workspace */
      if (!ALPHA_IS_ONE)      /* force the copy only if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                       /* if both are being copied anyway */
      APPLYALPHAX = 0;        /* apply alpha during update of Y */
   if (COPYX | COPYY)   /* if I need to copy either vector */
   {                    /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb + COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX,
                                 beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else   /* if (COPYY) known true by surrounding if */
         y = vp;
      if (COPYY)
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU == 0; if so, nr is the remainder, and we do
 *    cleanup with axpy-based code
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT),
                                 lda, x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   }
   while (m);
   if (vp)
      free(vp);
}
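/*
 * The Cachelen arithmetic above deserves a gloss.  ATL_DivByCachelen and
 * ATL_MulByCachelen are shift-based divide/multiply by the cache-line size,
 * so "t - ATL_MulByCachelen(ATL_DivByCachelen(t))" is simply t's byte offset
 * within a cache line.  A hedged sketch of the two tests, with a
 * hypothetical 64-byte line standing in for ATL_Cachelen:
 */
#include <stdint.h>

#define CACHELEN 64   /* illustrative; the real value is tuned per arch */

/* nonzero if y's offset within a cache line differs from A's; kernels
 * flagged ALIGNY2A need these offsets to match, forcing a copy otherwise */
static int mismatched_to_A(const void *A, const void *y)
{
   return ((uintptr_t)A % CACHELEN) != ((uintptr_t)y % CACHELEN);
}

/* nonzero if y fails the kernel's absolute alignment requirement alignY */
static int misaligned(const void *y, uintptr_t alignY)
{
   return alignY && ((uintptr_t)y % alignY) != 0;
}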
void ATL_ger
   (const int M, const int N, const SCALAR alpha0, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
#ifdef ATL_CHOICE
   const size_t opsize = (M*N+M+N)*sizeof(TYPE)SHIFT;

   if (opsize <= ATL_MulBySize(ATL_L1elts))
      Mjoin(MY_GER,_L1)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else if (opsize <= MY_CE)
      Mjoin(MY_GER,_L2)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else
      Mjoin(MY_GER,_OOC)(M, N, alpha0, X, incX, Y, incY, A, lda);
#else
   void (*getX)(const int N, const SCALAR alpha, const TYPE *X,
                const int incX, TYPE *Y, const int incY);
   ATL_r1kern_t gerk;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
#ifdef TREAL
   #define one ATL_rone
   TYPE alpha = alpha0;
   const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);
#endif

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * Get gerk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX,
                        &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call AXPY-based routine
 */
   if (N < 4 || Nm < 1)
   {
      MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * ATLAS's GER kernels loop over M in the inner loop, which is bad news if M
 * is very small.  Call code that requires no copy of X & Y for these
 * degenerate cases.
 */
   if (M < 16 || M < minM)
   {
      MY_GERK_MLT16(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 *****************************************************************************
 Figure out whether the vectors need copying, and which one is scaled by alpha
 *****************************************************************************
 */
#ifdef Conj_
   COPYY = 1;
#else
   COPYY = (incY != 1);
   if (!COPYY && alignY)
   {
      t1 = (size_t) Y;
      COPYY = ((t1/alignY)*alignY != t1);
   }
#endif
   COPYX = (incX != 1);
   if (!COPYX)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's modulus with Cachelen match A's if X & A must share the
 *    same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   if (COPYX != COPYY)        /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;    /* apply alpha to that one */
   else if (!COPYY && !COPYX) /* nobody currently being copied means */
   {                          /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);  /* apply alpha to shorter vector */
      if (!ALPHA_IS_ONE)      /* force the copy only if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                       /* if both are being copied anyway */
      APPLYALPHAX = (M < N);  /* apply alpha to shorter vector */
   if (COPYX | COPYY)   /* if I need to copy either vector */
   {                    /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYX*mb + COPYY*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYY)
      {
         y = ATL_AlignPtr(vp);
         x = y + (N SHIFT);
         x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
         if (!APPLYALPHAX && !ALPHA_IS_ONE)   /* need to apply alpha to Y */
         {
            #ifdef Conj_
               Mjoin(PATL,moveConj)(N, alpha, Y, incY, y, 1);
            #else
               Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
            #endif
            alpha = one;
         }
         else   /* do not apply alpha */
            #ifdef Conj_
               Mjoin(PATL,copyConj)(N, Y, incY, y, 1);
            #else
               Mjoin(PATL,copy)(N, Y, incY, y, 1);
            #endif
      }
      else if (ALIGNX2A)
         x = ATL_Align2Ptr(vp, A);
      else
         x = ATL_AlignPtr(vp);
   }
   getX = (COPYX) ? Mjoin(PATL,cpsc) : NULL;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      if (getX)   /* copy X if necessary */
         getX(imb, alpha, X, incX, x, 1);
      else
         x = (TYPE*) X;
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb > minM)
         gerk(imb, Nm, x, y, A, lda);
      else
         Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda);
/*
 *    Some kernels require N%NU == 0; if so, nr is the remainder, and we do
 *    cleanup with axpy-based code
 */
      if (nr)
         Mjoin(PATL,gerk_axpy)(imb, nr, one, x, 1, y+(Nm SHIFT), 1,
                               A+((size_t)lda)*(Nm SHIFT), lda);
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
   }
   while (m);
   if (vp)
      free(vp);
#endif
}
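/*
 * A plain double-precision sketch of the blocked update the driver above
 * performs (illustrative names, no ATLAS kernels):  A := alpha*x*y^T + A,
 * with A an M x N column-major matrix streamed in mb-row panels so each
 * panel plus the live piece of x stays cache-resident; alpha is folded into
 * one vector, as the COPYX/COPYY logic above arranges.
 */
#include <stddef.h>

static void ger_sketch(int M, int N, double alpha, const double *x,
                       const double *y, double *A, int lda, int mb)
{
   int i, j, r;

   for (i=0; i < M; i += mb)
   {
      const int ib = (M-i < mb) ? M-i : mb;   /* rows in this panel */

      for (j=0; j < N; j++)
      {
         const double t = alpha * y[j];   /* alpha applied via y */
         for (r=0; r < ib; r++)
            A[(size_t)j*lda + i + r] += x[i+r] * t;
      }
   }
}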
void Mjoin(PATL,hemv)
   (const enum ATLAS_UPLO Uplo, const int N, const SCALAR alpha,
    const TYPE *A, const int lda, const TYPE *X, const int incX,
    const SCALAR beta, TYPE *Y, const int incY)
/*
 * Purpose
 * =======
 *
 * Mjoin(PATL,hemv) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix.
 *
 * This is a blocked version of the algorithm.  For a more detailed
 * description of the arguments of this function, see the reference
 * implementation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
{
   const int BETA0 = (*beta == ATL_rzero && beta[1] == ATL_rzero);
   const int BETA1 = (*beta == ATL_rone && beta[1] == ATL_rzero);
   const int ALPHA1 = (*alpha == ATL_rone && alpha[1] == ATL_rzero);
   const int ALPHA0 = (*alpha == ATL_rzero && alpha[1] == ATL_rzero);

   if (N <= 0 || (ALPHA0 && BETA1))
      return;
   if (ALPHA0)
   {
      if (BETA0)
         Mjoin(PATL,zero)(N, Y, incY);
      else
         Mjoin(PATL,scal)(N, beta, Y, incY);
      return;
   }
#ifdef USE_GEMV_BASED
   if (N >= 240)
   {
      void *vp=NULL;
      TYPE *x=(TYPE*)X, *y=Y, *xh, *yh;
      const size_t tX = (size_t)X, tY = (size_t)Y, N2 = N+N;
      const int COPYY = !(incY == 1 &&
                          (ATL_MulByCachelen(ATL_DivByCachelen(tY)) == tY));
      const int COPYX = !(incX == 1 && (COPYY || ALPHA1) &&
                          (ATL_MulByCachelen(ATL_DivByCachelen(tX)) == tX));
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE *calp=one, *cbet=one;
      TYPE *tp;

      tp = vp = malloc((COPYX+COPYY+2)*(ATL_Cachelen+ATL_MulBySize(N)));
      if (!vp)
      {
         Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
         return;
      }
      yh = ATL_AlignPtr(tp);
      Mjoin(PATL,zero)(N, yh, 1);
      tp = yh + N2;
      xh = ATL_AlignPtr(tp);
      tp = xh + N2;
      if (COPYX)
      {
         x = ATL_AlignPtr(tp);
         if (COPYY || ALPHA1)   /* copy x without applying alpha */
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            const TYPE *xx=X;

            for (i=0; i < N2; i += 2, xx += incX2)
            {
               xh[i] = x[i] = *xx;
               xh[i+1] = -(x[i+1] = xx[1]);
            }
         }
         else if (alpha[1] == ATL_rzero)   /* real alpha */
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            register const TYPE ra=(*alpha);
            const TYPE *xx=X;

            for (i=0; i < N2; i += 2, xx += incX2)
            {
               register TYPE rx = *xx, ix = xx[1];
               x[i] = rx*ra;
               x[i+1] = ix*ra;
               xh[i] = rx;
               xh[i+1] = -ix;
            }
         }
         else   /* fully complex alpha */
         {
            register ATL_INT i;
            register const size_t incX2 = incX+incX;
            register const TYPE ra=(*alpha), ia=alpha[1];
            const TYPE *xx=X;

            for (i=0; i < N2; i += 2, xx += incX2)
            {
               register TYPE rx = *xx, ix = xx[1];
               x[i] = rx*ra - ix*ia;
               x[i+1] = rx*ia + ix*ra;
               xh[i] = rx;
               xh[i+1] = -ix;
            }
         }
         tp = x + N2;
      }
      else
         Mjoin(PATL,copyConj)(N, X, incX, xh, 1);
      if (COPYY)
      {
         calp = alpha;
         cbet = beta;
         y = ATL_AlignPtr(tp);
         Mjoin(PATL,zero)(N, y, 1);
      }
      else if (BETA0)
         Mjoin(PATL,zero)(N, y, 1);
      else if (!BETA1)
         Mjoin(PATL,scal)(N, beta, y, 1);
      if (Uplo == AtlasLower)
         ATL_symvL(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
      else
         ATL_symvU(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
      if (COPYY)
      {
         Mjoin(PATL,axpbyConj)(N, alpha, yh, 1, calp, y, 1);
         Mjoin(PATL,axpby)(N, one, y, 1, cbet, Y, incY);
      }
      else
         Mjoin(PATL,axpyConj)(N, alpha, yh, 1, Y, incY);
      free(vp);
      return;
   }
#endif
   Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
}
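/*
 * Why the GEMV-based path above carries both x and xh = conj(x), and both
 * y and yh: with only one triangle of the Hermitian A stored (say the lower
 * triangle T, with strictly-lower part S), the product splits as
 *
 *    A*x = T*x + conj(S^T * conj(x)),
 *
 * so one pass over the stored triangle can accumulate y = T*x and
 * yh = S^T * conj(x) together, and the trailing axpyConj/axpbyConj calls add
 * alpha*conj(yh) into the result.  A direct (unblocked) sketch of this
 * identity, using C99 complex for clarity although ATLAS stores interleaved
 * real pairs:
 */
#include <complex.h>
#include <stddef.h>

/* Y = A*x for Hermitian A with only the lower triangle stored (column
 * major; imaginary parts of the diagonal assumed zero, per the BLAS) */
static void hemv_lower_sketch(int N, const double complex *A, int lda,
                              const double complex *x, double complex *Y)
{
   int i, j;

   for (i=0; i < N; i++)
      Y[i] = 0.0;
   for (j=0; j < N; j++)
   {
      Y[j] += creal(A[(size_t)j*lda + j]) * x[j];     /* diagonal */
      for (i=j+1; i < N; i++)                         /* strictly lower */
      {
         Y[i] += A[(size_t)j*lda + i] * x[j];         /* T*x term */
         Y[j] += conj(A[(size_t)j*lda + i]) * x[i];   /* conj(S^T conj(x)) */
      }
   }
}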