/*
 * RsymvL: blocked computation of y += A*x for the lower-triangular-stored
 * symmetric matrix A (N x N, leading dimension lda), walking diagonal
 * blocks of size nb down the matrix.  For each diagonal block it calls the
 * reference kernel refsymvL, then applies the off-diagonal panel below the
 * block twice (once transposed, once not) to account for symmetry.
 *
 * NOTE(review): alpha is aliased to beta on entry, so this routine assumes
 * the caller has already folded scaling into X/Y (the public symv below
 * asserts alpha == beta == 1) -- confirm against other callers.
 * NOTE(review): A0 is assigned but never used; looks like leftover code.
 */
static void RsymvL(const int N, const TYPE *A, const int lda, const TYPE *X,
                   const SCALAR beta, TYPE *Y)
{
   int mb, nb, jb, jbs, j, m;
   const TYPE *x0=X, *A0=A;   /* x0/y0 track the current diagonal block's */
   const SCALAR alpha=beta;   /* x/y positions; alpha aliases beta (==1)  */
   TYPE *y0=Y;

   ATL_GetPartSYMV(A, lda, &mb, &nb);  /* pick cache-based blockings */
   for (j=0; j < N; j += nb)
   {
      jb = N-j;               /* jb = size of this diagonal block, */
      jb = Mmin(jb,nb);       /* clipped to nb at the matrix edge  */
      /* diagonal block: y0[0:jb] += A(j:j+jb, j:j+jb) * x0[0:jb] */
      refsymvL(jb, alpha, A, lda, X, 1, beta, Y, 1);
      m = N-j-jb;             /* rows remaining below the diagonal block */
      if (m)
      {
         jbs = jb SHIFT;      /* SHIFT doubles the count for complex types */
         X += jbs;
         Y += jbs;
         /* off-diagonal panel P = A(j+jb:N, j:j+jb), applied both ways: */
         /* y0 += P^T * x(below)  and  y(below) += P * x0                */
         gemvT(jb, m, alpha, A+jbs, lda, X, 1, beta, y0, 1);
         gemvN(m, jb, alpha, A+jbs, lda, x0, 1, beta, Y, 1);
         A += jbs*(lda+1);    /* advance A to the next diagonal block */
         x0 = X;
         y0 = Y;
      }
   }
}
/*
 * Type-prefixed SYMV driver (e.g. ATL_dsymv): y += A*x for a symmetric
 * matrix stored in the lower triangle.  Restricted entry point: asserts
 * unit strides, Lower storage, and alpha == beta == 1 (complex: 1+0i),
 * so all scaling must be done by the caller.
 *
 * Strategy: process the matrix bottom-up in row panels of mb rows.  For
 * each panel, RsymvL handles the diagonal block at column n, and the inner
 * loop sweeps the off-diagonal strip A(n:n+mb, 0:n) in nb-wide column
 * chunks, applying it transposed (into y1) and untransposed (into Y) for
 * symmetry.  The final RsymvL call handles the leftover top-left mb1 x mb1
 * corner (mb1 = N mod mb, or mb when N divides evenly).
 */
void Mjoin(PATL,symv)
   (const enum ATLAS_UPLO Uplo, const int N, const SCALAR alpha,
    const TYPE *A, const int lda, const TYPE *X, const int incX,
    const SCALAR beta, TYPE *Y, const int incY)
{
   int mb, nb, jb, mb1, incA1, incA, incXY, incXY1, n, j;
   const int lda2=(lda SHIFT);        /* ld in scalar elts (x2 if complex) */
   const TYPE *x0=X, *x1, *A0=A, *A1; /* x0/y0/A0 remember the originals   */
   TYPE *y1, *y0=Y;

   /* restricted kernel: unit stride, Lower storage, alpha = beta = 1 only */
   assert(incX==1 && incY==1 && Uplo == AtlasLower);
#ifdef TREAL
   assert(alpha == ATL_rone && beta == ATL_rone);
#else
   assert(*alpha == ATL_rone && *beta == ATL_rone);
   assert(alpha[1] == ATL_rzero && beta[1] == ATL_rzero);
#endif
   ATL_GetPartSYMV(A, lda, &mb, &nb); /* cache-derived blocking factors */
   mb1 = N - ( (N-1) / mb ) * mb;     /* size of leftover top corner    */
   incA1 = nb * lda2;                 /* column step for inner sweep    */
   incXY1 = (nb SHIFT);
   incA = incXY = mb SHIFT;           /* row-panel step (scalar elts)   */
   n = (N-mb)SHIFT;
   A += n; X += n; Y += n;            /* start at the bottom row panel  */
   for (n=N-mb; n > 0; n -= mb, A -= incA, X -= incXY, Y -= incXY)
   {
      /* diagonal block of this row panel: A(n:n+mb, n:n+mb) */
      RsymvL(mb, A+n*lda2, lda, X, beta, Y);
      /* off-diagonal strip A(n:n+mb, 0:n), swept in nb-wide chunks */
      for (j=0, A1=A, x1=x0, y1=y0; j < n;
           j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1)
      {
         jb = n - j;
         jb = Mmin(jb, nb);
         /* y1[0:jb]  += A1^T * X[0:mb]  (upper mirror contribution) */
         gemvT(jb, mb, alpha, A1, lda, X, 1, beta, y1, 1);
         /* Y[0:mb]   += A1   * x1[0:jb] (lower contribution)        */
         gemvN(mb, jb, alpha, A1, lda, x1, 1, beta, Y, 1);
      }
   }
   /* leftover top-left mb1 x mb1 corner */
   RsymvL(mb1, A0, lda, x0, beta, y0);
}
/*
 * ATL_symvL: blocked lower-symmetric matrix-vector multiply,
 *    y += A * x   (A symmetric, lower triangle stored, order N, ld lda),
 * where symvK is the caller-supplied kernel for the NB-wide diagonal
 * blocks, and (xt, yt) are the vectors used for the transposed
 * (upper-mirror) GEMV contribution -- presumably conjugated copies in the
 * complex case; confirm against callers.
 *
 * GEMV kernel selection is driven by the working-set size: problems that
 * overflow the chosen cache level (MY_CE) use the out-of-cache gemvT and
 * the L2-targeted gemvN; problems fitting in L1 use the L1 kernels;
 * everything between uses the L2 kernels.
 *
 * Fix vs. prior revision: removed the statement `Mmb2 = Mmb+Mmb;`, which
 * read the never-initialized variable Mmb (undefined behavior, C11
 * 6.3.2.1p2) into the never-read variable Mmb2, and dropped the unused
 * locals Mmb2, Mmb and mr.  No live computation is affected.
 */
static void ATL_symvL
(
   ATL_symvK_t symvK,   /* kernel applied to each NB x NB diagonal block */
   const int NB,        /* column blocking factor for the outer sweep    */
   ATL_CINT N,          /* order of A                                    */
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *x,       /* input vector (read by symvK and gemvN)        */
   TYPE *y,             /* output vector (written by symvK and gemvN)    */
   const TYPE *xt,      /* input vector for the transposed contribution  */
   TYPE *yt             /* output vector for the transposed contribution */
)
{
   const TYPE one[2] = {ATL_rone, ATL_rzero};  /* scalar 1 (1+0i complex) */
   ATL_INT MB, j;
   const size_t incA = (NB SHIFT)*lda;   /* bytes... elts per NB columns  */
   /* rough flop/footprint estimate used to pick cache-matched kernels */
   const size_t opsize = ((size_t)(N+8)*(N+4))*(sizeof(TYPE)>>1)SHIFT;
   void (*gemvT)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void (*gemvN)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);

   if (opsize > MY_CE)                   /* too big for target cache */
   {
      gemvT = Mjoin(PATL,gemvT);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
   else if (opsize <= ATL_MulBySize(ATL_L1elts))  /* fits in L1 */
   {
      gemvT = Mjoin(PATL,gemvT_L1);
      gemvN = Mjoin(PATL,gemvN_L1);
   }
   else                                  /* fits in L2 */
   {
      gemvT = Mjoin(PATL,gemvT_L2);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
/*
 * Choose MB such that A is retained in L2 cache for second GEMV call
 * If partial block is tiny, absorbe it into last block since cache is not
 * precise anyway.
 */
   MB = ATL_DivBySize(MY_CE) / NB;
   MB = (MB > N || MB < 240) ? N : MB;
   for (j=0; j < N; j += NB, A += incA)
   {
      const register size_t j2 = j+j;    /* scalar-element offset of col j */
      register int i, nb=N-j;
      nb = (nb >= NB) ? NB : nb;         /* clip diagonal block at edge */
      /* diagonal block: y[j:j+nb] += A(j:j+nb, j:j+nb) * x[j:j+nb] */
      symvK(AtlasLower, nb, one, A+j2, lda, x+j2, 1, one, y+j2, 1);
      /* panel below the diagonal block, in MB-row chunks */
      for (i=j+nb; i < N; i += MB)
      {
         const register size_t i2 = i+i;
         register int mb = N-i;
         mb = (mb >= MB) ? MB : mb;
         /* yt[j:j+nb] += A(i:i+mb, j:j+nb)^T * xt[i:i+mb]  (mirror) */
         gemvT(mb, nb, one, A+i2, lda, xt+i2, 1, one, yt+j2, 1);
         /* y[i:i+mb]  += A(i:i+mb, j:j+nb)   * x[j:j+nb]            */
         gemvN(mb, nb, one, A+i2, lda, x+j2, 1, one, y+i2, 1);
      }
   }
}