void Mjoin(PATL,DOMVNWORK_cols)(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   ATL_TGEMV_t *pd = lp->opstruct;
   ATL_CINT N = pd->N, lda = pd->lda;
   const int P = tp->P;
   ATL_CINT nr = pd->nr;
   const int vrank = (!nr || (pd->flg & 1)) ? tp->rank : (P + tp->rank+nr-1)%P;
   ATL_INT n = pd->n;
   TYPE *y = pd->Y + (tp->rank)*((pd->M SHIFT)+ATL_Cachelen/sizeof(TYPE));
   const TYPE *a = pd->A + (lda SHIFT)*vrank;
#ifdef TCPLX
   TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero};
   const enum ATLAS_TRANS TA = (pd->flg & 4) ? AtlasConj : AtlasNoTrans;
#else
   const enum ATLAS_TRANS TA = AtlasNoTrans;
   #define one ATL_rone
   #define zero ATL_rzero
#endif
   y = ATL_Align2Ptr(y, a);
   if (vrank < nr)
      n++;
   Mjoin(PATL,gemv)(TA, pd->M, n, one, a, lda*P,
                    pd->X+vrank*((pd->incX)SHIFT), P*pd->incX, zero, y, 1);
}
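/*
 * Illustrative sketch (not ATLAS code): the worker above hands each thread a
 * cyclic set of columns -- virtual rank v owns columns v, v+P, v+2P, ... --
 * which is why it passes lda*P and an X stride of P*incX to the serial gemv,
 * and why the first nr virtual ranks get one extra column.  The hypothetical
 * helper below just prints that partitioning for a small N and P, with the
 * flg-based rank rotation simplified away.
 */
#include <stdio.h>

static void show_cyclic_partition(int N, int P)
{
   int n = N / P, nr = N - n*P, v;
   for (v=0; v < P; v++)
      printf("vrank %d: %d columns, starting at col %d, stride %d\n",
             v, n + (v < nr), v, P);
}

int main(void)
{
   show_cyclic_partition(10, 4);   /* e.g. N=10 columns over P=4 workers */
   return 0;
}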
void Mjoin(PATL,CombineMVN)
(
   void *vp,            /* void ptr to ATL_GEMV_t struct given to threads */
   const int myrank,    /* my entry in MMNODE_t array */
   const int hisrank    /* array entry to be combined into mine */
)
{
   ATL_TGEMV_t *pd = vp;
   ATL_CINT M = pd->M;
   ATL_INT i;
   const int P = pd->P;
   const int vrank = (!pd->nr || (pd->flg & 1)) ?
                     myrank : (P + myrank+pd->nr-1)%P;
   const int hvrank = (!pd->nr || (pd->flg & 1)) ?
                      hisrank : (P + hisrank+pd->nr-1)%P;
   const TYPE *a = pd->A + (pd->lda SHIFT)*vrank;
   const TYPE *ha = pd->A + (pd->lda SHIFT)*hvrank;
   TYPE *y = pd->Y + (myrank)*((M SHIFT)+ATL_Cachelen/sizeof(TYPE));
   TYPE *hy = pd->Y + (hisrank)*((M SHIFT)+ATL_Cachelen/sizeof(TYPE));
#ifdef TCPLX
   ATL_CINT M2 = M+M;
   const TYPE one[2] = {ATL_rone, ATL_rzero};
#else
   #define one ATL_rone
#endif
   y = ATL_Align2Ptr(y, a);
   hy = ATL_Align2Ptr(hy, ha);
#ifdef TCPLX
   for (i=0; i < M2; i++)
#else
   for (i=0; i < M; i++)
#endif
      y[i] += hy[i];
}
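/*
 * Illustrative sketch (not ATLAS code): the combine routine above is a
 * pairwise reduction -- it adds thread hisrank's private y into thread
 * myrank's copy.  A launcher like ATL_goparallel can then fold all P partial
 * vectors into rank 0's buffer; the toy code below does the same pairwise
 * folding on plain double arrays with a simple binary tree (the real combine
 * order is up to the ATLAS runtime, so treat this only as a model).
 */
#include <stdio.h>

#define NP 4     /* hypothetical thread count */
#define LEN 3    /* hypothetical vector length */

static void combine(double y[][LEN], int mine, int his)
{
   int i;
   for (i=0; i < LEN; i++)
      y[mine][i] += y[his][i];   /* same role as CombineMVN's y[i] += hy[i] */
}

int main(void)
{
   double y[NP][LEN] = {{1,1,1},{2,2,2},{3,3,3},{4,4,4}};
   int stride, r, i;
   for (stride=1; stride < NP; stride += stride)   /* binary combine tree */
      for (r=0; r+stride < NP; r += 2*stride)
         combine(y, r, r+stride);
   for (i=0; i < LEN; i++)        /* rank 0 now holds the full reduction */
      printf("y0[%d] = %g\n", i, y[0][i]);
   return 0;
}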
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
#ifdef TREAL
   pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
#else
   if (TA != AtlasNoTrans)
   {
      if (TA == AtlasConj)
         pd.flg |= 4;
      else if (TA == AtlasTrans)
         pd.flg |= 2;
      else /* if (TA == AtlasConjTrans) */
         pd.flg |= (2|4);
   }
#endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;     /* add more procs only when cache is full */
   P = (P&1 && P > 1) ? P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
   if (TA == AtlasNoTrans || TA == AtlasConj)
      P = 1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
#ifdef TCPLX
         TYPE one[2] = {ATL_rone, ATL_rzero};
         TYPE zero[2] = {ATL_rzero, ATL_rzero};
#endif
         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
#ifdef TREAL
         pd.alpha = ATL_rone;
         pd.beta = ATL_rzero;
#else
         pd.alpha = one;
         pd.beta = zero;
#endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all nodes' Ys to node 0's.  Extract his from the
 *       work array, and combine it with input array, applying both alpha
 *       and beta in the process
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
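/*
 * Illustrative sketch (not ATLAS code): the driver above adds roughly one
 * thread per CacheEdge-sized piece of A, bumps odd counts to even, and caps
 * the result at ATL_NTHREADS (with the no-transpose cases forced serial).
 * The hypothetical helper below repeats just that arithmetic for a chosen
 * cache-edge size and thread cap so the heuristic can be inspected on its own.
 */
#include <stdio.h>
#include <stddef.h>

static int choose_nthreads(size_t M, size_t N, size_t elt_size,
                           size_t cache_edge_bytes, int max_threads)
{
   size_t elts = cache_edge_bytes / elt_size;   /* elements per cache-full */
   size_t P = (M*N + elts - 1) / elts;          /* one proc per cache-full */
   if ((P & 1) && P > 1)                        /* avoid odd P */
      P++;
   if (P > (size_t)max_threads)
      P = max_threads;
   return (int)P;
}

int main(void)
{
   /* e.g. a 4MB cache edge, an 8-core machine, double precision */
   printf("P = %d\n", choose_nthreads(2000, 2000, sizeof(double), 4UL<<20, 8));
   return 0;
}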
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
             const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};
   void *vp=NULL;
   TYPE *x, *xt;
   ATL_r1kern_t gerk;
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;

   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu, &minM,
                        &minN, &alignX, &ALIGNX2A, &alignXt, &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   vp = malloc((ATL_Cachelen+ATL_MulBySize(N))*(1+COPYX));
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   if (vp)
      free(vp);
}
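/*
 * Illustrative sketch (not ATLAS code): in the ALIGNX2A branch above, X is
 * copied whenever its byte offset within a cache line differs from A's, so a
 * kernel tuned for mutually aligned operands sees the layout it expects.  The
 * hypothetical helper below does the same comparison with a fixed 64-byte
 * line standing in for ATL_Cachelen and plain modulo arithmetic standing in
 * for the ATL_MulByCachelen/ATL_DivByCachelen macros.
 */
#include <stdio.h>
#include <stdint.h>

#define LINE 64   /* assumed cache-line length (ATL_Cachelen analogue) */

static int need_copy_for_alignment(const void *A, const void *X)
{
   uintptr_t offA = (uintptr_t)A % LINE;   /* A's offset within its line */
   uintptr_t offX = (uintptr_t)X % LINE;   /* X's offset within its line */
   return offA != offX;                    /* copy X if the offsets differ */
}

int main(void)
{
   double A[16], X[16];
   printf("copy needed: %d\n", need_copy_for_alignment(A, X));
   printf("copy needed: %d\n", need_copy_for_alignment(A, A+1));
   return 0;
}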
void ATL_ger
   (const int M, const int N, const SCALAR alpha0, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
#ifdef ATL_CHOICE
   const size_t opsize = (M*N+M+N)*sizeof(TYPE)SHIFT;

   if (opsize <= ATL_MulBySize(ATL_L1elts))
      Mjoin(MY_GER,_L1)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else if (opsize <= MY_CE)
      Mjoin(MY_GER,_L2)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else
      Mjoin(MY_GER,_OOC)(M, N, alpha0, X, incX, Y, incY, A, lda);
#else
   void (*getX)(const int N, const SCALAR alpha, const TYPE *X,
                const int incX, TYPE *Y, const int incY);
   ATL_r1kern_t gerk;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
#ifdef TREAL
   #define one ATL_rone
   TYPE alpha = alpha0;
   const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
#endif

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * Get gerk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minN,
                        &alignX, &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call AXPY-based routine
 */
   if (N < 4 || Nm < 1)
   {
      MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * ATLAS's GER kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16 || M < minM)
   {
      MY_GERK_MLT16(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
#ifdef Conj_
   COPYY = 1;
#else
   COPYY = (incY != 1);
   if (!COPYY && alignY)
   {
      t1 = (size_t) Y;
      COPYY = ((t1/alignY)*alignY != t1);
   }
#endif
   COPYX = (incX != 1);
   if (!COPYX)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */
      if (!ALPHA_IS_ONE)       /* force copy if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */
   if (COPYX | COPYY)          /* if I need to copy either vector */
   {                           /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYX*mb+COPYY*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYY)
      {
         y = ATL_AlignPtr(vp);
         x = y + (N SHIFT);
         x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
         if (!APPLYALPHAX && !ALPHA_IS_ONE)   /* need to apply alpha to Y */
         {
#ifdef Conj_
            Mjoin(PATL,moveConj)(N, alpha, Y, incY, y, 1);
#else
            Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
#endif
            alpha = one;
         }
         else                                 /* do not apply alpha */
#ifdef Conj_
            Mjoin(PATL,copyConj)(N, Y, incY, y, 1);
#else
            Mjoin(PATL,copy)(N, Y, incY, y, 1);
#endif
      }
      else if (ALIGNX2A)
         x = ATL_Align2Ptr(vp, A);
      else
         x = ATL_AlignPtr(vp);
   }
   getX = (COPYX) ? Mjoin(PATL,cpsc) : NULL;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      if (getX)   /* copy X if necessary */
         getX(imb, alpha, X, incX, x, 1);
      else
         x = (TYPE*) X;
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb > minM)
         gerk(imb, Nm, x, y, A, lda);
      else
         Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,gerk_axpy)(imb, nr, one, x, 1, y+(Nm SHIFT), 1,
                               A+((size_t)lda)*(Nm SHIFT), lda);
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   } while(m);
   if (vp)
      free(vp);
#endif
}
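/*
 * Illustrative sketch (not ATLAS code): both this GER driver and the GEMV
 * drivers below block M with mb = (CacheElts - 2*nu) / (2*(nu+1)), round it
 * down to a multiple of the kernel's M-unrolling mu, and clip it at M (or use
 * all of M when no cache estimate is available).  The hypothetical helper
 * below repeats that arithmetic for made-up kernel parameters.
 */
#include <stdio.h>

static int block_M(int M, int mu, int nu, int CacheElts)
{
   int mb;
   if (!CacheElts)                      /* no cache guidance: take all of M */
      return M;
   mb = (CacheElts - 2*nu) / (2*(nu+1));
   mb = (mb > mu) ? (mb/mu)*mu : M;     /* round down to a multiple of mu */
   return (mb > M) ? M : mb;            /* never exceed M itself */
}

int main(void)
{
   /* e.g. mu=8, nu=4, a 16K-element cache budget, M=100000 */
   printf("mb = %d\n", block_M(100000, 8, 4, 16384));
   return 0;
}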
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
#ifdef TREAL
   #define one ATL_rone
   #define Zero ATL_rzero
   TYPE alpha = alpha0, beta = beta0;
   const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
#endif

   if (M < 1 || N < 1)   /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor.
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)   /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to vector requiring least */
      if (!ALPHA_IS_ONE)       /* workspace if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = 0;         /* apply alpha during update of Y */
   if (COPYX | COPYY)          /* if I need to copy either vector */
   {                           /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb+COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else   /* if (COPYY) known true by surrounding if */
         y = vp;
      if (COPYY)
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha, by using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   } while(m);
   if (vp)
      free(vp);
}
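/*
 * Illustrative sketch (not ATLAS code): when Y is copied, the kernel above
 * runs with alpha=1 and beta=0 into the workspace y, and the true scaling is
 * applied afterwards as Y = beta0*Y + alpha0*y (the axpby call).  The toy
 * check below confirms that this two-step formulation matches a direct
 * Y = alpha0*A*x + beta0*Y on a small dense problem (row-major here purely
 * for brevity).
 */
#include <stdio.h>
#include <math.h>

#define NROW 4
#define NCOL 3

int main(void)
{
   double A[NROW][NCOL] = {{1,2,3},{4,5,6},{7,8,9},{10,11,12}};
   double x[NCOL] = {1, -1, 2}, Y[NROW] = {1, 2, 3, 4}, Yref[NROW];
   double w[NROW];                  /* stands in for the copied workspace y */
   double alpha0 = 2.0, beta0 = 0.5;
   int i, j;

   for (i=0; i < NROW; i++)
   {
      double dot = 0.0;
      for (j=0; j < NCOL; j++)
         dot += A[i][j]*x[j];
      Yref[i] = alpha0*dot + beta0*Y[i];   /* direct gemv */
      w[i] = dot;                          /* kernel output, alpha=1, beta=0 */
   }
   for (i=0; i < NROW; i++)
      Y[i] = beta0*Y[i] + alpha0*w[i];     /* axpby-style combine */
   for (i=0; i < NROW; i++)
      printf("row %d: diff = %g\n", i, fabs(Y[i]-Yref[i]));
   return 0;
}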
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * Y = alpha*conj(A)^T*X + beta*Y
 * For Conjugate transpose, first form x = conj(X), y = A^T * conj(X),
 * then use axpbyConj to add this to original Y in the operation
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is
 * Y = beta*Y + alpha * A^H * X.
 */
{
   ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);

   if (M < 1 || N < 1)   /* F77 BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha0))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta0))
      {
         if (SCALAR_IS_ZERO(beta0))
            Mjoin(PATL,zero)(N, Y, incY);
         else
            Mjoin(PATL,scal)(N, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's MVT kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Get mvtk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu, &minM, &minN,
                            &alignX, &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
   vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen);
/*
 * If we cannot allocate enough space to copy the vectors, give up and
 * call the simple loop-based implementation
 */
   if (!vp)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   y = ATL_AlignPtr(vp);
   x = y + (N SHIFT);
   x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   beta = Zero;
/*
 * In this step, we form y = A^T * conj(X)
 */
   mvtk = mvtk_b0;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      Mjoin(PATL,copyConj)(imb, X, incX, x, 1);   /* x = conj(X) */
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvtk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x, 1, beta, y+(Nm SHIFT), 1);
      beta = one;
      mvtk = mvtk_b1;
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   } while(m);
/*
 * Given y = A^T * conj(X) from above, now do:
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is
 * Y = beta*Y + alpha * A^H * X.
 */
   Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY);
   free(vp);
}
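/*
 * Illustrative sketch (not ATLAS code): the routine above relies on the
 * identity A^H * X = conj(A^T * conj(X)), so it can compute y = A^T*conj(X)
 * with the ordinary MVT kernels and fold the outer conjugate plus the
 * alpha/beta scaling into the final axpbyConj.  The toy check below verifies
 * that identity for one 2x2 complex matrix using C99 complex arithmetic.
 */
#include <stdio.h>
#include <complex.h>

int main(void)
{
   double complex A[2][2] = {{1+2*I, 3-1*I}, {0+1*I, 2+2*I}};
   double complex X[2] = {1-1*I, 2+3*I};
   int i, k;

   for (i=0; i < 2; i++)
   {
      double complex direct = 0, viaT = 0;
      for (k=0; k < 2; k++)
      {
         direct += conj(A[k][i]) * X[k];   /* (A^H * X)_i computed directly */
         viaT   += A[k][i] * conj(X[k]);   /* (A^T * conj(X))_i */
      }
      viaT = conj(viaT);                   /* outer conjugate */
      printf("row %d: |difference| = %g\n", i, cabs(direct - viaT));
   }
   return 0;
}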