/*
 * Walks N columns of A, each holding M two-TYPE entries followed by a gap of
 * (lda-M) entries, and hands every entry to scalcp together with two
 * destination pointers inside V.
 *
 * The M entries of a column are split into ATL_DivByNB(M) full groups of NB
 * entries plus a tail of the remaining entries.  For a full group the two
 * destinations are ATL_MulByNB(N) apart and consecutive groups are
 * 2*ATL_MulByNB(N) apart; each column starts NB further into V than the
 * previous one.  Tail entries go to a separate area starting at
 * V + (M-tail)*2*N, with the two destinations tail*N apart.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha and stores the real part
 * through the second argument and the imaginary part through the third --
 * confirm against its definition.
 */
void col2blk(const int M, const int N, const TYPE *A, const int lda, TYPE *V,
             const SCALAR alpha)
{
   const int nfull = ATL_DivByNB(M);            /* number of full NB groups  */
   const int ntail = M - ATL_MulByNB(nfull);    /* entries left after those  */
   const int colgap = (lda - M)<<1;             /* TYPEs skipped per column  */
   const int plane = ATL_MulByNB(N);            /* distance between the two  */
                                                /* destinations of a group   */
   const int grpstep = (plane<<1) - NB;         /* hop to the next group     */
   int i, g, col;
   TYPE *dst2, *dst3, *tail2, *tail3;
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE ra, ia;
#endif

   tail3 = V + (M-ntail)*(N<<1);
   tail2 = tail3 + ntail*N;
   for (col=0; col != N; col++)
   {
      dst3 = V;
      dst2 = V + plane;
      for (g=0; g != nfull; g++)
      {
         for (i=0; i != NB; i++)
         {
            scalcp(A, dst2, dst3);
            A += 2;
            dst2++;
            dst3++;
         }
         dst2 += grpstep;
         dst3 += grpstep;
      }
/*
 *    Tail pointers are never reset, so the tail area fills up contiguously
 *    across columns
 */
      for (i=0; i != ntail; i++)
      {
         scalcp(A, tail2, tail3);
         A += 2;
         tail2++;
         tail3++;
      }
      V += NB;
      A += colgap;
   }
}
/*
 * Processes NB/2 pairs of lines of A (lines are lda two-TYPE entries apart).
 * From each line of a pair, NB entries are passed to scalcp; the entry from
 * the first line goes to vr/vi and the matching entry from the second line to
 * vr+1/vi+1, with the destinations stepping by NB between entries.  After a
 * pair is finished both destinations have moved forward by 2 overall.
 * M and N are not read here.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha, writing the real part via vr
 * and the imaginary part via vi -- confirm against its definition.
 */
static void row2blkT_NB(const int M, const int N, const TYPE *A, const int lda,
                        TYPE *vr, TYPE *vi, const SCALAR alpha)
{
   const int pairstep = lda<<2;       /* two lines of A, in TYPE units      */
   const int rewind = 2 - NBNB;       /* undo NB steps of NB, then move on 2 */
   const TYPE *line0 = A;
   const TYPE *line1 = A + (lda<<1);
   int k, pair;
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE ra, ia;
#endif

#if ((NB/2)*2 != NB)  /* ATLAS should ensure NB divisible by 2 */
   ATL_assert((NB/2)*2 == NB);
#endif

   for (pair=0; pair != (NB>>1); pair++)
   {
      for (k=0; k != NB2; k += 2)
      {
         scalcp(line0+k, vr, vi);
         scalcp(line1+k, vr+1, vi+1);
         vr += NB;
         vi += NB;
      }
      line0 += pairstep;
      line1 += pairstep;
      vr += rewind;
      vi += rewind;
   }
}
void Mjoin(prow2blkT,_blk)(const int blk, const int M, const int N,
                           const SCALAR alpha, const TYPE *A, int lda,
                           const int ldainc, TYPE *V)
/*
 * Given a packed Upper matrix A, copies & transposes M rows starting at A into
 * block-major row panel
 *    ldainc =  0 : General rectangular
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 *
 * The N columns of A are handled in groups of kb = Mmin(blk,N) columns, plus
 * one final group of the nr leftover columns.  Every entry of A is two TYPEs
 * wide and is passed to scalcp with two destinations that lie one plane
 * (kb*M TYPEs, or nr*M for the leftover group) apart, so each group occupies
 * two consecutive planes of V.
 *
 * lda is the length of the first column in entries; it is adjusted locally
 * and grows by ldainc entries after every column.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha, writing the real part via the
 * second argument and the imaginary part via the third -- confirm.
 */
{
   const int kb = Mmin(blk,N);
/*
 * kb is 0 when N (or blk) is 0; avoid dividing by it and let everything fall
 * through to the (possibly empty) leftover loop
 */
   const int ncb = (kb > 0) ? N / kb : 0;
   const int nr = N - ncb*kb;
   const int VN = kb*M, vn = nr*M;
/*
 * A group fills two planes of VN TYPEs each, and V has already been bumped kb
 * times when the group is done, so the hop to the next group is 2*VN - kb.
 * Hopping only VN - kb would put the next group on top of this group's
 * second plane.
 */
   const int incV = VN + VN - kb;
/*
 * lda is kept in TYPE units (two per entry) below, so the per-column growth
 * must be doubled as well
 */
   const int ldainc2 = ldainc+ldainc;
   int jb, i, j;
   TYPE *v;
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   register const TYPE ralpha=(*alpha), ialpha = alpha[1];
   register TYPE ra, ia;
#endif

   if (ldainc == -1) lda--;
   lda -= M;      /* entries to skip after the M that are read per column */
   lda += lda;    /* ... expressed in TYPE units                          */
   for (jb=ncb; jb; jb--)
   {
      for (j=kb; j; j--)
      {
         v = V++;
         for (i=0; i != M; i++, v += kb, A += 2) scalcp(A, v+VN, v);
         A += lda;
         lda += ldainc2;
      }
      V += incV;
   }
   for (j=nr; j; j--)
   {
      v = V++;
      for (i=0; i != M; i++, v += nr, A += 2) scalcp(A, v+vn, v);
      A += lda;
      lda += ldainc2;
   }
}
void Mjoin(pcol2blk,_blk)(const int blk, const int M, const int N,
                          const SCALAR alpha, const TYPE *A, int lda,
                          const int ldainc, TYPE *V)
/*
 * Given a packed matrix A, copies N columns starting at A into
 * block-major column panel
 *    ldainc =  0 : General
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 * NOTE: specialize to alpha cases after it works!
 *
 * The M entries of each column are split into nrb groups of
 * kb = Mmin(M,blk) entries plus mr leftover entries.  Every entry of A is two
 * TYPEs wide and is passed to scalcp with two destinations one plane apart
 * (kb*N TYPEs for a full group, mr*N for the leftover group); each full group
 * therefore occupies 2*kb*N TYPEs of V and the leftover group follows the
 * last full group.  Requires N <= blk (asserted).
 *
 * lda is the length of the first column in entries; it is adjusted locally
 * and grows by ldainc entries after every column.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha, writing the real part via the
 * second argument and the imaginary part via the third -- confirm.
 */
{
   const int kb = Mmin(M,blk);
/*
 * kb is 0 when M (or blk) is 0; avoid dividing by it so that an empty panel
 * is simply a no-op
 */
   const int nrb = (kb > 0) ? M / kb : 0;
   const int mr = M - nrb*kb;
   const int nv = kb*N, nvv = mr*N;
   const int NN = nv+nv - kb;         /* hop from one group to the next */
   const int ldainc2 = ldainc+ldainc, M2 = M+M;
   int i, ib, j;
   TYPE *v = V + nrb*(NN+kb);         /* start of the leftover group    */
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   const register TYPE ralpha=(*alpha), ialpha = alpha[1];
   register TYPE ra, ia;
#endif

   if (ldainc == -1) lda--;
   lda += lda;                        /* column length in TYPE units    */
   ATL_assert(N <= blk);
   for (j=0; j != N; j++)
   {
      for (ib=nrb; ib; ib--)
      {
         for (i=0; i < kb; i++, A += 2, V++) scalcp(A, V+nv, V);
         V += NN;
      }
/*
 *    v is never reset, so leftover entries of successive columns are stored
 *    back to back
 */
      for (i=0; i < mr; i++, A += 2, v++) scalcp(A, v+nvv, v);
      V += kb - nrb*(NN+kb);          /* back to first group, next column */
      A += lda - M2;
      lda += ldainc2;
   }
}
void prow2blk_KB(const int mb, const int nb, const SCALAR alpha,
                 const TYPE *A, int lda, const int ldainc, TYPE *V)
/*
 * This routine used by full copy to copy one mbxnb block of a matrix A to
 * block-major nbxmb storage (A is transposed during the copy)
 *
 * For each of the nb columns, mb two-TYPE entries are read contiguously from
 * A and passed to scalcp; within V the destinations of consecutive entries of
 * one column are nb apart, consecutive columns start 1 apart, and the two
 * destinations given to scalcp are mb*nb apart.  lda is the length of the
 * first column in entries and grows by ldainc entries after each column.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha, writing the real part via the
 * second argument and the imaginary part via the third -- confirm.
 */
{
   const int plane = mb * nb;
   const int grow2 = ldainc+ldainc;   /* per-column growth in TYPE units */
   TYPE *dst;
   int r, c;
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   register const TYPE ralpha=(*alpha), ialpha = alpha[1];
   register TYPE ra, ia;
#endif

   if (ldainc == -1) lda--;
   lda = 2 * (lda - mb);              /* TYPEs to skip after each column */
   for (c=0; c != nb; c++)
   {
      dst = V + c;
      for (r=0; r != mb; r++)
      {
         scalcp(A, dst+plane, dst);
         dst += nb;
         A += 2;
      }
      A += lda;
      lda += grow2;
   }
}
/*
 * Processes N lines of A (lines are lda two-TYPE entries apart).  From each
 * line, M entries are passed to scalcp with destinations vr and vi, which
 * step by N between entries; after a line is finished both destinations have
 * moved forward by 1 overall.
 *
 * NOTE(review): scalcp is defined outside this view; presumably it scales the
 * complex entry at its first argument by alpha, writing the real part via vr
 * and the imaginary part via vi -- confirm against its definition.
 */
static void row2blkT_KB(const int M, const int N, const TYPE *A, const int lda,
                        TYPE *vr, TYPE *vi, const SCALAR alpha)
{
   const int linelen = M<<1;          /* TYPEs read per line               */
   const int linestep = lda<<1;       /* TYPEs between line starts         */
   const int rewind = 1 - M*N;        /* undo M steps of N, then move on 1 */
   int k, line;
/*
 * These names are kept as-is: scalcp presumably refers to them directly.
 */
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE ra, ia;
#endif

   for (line=0; line != N; line++)
   {
      for (k=0; k != linelen; k += 2)
      {
         scalcp(A+k, vr, vi);
         vr += N;
         vi += N;
      }
      A += linestep;
      vr += rewind;
      vi += rewind;
   }
}