int cblas_errprn(int ierr, int info, char *form, ...) { va_list argptr; va_start(argptr, form); #ifdef GCCWIN vprintf(form, argptr); #else vfprintf(stderr, form, argptr); #endif va_end(argptr); return(Mmin(ierr,info)); }
static TYPE *ATL_LmulUC(const int M, const int N, const TYPE *LU, const int ldl) { const int lda = ldl SHIFT, MN = Mmin(M,N); int i, j, m; TYPE *C, *c; #ifdef TREAL const TYPE ONE=ATL_rone; #else const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif C = c = malloc(M*ATL_MulBySize(N)); ATL_assert(c); if (M >= N) { for (j=0; j < MN; j++) { m = j SHIFT; for (i=0; i < m; i++) c[i] = ATL_rzero; #ifdef TCPLX c[i++] = ATL_rone; c[i++] = ATL_rzero; #else c[i++] = ATL_rone; #endif for (m=M SHIFT; i < m; i++) c[i] = LU[i]; c += m; LU += lda; } LU -= MN * lda; for (m=M SHIFT; j < N; j++, c += m) Mjoin(PATL,zero)(M, c, 1); cblas_trmm(CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, M, N, ONE, LU, ldl, C, M); } else /* M < N */ { for (j=0; j < M; j++) { m = (j+1) SHIFT; for (i=0; i < m; i++) c[i] = LU[i]; for (m=M SHIFT; i < m; i++) c[i] = ATL_rzero; c += m; LU += lda; } Mjoin(PATL,gecopy)(M, N-M, LU, ldl, c, M); LU -= M * lda; cblas_trmm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, M, N, ONE, LU, ldl, C, M); } return(C); }
void Mjoin(prow2blkT,_blk)(const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) /* * Given a packed Upper matrix A, copies & transposes M rows starting at A into * block-major row panel * ldainc = 0 : General rectangular * ldainc = 1 : Upper * ldainc = -1 : Lower */ { const int kb = Mmin(blk,N); const int ncb = N / kb, nr = N - ncb*kb; const int incV = kb*M - kb; const int VN = kb*M, vn = nr*M; int jb, i, j; TYPE *v; #ifdef ALPHAXI0 #ifdef Conj_ const register TYPE ralpha = *alpha, calpha = -ralpha; #else const register TYPE ralpha = *alpha; #endif #elif defined(ALPHAX) register const TYPE ralpha=(*alpha), ialpha = alpha[1]; register TYPE ra, ia; #endif if (ldainc == -1) lda--; lda -= M; lda += lda; for (jb=ncb; jb; jb--) { for (j=kb; j; j--) { v = V++; for (i=0; i != M; i++, v += kb, A += 2) scalcp(A, v+VN, v); A += lda; lda += ldainc; } V += incV; } for (j=nr; j; j--) { v = V++; for (i=0; i != M; i++, v += nr, A += 2) scalcp(A, v+vn, v); A += lda; lda += ldainc; } }
void Mjoin(pcol2blk,_blk)(const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) /* * Given a packed matrix A, copies N columns starting at A into * block-major column panel * ldainc = 0 : General * ldainc = 1 : Upper * ldainc = -1 : Lower * NOTE: specialize to alpha cases after it works! */ { const int kb = Mmin(M,blk); const int nrb = M / kb, mr = M - nrb*kb; const int nv = kb*N, nvv = mr*N; const int NN = nv+nv - kb; const int ldainc2 = ldainc+ldainc, M2 = M+M; int i, ib, j, J; TYPE *v = V + nrb*(NN+kb); #ifdef ALPHAXI0 #ifdef Conj_ const register TYPE ralpha = *alpha, calpha = -ralpha; #else const register TYPE ralpha = *alpha; #endif #elif defined(ALPHAX) const register TYPE ralpha=(*alpha), ialpha = alpha[1]; register TYPE ra, ia; #endif if (ldainc == -1) lda--; lda += lda; ATL_assert(N <= blk); for (j=0; j != N; j++) { for (ib=nrb; ib; ib--) { for (i=0; i < kb; i++, A += 2, V++) scalcp(A, V+nv, V); V += NN; } if (mr) { for (i=0; i < mr; i++, A += 2, v++) scalcp(A, v+nvv, v); } V += kb - nrb*(NN+kb); A += lda - M2; lda += ldainc2; } }
int solve(int S[500][500], int R, int C){ int sol[R][C]; sol[R-1][C-1] = 0; for(int i=R-2; i>=0; i--) sol[i][C-1] = sol[i+1][C-1]-S[i][C-1]; for(int j=C-2; j>=0; j--) sol[R-1][j] = sol[R-1][j+1]-S[R-1][j]; for(int i=R-2; i>=0; i--){ for(int j=C-2; j>=0; j--){ sol[i][j] = Mmin(sol[i+1][j],sol[i][j+1]) - S[i][j]; if(sol[i][j]<1) sol[i][j] = 0; } } return sol[0][0]+1; }
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) { const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M); const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N)); const int nr = N - n, mr = M - m; const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr); int i, j, ib, jb; const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper : ( (ldainc == -1) ? PackLower : PackGen ); TYPE *v, *vv = V+nMb*incVm; void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A, int lda, const int ldainc, TYPE *V); if (ldainc) { if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1; else row2blk = ATL_prow2blk_KB_aX; for (j=0; j < n; j += NB) { for (v=V, i=0; i < m; i += NB, v += incVm) row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda), ldainc, v); if (mr) { row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda), ldainc, vv); vv += incVV; } V += NBNB; } if (nr) { for (v=V, i=0; i < m; i += NB, v += incVm) row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda), ldainc, v); if (mr) row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda), ldainc, vv); } } else if (SCALAR_IS_ONE(alpha)) Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha); else Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha); }
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Row-major factorization of form * A = L * U * P * where P is a column-permutation matrix, L is lower triangular (lower * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nup, Ndown, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ar, *Ac, *An; if (MN > 1) { Nup = MN >> 1; #ifdef NB if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup)); #endif Ndown = M - Nup; i = ATL_getrfR(Nup, N, A, lda, ipiv); if (i) if (!ierr) ierr = i; Ar = A + (Nup * lda SHIFT); Ac = A + (Nup SHIFT); An = Ar + (Nup SHIFT); ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1); /* apply pivots */ cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, Ndown, Nup, one, A, lda, Ar, lda); cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup, none, Ar, lda, Ac, lda, one, An, lda); i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup); if (i) if (!ierr) ierr = Nup + i; for (i=Nup; i != MN; i++) ipiv[i] += Nup; ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1); /* apply pivots */ }
void Mjoin(Mjoin(PATL,t),MY_GER) (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X, ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda) { ATL_INT mb, nb, mu, nu, nblks, nrblks, ncblks, ldaP; ATL_TGER_t pd; int P; static TYPE *A0=NULL, *A0e=NULL; if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha)) /* quick return if no-op */ return; pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda; pd.alpha = alpha; pd.X = X; pd.Y = Y; pd.A = A; pd.flg = (A0 == A || A0e == A+(M SHIFT)) ? 1 : 2; A0 = A; A0e = A+(M SHIFT); P = ATL_DivBySize(CacheEdge); P = ((size_t)M*N+P-1) / P; /* add more procs only when cache is full */ P = (P&1 && P > 1)?P+1 : P; /* don't use odd P, since it hurts alignment */ // printf("TGER, P=%d\n", P); P = Mmin(ATL_NTHREADS, P); /* * Make sure we don't overflow 32-bit integer lda */ ldaP = P * lda; while ((size_t)ldaP != ((size_t)lda)*P) { P--; ldaP = P * lda; } if (P > 1) ATL_goparallel(P, MY_DOWORK_cols, &pd, NULL); else MY_GER1(M, N, alpha, X, incX, Y, incY, A, lda); }
static TYPE *ATL_LmulUR(const int M, const int N, const TYPE *LU, const int ldl) { const int lda = ldl SHIFT, ldc = N SHIFT, MN = Mmin(M,N); int i, j, m; TYPE *C, *c; #ifdef TREAL const TYPE ONE=ATL_rone; #else const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif C = c = malloc(M*ATL_MulBySize(N)); ATL_assert(c); if (M >= N) { for (i=0; i != N; i++, LU += lda, C += ldc) { Mjoin(PATL,copy)(i+1, LU, 1, C, 1); Mjoin(PATL,zero)(N-i-1, C+((i+1)SHIFT), 1); } for(; i != M; i++, LU += lda, C += ldc) Mjoin(PATL,copy)(N, LU, 1, C, 1); LU -= lda * M; C -= ldc * M; cblas_trmm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, M, N, ONE, LU, ldl, C, N); } else /* N > M */ { for (i=0; i != M; i++, C += ldc, LU += lda) { Mjoin(PATL,zero)(i, C, 1); C[i SHIFT] = ATL_rone; #ifdef TCPLX C[(i SHIFT)+1] = ATL_rzero; #endif Mjoin(PATL,copy)(N-i-1, LU+((i+1)SHIFT), 1, C+((i+1)SHIFT), 1); } LU -= lda * M; C -= ldc * M; cblas_trmm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, M, N, ONE, LU, ldl, C, N); } return(C); }
void Mjoin(Mjoin(PATL,pcol2blk),NM) (const int M, const int N, const TYPE alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) /* * Given a packed matrix A, copies N columns starting at A into * block-major column panel * ldainc = 0 : General * ldainc = 1 : Upper * ldainc = -1 : Lower * NOTE: specialize to alpha cases after it works! */ { const int kb = Mmin(M,NB); const int nrb = M / kb, mr = M - nrb*kb; int i, ib, j, J; const int NN = N*kb; TYPE *v = V + nrb*NN; if (ldainc) { if (ldainc == -1) lda--; ATL_assert(N <= NB); for (j=0; j != N; j++) { for (ib=nrb; ib; ib--) { for (i=0; i < kb; i++) V[i] = ATL_MulByALPHA(A[i]); V += NN; A += kb; } if (mr) { for (i=0; i < mr; i++) v[i] = ATL_MulByALPHA(A[i]); v += mr; } V += kb - nrb*NN; A += lda - nrb*kb; lda += ldainc; } } else Mjoin(Mjoin(PATL,col2blk),NM)(M, N, A, lda, V, alpha); }
void Mjoin(Mjoin(PATL,prow2blkT),NM) (const int M, const int N, const TYPE alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) /* * Given a packed Upper matrix A, copies & transposes M rows starting at A into * block-major row panel * ldainc = 0 : General rectangular * ldainc = 1 : Upper * ldainc = -1 : Lower * NOTE: specialize to alpha cases after it works! */ { const int kb = Mmin(NB,N); const int ncb = N / kb, nr = N - ncb*kb; const int incV = kb*M - kb; int jb, i, j; TYPE *v; if (ldainc) { if (ldainc == -1) lda--; for (jb=ncb; jb; jb--) { for (j=kb; j; j--) { v = V++; for (i=0; i != M; i++, v += kb) *v = ATL_MulByALPHA(A[i]); A += lda; lda += ldainc; } V += incV; } for (j=nr; j; j--) { v = V++; for (i=0; i != M; i++, v += nr) *v = ATL_MulByALPHA(A[i]); A += lda; lda += ldainc; } } else Mjoin(Mjoin(PATL,row2blkT),NM)(N, M, A, lda, V, alpha); }
void Mjoin(PATL,geset) (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const SCALAR beta, TYPE *A, ATL_CINT lda) /* * Sets main diagonal to beta, rest of matrix to alpha */ { ATL_INT j; ATL_CINT MN = Mmin(M,N); #ifdef TCPLX ATL_CINT lda2 = lda+lda; #else #define lda2 lda #endif #ifdef TCPLX if (*alpha == *beta && alpha[1] == beta[1]) #else if (alpha == beta) #endif { for (j=0; j < N; j++, A += lda2) Mjoin(PATL,set)(M, alpha, A, 1); return; } for (j=0; j < MN; j++, A += lda2) { if (j) Mjoin(PATL,set)(j, alpha, A, 1); #ifdef TCPLX A[j+j] = *beta; A[j+j+1] = beta[1]; #else A[j] = beta; #endif if (M-j-1) Mjoin(PATL,set)(M-j-1, alpha, A+((j+1) SHIFT), 1); } for (; j < N; j++, A += lda2) Mjoin(PATL,set)(M, alpha, A, 1); }
void GoToTown(int *nreps, int flsizeKB, int mflopF, int ldagap, int rout, int *Ns, int *Ms, int *UPLOs, int *SDs) { FILE *fpout=stdout; double time, mflop, mfB; int *nbs, *flgs, *ms, *ns; int itst=0, lda, n, m, u, s, b, r, M, kk, nb0, nrep; fprintf(fpout, "*** TUNING FOR %10s ***\n", Bitmap2Char(FSRout, (rout<<8)+UPLOs[1]+SDs[1])); fprintf(fpout, "*********************************\n"); fprintf(fpout, "TST REP UP SD M N LDA TIME MFLOP\n"); fprintf(fpout, "====== === == == ====== ====== ====== ============= =============\n"); for (n=1; n <= Ns[0]; n++) { for (m=1; m <= Ms[0]; m++) { M = (Ms[m]) ? Ms[m]:Ns[n]; for (u=1; u <= UPLOs[0]; u++) { for (s=1; s <= SDs[0]; s++) { lda = ldagap + M; nrep = GetMyReps(Mmin(M,Ns[n]), nreps); for (r=1; r <= nrep; r++) { time = GetTime(rout, mflopF, lda, M, Ns[n], CAN_NB, UPLOs[u], SDs[s], flsizeKB); mflop = Time2Flops(rout, UPLOs[u], M, Ns[n], time); fprintf(fpout, "%6d %4d %c %c %7d %7d %7d %13e %14.2f\n", itst++, r, Uplo2Char(rout, UPLOs[u]+SDs[s]), Side2Char(rout, SDs[s]+UPLOs[u]), M, Ns[n], lda, time, mflop); } /* end of reps loop */ } /* end of Side loop */ } /* end of Uplo loop */ } /* end of M loop */ } /* end of N loop */ }
void F77WRAP_GETRF(const F77_INTEGER *M, const F77_INTEGER *N, TYPE *A, const F77_INTEGER *lda, F77_INTEGER *ipiv0, F77_INTEGER *info) { const int MN = Mmin(*M,*N); int i; #ifdef ATL_FunkyInts int *ipiv; ipiv = malloc(MN*sizeof(int)); ATL_assert(ipiv); #else #define ipiv ipiv0 #endif *info = ATL_getrf(AtlasColMajor, *M, *N, A, *lda, ipiv); #ifdef ATL_FunkyInts for (i=0; i != MN; i++) ipiv0[i] = ipiv[i] + 1; free(ipiv); #else for (i=0; i != MN; i++) ipiv[i]++; #endif }
void Mjoin(pcol2blkF,_blk) (const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) /* * Copies entire MxN matrix to block major format */ { int j, jb; const int incV = blk*(M+M); const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper : ( (lda == -1) ? PackLower : PackGen ); void (*col2blk)(const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V); #ifdef Conj_ if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_a1),_blk); else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aXi0),_blk); } else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aX),_blk); #else if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blk_a1),_blk); else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aXi0),_blk); } else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aX),_blk); #endif for (j=0; j < N; j += blk) { jb = N-j; jb = Mmin(jb, blk); col2blk(blk, M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda), ldainc, V); V += incV; } }
int ATL_gerqr(ATL_CINT M, ATL_CINT N, TYPE *A, ATL_CINT LDA, TYPE *TAU, TYPE *ws_RQ2, TYPE *ws_T, ATL_CINT LDT, TYPE *WORKM, const int buildT) { int top, bottom, buildT_temp; int topMN; int I, INFO, IINFO, lbuilt, rbuilt, method; int LDA2 = LDA SHIFT; /* for complex LDA *2 */ int LDT2 = LDT SHIFT; /* for complex LDT *2 */ ATL_CINT minMN = Mmin(M, N); #ifdef TCPLX TYPE ONE[2] = {ATL_rone, ATL_rzero}; #else #define ONE ATL_rone #endif if (M < 1 || N < 1) return(0); /* Nothing to do. */ METHOD(method, N, M, LDA); /* Find the method. */ #if !defined(ATL_USEPTHREADS) if (method == 2 || method == 3) method=1; /* Don't PCA if no affinity. */ #endif switch(method) /* Based on method; */ { case 0: /* RECURSION. */ /* * Choose a smart recursive column partitioning based on M: */ if (minMN >= NB+NB) /* big prob, put remainder on right */ { topMN = ATL_MulByNB(ATL_DivByNB(minMN>>1)); bottom = minMN - topMN; top = M - bottom; } else /* small prob, keep M mult of MU (MU more critical than NU) */ {
void Mjoin(PATL,symv) (const enum ATLAS_UPLO Uplo, const int N, const SCALAR alpha, const TYPE *A, const int lda, const TYPE *X, const int incX, const SCALAR beta, TYPE *Y, const int incY) { int mb, nb, jb, mb1, incA1, incA, incXY, incXY1, n, j; const int lda2=(lda SHIFT); const TYPE *x0=X, *x1, *A0=A, *A1; TYPE *y1, *y0=Y; assert(incX==1 && incY==1 && Uplo == AtlasLower); #ifdef TREAL assert(alpha == ATL_rone && beta == ATL_rone); #else assert(*alpha == ATL_rone && *beta == ATL_rone); assert(alpha[1] == ATL_rzero && beta[1] == ATL_rzero); #endif ATL_GetPartSYMV(A, lda, &mb, &nb); mb1 = N - ( (N-1) / mb ) * mb; incA1 = nb * lda2; incXY1 = (nb SHIFT); incA = incXY = mb SHIFT; n = (N-mb)SHIFT; A += n; X += n; Y += n; for (n=N-mb; n > 0; n -= mb, A -= incA, X -= incXY, Y -= incXY) { RsymvL(mb, A+n*lda2, lda, X, beta, Y); for (j=0, A1=A, x1=x0, y1=y0; j < n; j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1) { jb = n - j; jb = Mmin(jb, nb); gemvT(jb, mb, alpha, A1, lda, X, 1, beta, y1, 1); gemvN(mb, jb, alpha, A1, lda, x1, 1, beta, Y, 1); } } RsymvL(mb1, A0, lda, x0, beta, y0); }
int ATL_ormqr (const enum CBLAS_SIDE SIDE, const enum CBLAS_TRANSPOSE TRANS, ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda, const TYPE *TAU, TYPE *C, ATL_CINT ldc, TYPE *WORK, ATL_CINT LWORK) /* * This is the C translation of the standard LAPACK Fortran routine: * SUBROUTINE ATL_ormqr( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, * WORK, LWORK, INFO ) * * ATL_ormqr.c : * int ATL_ormqr(const enum CBLAS_SIDE SIDE SIDE, * const enum CBLAS_TRANSPOSE TRANS, ATL_CINT M, ATL_CINT N, * ATL_CINT K, TYPE * A, ATL_CINT lda,TYPE * TAU, TYPE * C, ATL_CINT ldc, * TYPE * WORK, ATL_CINT LWORK) * * NOTE : ATL_ormqr.c will get compiled to four precisions * single precision real, double precision real * single precision complex, double precision complex * * * * Purpose * ======= * * ATL_ormqr overwrites the general real M-by-N matrix C with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * C C * Q * TRANS = 'T': Q**T * C C * Q**T * * where Q is, * a real orthogonal matrix defined as the product of k * elementary reflectors * * Q = H(1) H(2) . . . H(k) * * OR * a complex unitary matrix defined as a product of k * elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by ATLL_geqrf.c. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Arguments * ========= * * SIDE (input) CHARACTER*1 * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (input) CHARACTER*1 * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (input) INTEGER * The number of rows of the matrix C. M >= 0. * * N (input) INTEGER * The number of columns of the matrix C. N >= 0. * * K (input) INTEGER * The number of elementary reflectors whose product defines * the matrix Q. * If SIDE = 'L', M >= K >= 0; * if SIDE = 'R', N >= K >= 0. * * A (input) array, dimension (LDA,K) * The i-th column must contain the vector which defines the * elementary reflector H(i), for i = 1,2,...,k, as returned by * DGEQRF in the first k columns of its array argument A. * A is modified by the routine but restored on exit. * * lda (input) INTEGER * The leading dimension of the array A. * If SIDE = 'L', LDA >= max(1,M); * if SIDE = 'R', LDA >= max(1,N). * * TAU (input) array, dimension (K) * TAU(i) must contain the scalar factor of the elementary * reflector H(i), as returned by ATL_geqrf.c. * * C (input/output) array, dimension (LDC,N) * On entry, the M-by-N matrix C. * On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. * * ldc (input) INTEGER * The leading dimension of the array C. LDC >= max(1,M). * * WORK (workspace/output) array, dimension (MAX(1,LWORK)) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. * If SIDE = 'L', LWORK >= max(1,N); * if SIDE = 'R', LWORK >= max(1,M). * For optimum performance LWORK >= N*NB if SIDE = 'L', and * LWORK >= M*NB if SIDE = 'R', where NB is the optimal * blocksize. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued by XERBLA. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value */ { ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N); ATL_INT n, nb, j, ib, mi, ni, ic, jc ; TYPE *ws_QR2, *ws_T, *ws_larfb; /* Workspace for QR2,T, larfb */ void *vp=NULL; nb = clapack_ilaenv(LAIS_OPT_NB, LAormqr, MYOPT+LARight+LAUpper, M, N, K,-1); /* * If it is a workspace query, return the size of work required. * wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2 */ if (LWORK < 0) { if(SIDE == CblasLeft) { *WORK = ( N*nb + nb*nb + maxMN ) ; } else { *WORK = ( M*nb + nb*nb + maxMN ) ; } return(0); } else if (M < 1 || N < 1) /* quick return if no work to do */ return(0); /* * If the user gives us too little space, see if we can allocate it ourselves */ else { if(SIDE == CblasLeft) { if (LWORK < (N*nb + nb*nb + maxMN)) { vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen); if (!vp) return(-7); WORK = ATL_AlignPtr(vp); } } else { if (LWORK < (M*nb + nb*nb + maxMN)) { vp = malloc(ATL_MulBySize(M*nb + nb*nb + maxMN) + ATL_Cachelen); if (!vp) return(-7); WORK = ATL_AlignPtr(vp); } } /* if CblasRight */ } /* * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb */ ws_T = WORK; /* T at begining of work */ ws_QR2 = WORK +(nb SHIFT)*nb; /* After T Work space */ ws_larfb = ws_QR2 + (maxMN SHIFT); /* After workspace for T and QR2 */ if (SIDE == CblasLeft) { if ( TRANS == CblasNoTrans ) { j = (K/nb)*nb; if (j == K) { j=K -nb; } for (j; j >= 0; j = j - nb) { ib = nb; if ((j+nb) > K) { ib = K - j; } /* * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) */ ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib); /* * H or H' is applied to C(i:m,1:n) */ ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+(j SHIFT), ldc, ws_larfb, N); } /* for */ } /* CblasNoTrans */ else { for (j = 0 ; j < K; j = j + nb) { ib = Mmin(K-j, nb); /* * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) */ ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib); /* * H or H' is applied to C(i:m,1:n) */ ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+(j SHIFT), ldc, ws_larfb, N); } /* for */ } /* CblasNoTran */ } /* cblasLeft */ else { if ( TRANS == CblasNoTrans ) { for (j = 0 ; j < K; j = j + nb) { ib = Mmin(K-j, nb); /* * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) */ ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib); /* * H or H' is applied to C(1:m,i:n) */ ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, M, N-j, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+((j SHIFT)*ldc), ldc, ws_larfb, M); } /* for */ } else { j = (K/nb)*nb; if (j == K) { j=K -nb; } for (j; j >= 0; j = j - nb) { ib = nb; if ((j+nb) > K) { ib = K - j; } /* * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) */ ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib); /* * H or H' is applied to C(1:m,i:n) */ ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, M, N-j , ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+((j SHIFT)*ldc) , ldc, ws_larfb, M); } /* for */ } /* Cblas Tran on Right */ } if (vp) free(vp); return(0); } /* END ATL_ormqr */
void Mjoin( PATL, tbsvUC ) ( const enum ATLAS_DIAG DIAG, const int N, /* N > 0 assumed */ const int K, const TYPE * A, const int LDA, TYPE * X ) { /* * Purpose * ======= * * Mjoin( PATL, tbsvUC ) solves the following triangular system of equations * * conjg( A ) * x = b, * * where b and x are n-element vectors and A is an n by n unit or non- * unit, upper triangular band matrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * This is a blocked version of the algorithm. For a more detailed des- * cription of the arguments of this function, see the reference imple- * mentation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ void (*tbsv0)( const int, const int, const TYPE *, const int, TYPE * ); #ifdef TREAL #define lda2 LDA #define one ATL_rone #define none ATL_rnone #else TYPE none[2] = { ATL_rnone, ATL_rzero }, one [2] = { ATL_rone, ATL_rzero }; const int lda2 = ( LDA SHIFT ); #endif #ifdef ATL_AXPYMV int ia, ian, j, kl, ku, m1, ma, mb, mb1, n, na, nb; #else int ia, ian, kl, ku, ma, mb, na, nb, nb1; #endif /* .. * .. Executable Statements .. * */ ATL_GetPartMVN( A, LDA, &mb, &nb ); if( DIAG == AtlasNonUnit ) tbsv0 = Mjoin( PATL, tbsvUCN ); else tbsv0 = Mjoin( PATL, tbsvUCU ); #ifdef ATL_AXPYMV mb1 = N - ( m1 = ( ( N - 1 ) / mb ) * mb ); tbsv0( mb1, K, A+m1*lda2, LDA, X+(m1 SHIFT) ); for( n = mb1, j = m1 - mb; n < N; n += mb, j -= mb ) { ian = j + mb; ia = mb - K; ia = j + Mmax( ia, 0 ); ma = ian - ia; na = N - ian; na = Mmin( na, K ); kl = ma - 1; kl = Mmax( kl, 0 ); ku = K - 1 - kl; ku = Mmax( ku, 0 ); Mjoin( PATL, gbmv )( AtlasConj, ma, na, kl, ku, none, A+ian*lda2, LDA, X+(ian SHIFT), 1, one, X+(ia SHIFT), 1 ); tbsv0( mb, K, A+j*lda2, LDA, X+(j SHIFT) ); } #else nb1 = N - ( ( N - 1 ) / nb ) * nb; for( ian = N - nb; ian > 0; ian -= nb ) { ia = ian - K; ia = Mmax( ia, 0 ); ma = ian - ia; na = Mmin( nb, K ); kl = ma - 1; kl = Mmax( kl, 0 ); ku = K - 1 - kl; ku = Mmax( ku, 0 ); tbsv0( nb, K, A+ian*lda2, LDA, X+(ian SHIFT) ); Mjoin( PATL, gbmv )( AtlasConj, ma, na, kl, ku, none, A+ian*lda2, LDA, X+(ian SHIFT), 1, one, X+(ia SHIFT), 1 ); } tbsv0( nb1, K, A, LDA, X ); #endif /* * End of Mjoin( PATL, tbsvUC ) */ }
void Mjoin(PATL,ttrsm)(const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo, const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb) { ATL_TTRSM_t trsms[ATL_NTHREADS]; TYPE *b; ATL_INT n, nblks, minblks; double tblks; int nr, p, i, j, extrablks; static int nb=0; if (M < 1 || N < 1) return; if (SCALAR_IS_ZERO(alpha)) { Mjoin(PATL,gezero)(M, N, B, ldb); return; } #if defined(ATL_ARCH_XeonPHI) && defined(TREAL) { int Mjoin(PATL,ttrsm_amm) (const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo, const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb); if (!Mjoin(PATL,ttrsm_amm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb)) return; } #endif /* * Distribute RHS over the processors */ if (!nb) nb = Mjoin(PATL,GetNB)(); if (side == AtlasLeft) { nblks = N/nb; nr = N - nblks*nb; tblks = ((double)(M*N)) / ( (double)nb * nb ); p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER; p = Mmin(p, ATL_NTHREADS); p = p ? p : 1; b = B; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; trsms[i].A = A; trsms[i].M = M; trsms[i].N = n; trsms[i].lda = lda; trsms[i].ldb = ldb; trsms[i].B = b; trsms[i].alpha = SADD alpha; trsms[i].side = side; trsms[i].uplo = uplo; trsms[i].TA = TA; trsms[i].diag = diag; n *= (ldb << Mjoin(PATL,shift)); b = MindxT(b, n); } } else /* Side == AtlasRight */ { nblks = M/nb; nr = M - nblks*nb; tblks = (N/nb)*nblks; p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER; p = Mmin(p, ATL_NTHREADS); p = p ? p : 1; b = B; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; trsms[i].A = A; trsms[i].M = n; trsms[i].N = N; trsms[i].lda = lda; trsms[i].ldb = ldb; trsms[i].B = b; trsms[i].alpha = SADD alpha; trsms[i].side = side; trsms[i].uplo = uplo; trsms[i].TA = TA; trsms[i].diag = diag; n <<= Mjoin(PATL,shift); b = MindxT(b, n); } } if (p < 2) { Mjoin(PATL,trsm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb); return; } for (; i < ATL_NTHREADS; i++) /* flag rest of struct as uninitialized */ trsms[i].B = NULL; ATL_goparallel(p, Mjoin(PATL,DoWorkTRSM), trsms, NULL); }
int HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs, double *triadGBs, int *failure) { int quantum; int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; FILE *outFile; double GiBs = 1073741824.0, curGBs; if (doIO) { // outFile = fopen( params->outFname, "w+" ); outFile = stdout; if (! outFile) { outFile = stderr; fprintf( outFile, "Cannot open output file.\n" ); return 1; } } // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */ // HARDCODED VectorSize // params->StreamVectorSize = VectorSize; a = HPCC_XMALLOC( double, VectorSize ); b = HPCC_XMALLOC( double, VectorSize ); c = HPCC_XMALLOC( double, VectorSize ); if (!a || !b || !c) { if (c) HPCC_free(c); if (b) HPCC_free(b); if (a) HPCC_free(a); if (doIO) { fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize ); fflush( outFile ); fclose( outFile ); } return 1; } /* --- SETUP --- determine precision and check timing --- */ if (doIO) { fprintf (outFile, "Generated on %s\n", params->nowASCII); fprintf( outFile, HLINE); BytesPerWord = sizeof(double); fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); fprintf( outFile, HLINE); fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET); fprintf( outFile, "Total memory required = %.4f GiB.\n", (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs)); fprintf( outFile, "Each test is run %d times, but only\n", NTIMES); fprintf( outFile, "the *best* time for each is used.\n"); fflush ( outFile); } #ifdef _OPENMP if (doIO) fprintf( outFile, HLINE); #pragma omp parallel private(k) { #pragma omp single nowait { k = omp_get_num_threads(); if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k); params->StreamThreads = k; } } #endif /* Get initial value for system clock. */ #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } if (doIO) fprintf( outFile, HLINE); if ( (quantum = checktick()) >= 1) { if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); } else { if (doIO) fprintf( outFile, "Your clock granularity appears to be " "less than one microsecond.\n"); } t = mysecond(); #ifdef _OPENMP #pragma omp parallel for #endif for (j = 0; j < VectorSize; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); if (doIO) { fprintf( outFile, "Each test below will take on the order" " of %d microseconds.\n", (int) t ); fprintf( outFile, " (= %d clock ticks)\n", (int) (t/quantum) ); fprintf( outFile, "Increase the size of the arrays if this shows that\n"); fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n"); fprintf( outFile, HLINE); fprintf( outFile, "WARNING -- The above is only a rough guideline.\n"); fprintf( outFile, "For best results, please be sure you know the\n"); fprintf( outFile, "precision of your system timer.\n"); fprintf( outFile, HLINE); } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = Mmin(mintime[j], times[j][k]); maxtime[j] = Mmax(maxtime[j], times[j][k]); } } if (doIO) fprintf( outFile, "Function Rate (GB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */ /* make sure no division by zero */ curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0); curGBs *= 1e-9 * bytes[j] * VectorSize; if (doIO) fprintf( outFile, "%s%11.4f %11.4f %11.4f %11.4f\n", label[j], curGBs, avgtime[j], mintime[j], maxtime[j]); switch (j) { case 0: *copyGBs = curGBs; break; case 1: *scaleGBs = curGBs; break; case 2: *addGBs = curGBs; break; case 3: *triadGBs = curGBs; break; } } if (doIO) fprintf( outFile, HLINE); /* --- Check Results --- */ checkSTREAMresults( outFile, doIO, failure ); if (doIO) fprintf( outFile, HLINE); HPCC_free(c); HPCC_free(b); HPCC_free(a); if (doIO) { fflush( outFile ); fclose( outFile ); } return 0; }
void Mjoin(PATL,tgemv) (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX, const SCALAR beta, TYPE *Y, ATL_CINT incY) { static size_t ALb=0, ALe=0; size_t at = (size_t) A; ATL_INT n, P, ldaP; ATL_TGEMV_t pd; /* * quick return if possible. */ if (M < 1 || N < 1) return; if (SCALAR_IS_ZERO(alpha)) /* No contrib from alpha*A*x */ { ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M; if (!SCALAR_IS_ONE(beta)) { if (SCALAR_IS_ZERO(beta)) Mjoin(PATL,zero)(NY, Y, incY); else Mjoin(PATL,scal)(NY, beta, Y, incY); } return; } pd.flg = (at >= ALb && at <= ALe) ? 1 : 0; ALb = (size_t)A; ALe = (size_t)(A+(M SHIFT)); #ifdef TREAL pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0; #else if (TA != AtlasNoTrans) { if (TA == AtlasConj) pd.flg |= 4; else if (TA == AtlasTrans) pd.flg |= 2; else /* if (TA == AtlasConjTrans) */ pd.flg |= (2|4); } #endif P = ATL_DivBySize(CacheEdge); P = ((size_t)M*N+P-1) / P; /* add more procs only when cache is full */ P = (P&1 && P > 1)?P+1 : P; /* don't use odd P; it hurts alignment */ P = Mmin(ATL_NTHREADS, P); if (TA == AtlasNoTrans || TA == AtlasConj) P=1; //fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N); /* * Make sure we don't overflow 32-bit integer lda */ ldaP = P * lda; while ((size_t)ldaP != ((size_t)lda)*P) { P--; ldaP = P * lda; } if (P > 1) { pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda; pd.alpha = alpha; pd.beta = beta; pd.X = X; pd.Y = Y; pd.A = A; pd.P = P; n = N / P; pd.n = n; pd.nr = N - n*P; if (pd.flg & 2) /* Transpose case */ { ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL); return; } /* * For gemvN, everyone needs a private M-length y. Don't do this unless * we are sure the combine cost is likely dominated by the parallelism */ else if (n > Mmax(P,8)) { int vrank; const TYPE *a; TYPE *y, *y0; #ifdef TCPLX TYPE one[2] = {ATL_rone, ATL_rzero}; TYPE zero[2] = {ATL_rzero, ATL_rzero}; #endif y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M))); ATL_assert(y); pd.Y = y; pd.incY = 1; #ifdef TREAL pd.alpha = ATL_rone; pd.beta = ATL_rzero; #else pd.alpha = one; pd.beta = zero; #endif ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd, Mjoin(PATL,CombineMVN)); /* * goparallel reduces all node's Ys to node 0's. Extract his from the * work array, and combine it with input array, applying both alpha * and beta in the process */ vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1; a = A + (lda SHIFT)*vrank; y = ATL_Align2Ptr(y, a); Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY); free(y0); return; } } /* * If we haven't parallelized this thing, just do it serial */ Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
static int dtr2mx_(double *a, int *lda, double *beta, double *t, int *ldt, int *nrow, int *ncol, int * mb, int *nb, int *ilt, int *jlt) { /* System generated locals */ long a_dim1, a_offset, t_dim1, t_offset; int i__1, i__2, i__3, i__4; /* Local variables */ static int k, ia, ja, jj, ki, kj, it, jt, mr, irm, jrm; /* -- PUMMA Package routine (version 2.1) -- */ /* Jaeyoung Choi, Oak Ridge National Laboratory. */ /* Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */ /* David Walker, Oak Ridge National Laboratory. */ /* October 31, 1994. */ /* Purpose */ /* T <== A' + beta*T (assume beta = 0.0, or 1.0) */ /* T is a scattered 2-D array from a scattered 2-D array A */ /* T = A' */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; t_dim1 = *ldt; t_offset = 1 + t_dim1; t -= t_offset; /* Function Body */ ia = 0; jt = 0; if (*beta == 0.) { i__1 = *nrow - 2; for (ki = 0; ki <= i__1; ++ki) { ja = 0; it = 0; i__2 = *ncol - 2; for (kj = 0; kj <= i__2; ++kj) { i__3 = *nb; for (jj = 1; jj <= i__3; ++jj) { i__4 = *mb; for (k = 1; k <= i__4; ++k) { t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1]; /* L10: */ } } ja += commtrb_1.jaz; it += commtrb_1.itz; /* L20: */ } jrm = *jlt - ja; if (jrm > 0) { i__2 = Mmin(*nb,jrm); for (jj = 1; jj <= i__2; ++jj) { i__4 = *mb; for (k = 1; k <= i__4; ++k) { t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1]; /* L30: */ } } } ia += commtrb_1.iaz; jt += commtrb_1.jtz; /* L40: */ } irm = *ilt - ia; if (irm > 0) { ja = 0; it = 0; mr = Mmin(irm,*mb); i__1 = *ncol - 2; for (kj = 0; kj <= i__1; ++kj) { i__4 = *nb; for (jj = 1; jj <= i__4; ++jj) { i__2 = mr; for (k = 1; k <= i__2; ++k) { t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1]; /* L50: */ } } ja += commtrb_1.jaz; it += commtrb_1.itz; /* L60: */ } jrm = *jlt - ja; if (jrm > 0) { i__1 = Mmin(*nb,jrm); for (jj = 1; jj <= i__1; ++jj) { i__2 = mr; for (k = 1; k <= i__2; ++k) { t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1]; /* L70: */ } } } } } else { /* T = A' + T */ i__2 = *nrow - 2; for (ki = 0; ki <= i__2; ++ki) { ja = 0; it = 0; i__1 = *ncol - 2; for (kj = 0; kj <= i__1; ++kj) { i__4 = *nb; for (jj = 1; jj <= i__4; ++jj) { i__3 = *mb; for (k = 1; k <= i__3; ++k) { t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1]; /* L80: */ } } ja += commtrb_1.jaz; it += commtrb_1.itz; /* L90: */ } jrm = *jlt - ja; if (jrm > 0) { i__1 = Mmin(*nb,jrm); for (jj = 1; jj <= i__1; ++jj) { i__3 = *mb; for (k = 1; k <= i__3; ++k) { t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1]; /* L100: */ } } } ia += commtrb_1.iaz; jt += commtrb_1.jtz; /* L110: */ } irm = *ilt - ia; if (irm > 0) { ja = 0; it = 0; mr = Mmin(irm,*mb); i__2 = *ncol - 2; for (kj = 0; kj <= i__2; ++kj) { i__3 = *nb; for (jj = 1; jj <= i__3; ++jj) { i__1 = mr; for (k = 1; k <= i__1; ++k) { t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1]; /* L120: */ } } ja += commtrb_1.jaz; it += commtrb_1.itz; /* L130: */ } jrm = *jlt - ja; if (jrm > 0) { i__2 = Mmin(*nb,jrm); for (jj = 1; jj <= i__2; ++jj) { i__1 = mr; for (k = 1; k <= i__1; ++k) { t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1]; /* L140: */ } } } } } return 0; } /* dtr2mx_ */
int Mjoin(PATL,gels) (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, ATL_CINT NRHS, TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb, TYPE *work, ATL_CINT lwork) /* * GELS solves overdetermined or underdetermined linear systems * involving an M-by-N matrix A, or its conjugate-transpose, using a QR * or LQ factorization of A. It is assumed that A has full rank. * * This is a straight translation from LAPACK 3.2.1; the only performance * improvements come from using ATLAS's improved QR (and slighly ORMQR) * implementations. * * The following options are provided: * * 1. If TRANS = 'N' and m >= n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || B - A*X ||. * * 2. If TRANS = 'N' and m < n: find the minimum norm solution of * an underdetermined system A * X = B. * * 3. If TRANS = 'C/T' and m >= n: find the minimum norm solution of * an undetermined system A**H * X = B. * * 4. If TRANS = 'C/T' and m < n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || B - A**H * X ||. * * Several right hand side vectors b and solution vectors x can be * handled in a single call; they are stored as the contiguously in the * M-by-NRHS right hand side matrix B and the N-by-NRHS solution * matrix X. * * TRANS (input) CHARACTER*1 * = 'N': the linear system involves A; * = 'C': the linear system involves A**H (complex only). * = 'T': the linear system involves A**T (real only). * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of * columns of the matrices B and X. NRHS >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the M-by-N matrix A. * if M >= N, A is overwritten by details of its QR * factorization as returned by GEQRF; * if M < N, A is overwritten by details of its LQ * factorization as returned by GELQF. * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) * On entry, the matrix B of right hand side vectors, stored * columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS * if TRANS = 'C/T'. * On exit, if INFO = 0, B is overwritten by the solution * vectors, stored columnwise: * if TRANS = 'N' and m >= n, rows 1 to n of B contain the least * squares solution vectors; the residual sum of squares for the * solution in each column is given by the sum of squares of the * modulus of elements N+1 to M in that column; * if TRANS = 'N' and m < n, rows 1 to N of B contain the * minimum norm solution vectors; * if TRANS = 'C' and m >= n, rows 1 to M of B contain the * minimum norm solution vectors; * if TRANS = 'C' and m < n, rows 1 to M of B contain the * least squares solution vectors; the residual sum of squares * for the solution in each column is given by the sum of * squares of the modulus of elements M+1 to N in that column. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= MAX(1,M,N). * * RETURNS: * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the i-th diagonal element of the * triangular factor of A is zero, so that A does not have * full rank; the least squares solution could not be * computed. */ { TYPE *TAU; #ifdef TCPLX const enum ATLAS_TRANS RTRAN = (TA == AtlasNoTrans) ? AtlasConjTrans : AtlasNoTrans; const TYPE one[3] = {ATL_rone, ATL_rzero, ATL_rzero}, *zero=one+1; TYPE wsq[4]; #else const enum ATLAS_TRANS RTRAN = (TA == AtlasNoTrans) ? AtlasTrans : AtlasNoTrans; #define one ATL_rone #define zero ATL_rzero TYPE wsq[2]; #endif TYPE *free0=NULL; TYPE anrm, bnrm; ATL_INT scalN, wlen; ATL_CINT MN = Mmin(M,N); int iascal=0, ibscal=0, ierr; /* * Quick return for degenerate cases */ if (!NRHS) return(0); else if (!M || !N) { Mjoin(PATL,geset)(Mmax(M,N), NRHS, zero, zero, B, ldb); return(0); } /* * If no workspace given, routines will simply allocate their own, we need TAU */ if (lwork == 0 || lwork < -1) { free0 = TAU = malloc(MN*ATL_sizeof); ATL_assert(TAU); work = NULL; wlen = 0; } /* * If the user is providing workspace, or doing a workspace query, we must * compute the required workspace */ else { wlen = MN; /* space needed for TAU */ if (M >= N) { ATL_assert(!Mjoin(PATL,geqrf)(M, N, A, lda, NULL, wsq, -1)); ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, M, NRHS, N, A, lda, NULL, B, ldb, wsq+(1 SHIFT), -1)); } else { ATL_assert(!Mjoin(PATL,gelqf)(M, N, A, lda, NULL, wsq, -1)); ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, N, NRHS, M, A, lda, NULL, B, ldb, wsq+(1 SHIFT), -1)); } if (wsq[1 SHIFT] > wsq[0]) wsq[0] = wsq[1 SHIFT]; wlen += wsq[0]; /* * If this was a workspace query, return optimal workspace in *work */ if (lwork == -1) { *work = wlen; return(0); } /* * Otherwise, take action if user's workspace is inadequate */ if (lwork < wlen) { if (lwork >= wlen-MN) /* users space is work, we alloc TAU */ { free0 = TAU = malloc(MN*ATL_sizeof); wlen -= MN; ATL_assert(TAU); work = work; } else if (lwork < MN) /* can't even use workspace for TAU */ { free0 = TAU = malloc(MN*ATL_sizeof); ATL_assert(TAU); work = NULL; wlen = 0; } else /* user's workspace becomes TAU; let worker routs alloc work */ { TAU = work; work = NULL; wlen = 0; } } else /* user provided adequate workspace for everything */ { wlen = lwork - MN; TAU = work; work += MN SHIFT; } } // TPSD is (TA != AtlasNoTrans) /* * =============================================================== * Scale if max elt in A is outside safe range, return if nrm is 0 * =============================================================== */ anrm = Mjoin(PATL,gemaxnrm)(M, N, A, lda); /* * If it is below it, scale matrix norm up to smallest safe number */ if (anrm > ATL_rzero && anrm < ATL_labadUNDERTHRESH) { Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadUNDERTHRESH, M, N, A, lda); iascal = 1; } /* * If matrix norm huge, scale it down by largest safe number */ else if (anrm > ATL_labadOVERTHRESH) { Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadOVERTHRESH, M, N, A, lda); iascal = 2; } /* * If norm is 0, entire matrix is 0, return zero solution */ else if (anrm == ATL_rzero) { Mjoin(PATL,geset)(Mmax(M,N), NRHS, zero, zero, B, ldb); if (free0) free(free0); return(0); } /* * =============================================================== * Scale if max elt in B is outside safe range, return if nrm is 0 * =============================================================== */ scalN = (TA != AtlasNoTrans) ? N : M; bnrm = Mjoin(PATL,gemaxnrm)(scalN, NRHS, B, ldb); /* * If it is below it, scale matrix norm up to smallest safe number */ if (bnrm > ATL_rzero && bnrm < ATL_labadUNDERTHRESH) { Mjoin(PATL,lascl)(LAMATG, 0, 0, bnrm, ATL_labadUNDERTHRESH, scalN, NRHS, B, ldb); ibscal = 1; } /* * If matrix norm huge, scale it down by largest safe number */ else if (bnrm > ATL_labadOVERTHRESH) { Mjoin(PATL,lascl)(LAMATG, 0, 0, bnrm, ATL_labadOVERTHRESH, scalN, NRHS, B, ldb); ibscal = 2; } if (M >= N) /* overdetermined system */ { /* * Compute QR factorization of A */ ATL_assert(!Mjoin(PATL,geqrf)(M, N, A, lda, TAU, work, wlen)); /* * Least-squares problem min || A * X - B || */ if (TA == AtlasNoTrans) { ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, M, NRHS, N, A, lda, TAU, B, ldb, work, wlen)); ierr = Mjoin(PATL,trtrs)(AtlasUpper, AtlasNoTrans, AtlasNonUnit, N, NRHS, A, lda, B, ldb); if (ierr) { if (free0) free(free0); return(ierr); } scalN = N; } /* * Overdetermined system of equations A' * X = B */ else /* transposed case */ { ierr = Mjoin(PATL,trtrs)(AtlasUpper, TA, AtlasNonUnit, N, NRHS, A, lda, B, ldb); if (ierr) { if (free0) free(free0); return(ierr); } Mjoin(PATL,gezero)(M-N, NRHS, B+(N SHIFT), ldb); ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, AtlasNoTrans, M, NRHS, N, A, lda, TAU, B, ldb, work, wlen)); scalN = M; } } /* * Compute LQ factorization of A */ else /* M < N */ { ATL_assert(!Mjoin(PATL,gelqf)(M, N, A, lda, TAU, work, wlen)); /* * Underdetermined system of equations A * X = B */ if (TA == AtlasNoTrans) { /* * B(1:M,1:NRHS) = inv(L) * B(1:M,1:NRHS) */ ierr = Mjoin(PATL,trtrs)(AtlasLower, AtlasNoTrans, AtlasNonUnit, M, NRHS, A, lda, B, ldb); if (ierr) { if (free0) free(free0); return(ierr); } Mjoin(PATL,gezero)(N-M, NRHS, B+(M SHIFT), ldb); ATL_assert(!Mjoin(PATL,ormlq)(AtlasLeft, RTRAN, N, NRHS, M, A, lda, TAU, B, ldb, work, wlen)); scalN = N; } /* * Overdetermined system min || A' * X - B || */ else { ATL_assert(!Mjoin(PATL,ormlq)(AtlasLeft, AtlasNoTrans, N, NRHS, M, A, lda, TAU, B, ldb, work, wlen)); /* * B(1:M,1:NRHS) := inv(L') * B(1:M,1:NRHS) */ ierr = Mjoin(PATL,trtrs)(AtlasLower, mytrans, AtlasNonUnit, M, NRHS, A, lda, B, ldb); if (ierr) { if (free0) free(free0); return(ierr); } scalN = M; } } /* * Undo scaling */ if (iascal == 1) Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadUNDERTHRESH, scalN, NRHS, B, ldb); else if (iascal == 2) Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadOVERTHRESH, scalN, NRHS, B, ldb); if (ibscal == 1) Mjoin(PATL,lascl)(LAMATG, 0, 0, ATL_labadUNDERTHRESH, bnrm, scalN, NRHS, B, ldb); else if (ibscal == 2) Mjoin(PATL,lascl)(LAMATG, 0, 0, ATL_labadOVERTHRESH, bnrm, scalN, NRHS, B, ldb); if (free0) free(free0); return(0); }
static int dmv2mx_(double *t, int *ldt, double *beta, double *a, int *lda, int *nrow, int *ncol, int *mb, int *nb, int *ilt, int *jlt) { /* System generated locals */ long t_dim1, t_offset, a_dim1, a_offset; int i__1, i__2, i__3, i__4; /* Local variables */ static int k, ia, ja, jj, ki, kj, it, jt, mr, irm, jrm; /* -- PUMMA Package routine (version 2.1) -- */ /* Jaeyoung Choi, Oak Ridge National Laboratory. */ /* Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */ /* David Walker, Oak Ridge National Laboratory. */ /* October 31, 1994. */ /* Purpose */ /* A <== T + beta*A (assume beta = 0.0, or 1.0) */ /* A is a scattered 2-D array from a condensed 2-D buffer T */ /* Parameter adjustments */ t_dim1 = *ldt; t_offset = 1 + t_dim1; t -= t_offset; a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; /* Function Body */ it = 0; ia = 0; /* A <== T */ if (*beta == 0.) { /* If NPROW = 1, use DCOPY */ if (*nrow == 1) { jt = 0; ja = 0; i__1 = *ncol - 2; for (kj = 0; kj <= i__1; ++kj) { i__2 = *nb; for (jj = 1; jj <= i__2; ++jj) { i__3 = Mmin(*mb,*ilt); HPL_dcopy(i__3, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1); /* L10: */ } jt += *nb; ja += commtrb_1.jtz; /* L20: */ } jrm = *jlt - ja; if (jrm > 0) { i__1 = Mmin(*nb,jrm); for (jj = 1; jj <= i__1; ++jj) { i__2 = Mmin(*mb,*ilt); HPL_dcopy(i__2, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1); /* L30: */ } } } else { i__1 = *nrow - 2; for (ki = 0; ki <= i__1; ++ki) { jt = 0; ja = 0; i__2 = *ncol - 2; for (kj = 0; kj <= i__2; ++kj) { i__3 = *nb; for (jj = 1; jj <= i__3; ++jj) { i__4 = *mb; for (k = 1; k <= i__4; ++k) { a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1]; /* L40: */ } } jt += *nb; ja += commtrb_1.jtz; /* L50: */ } jrm = *jlt - ja; if (jrm > 0) { i__2 = Mmin(*nb,jrm); for (jj = 1; jj <= i__2; ++jj) { i__4 = *mb; for (k = 1; k <= i__4; ++k) { a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1]; /* L60: */ } } } it += *mb; ia += commtrb_1.itz; /* L70: */ } irm = *ilt - ia; if (irm > 0) { jt = 0; ja = 0; mr = Mmin(*mb,irm); i__1 = *ncol - 2; for (kj = 0; kj <= i__1; ++kj) { i__4 = *nb; for (jj = 1; jj <= i__4; ++jj) { i__2 = mr; for (k = 1; k <= i__2; ++k) { a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1]; /* L80: */ } } jt += *nb; ja += commtrb_1.jtz; /* L90: */ } jrm = *jlt - ja; if (jrm > 0) { i__1 = Mmin(*nb,jrm); for (jj = 1; jj <= i__1; ++jj) { i__2 = mr; for (k = 1; k <= i__2; ++k) { a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1]; /* L100: */ } } } } } /* A <== T + A */ } else { /* If NPROW = 1, use DAXPY */ if (*nrow == 1) { jt = 0; ja = 0; i__2 = *ncol - 2; for (kj = 0; kj <= i__2; ++kj) { i__1 = *nb; for (jj = 1; jj <= i__1; ++jj) { i__4 = Mmin(*mb,*ilt); HPL_daxpy(i__4, 1.0, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1); /* L110: */ } jt += *nb; ja += commtrb_1.jtz; /* L120: */ } jrm = *jlt - ja; if (jrm > 0) { i__2 = Mmin(*nb,jrm); for (jj = 1; jj <= i__2; ++jj) { i__1 = Mmin(*mb,*ilt); HPL_daxpy(i__1, 1.0, &t[(jt + jj) * t_dim1 + 1], 1, & a[(ja + jj) * a_dim1 + 1], 1); /* L130: */ } } } else { i__2 = *nrow - 2; for (ki = 0; ki <= i__2; ++ki) { jt = 0; ja = 0; i__1 = *ncol - 2; for (kj = 0; kj <= i__1; ++kj) { i__4 = *nb; for (jj = 1; jj <= i__4; ++jj) { i__3 = *mb; for (k = 1; k <= i__3; ++k) { a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1]; /* L140: */ } } jt += *nb; ja += commtrb_1.jtz; /* L150: */ } jrm = *jlt - ja; if (jrm > 0) { i__1 = Mmin(*nb,jrm); for (jj = 1; jj <= i__1; ++jj) { i__3 = *mb; for (k = 1; k <= i__3; ++k) { a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1]; /* L160: */ } } } it += *mb; ia += commtrb_1.itz; /* L170: */ } irm = *ilt - ia; if (irm > 0) { jt = 0; ja = 0; mr = Mmin(*mb,irm); i__2 = *ncol - 2; for (kj = 0; kj <= i__2; ++kj) { i__3 = *nb; for (jj = 1; jj <= i__3; ++jj) { i__1 = mr; for (k = 1; k <= i__1; ++k) { a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1]; /* L180: */ } } jt += *nb; ja += commtrb_1.jtz; /* L190: */ } jrm = *jlt - ja; if (jrm > 0) { i__2 = Mmin(*nb,jrm); for (jj = 1; jj <= i__2; ++jj) { i__1 = mr; for (k = 1; k <= i__1; ++k) { a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1]; /* L200: */ } } } } } } return 0; } /* dmv2mx_ */
PT_TREE_T Mjoin( PATL, pthescal_nt ) ( const unsigned int THREADS, pthread_attr_t * ATTR, const enum ATLAS_UPLO UPLO, const int M, const int N, const void * ALPHA, void * A, const int LDA ) { /* * Purpose * ======= * * Mjoin( PATL, pthescal_nt ) scales a trapezoidal Hermitian m-by-n matrix * A by the real scalar alpha. The imaginary parts of the diagonal ele- * ments of A need not be set on input, they are assumed to be zero, and * on exit they are set to zero. * * This is a multi-threaded version of the algorithm. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ PT_TREE_T root = NULL; PT_MISC_TYPE_T type; double tblks, tmnblks; unsigned int nthreads; int mn, nb, nbm1; /* .. * .. Executable Statements .. * */ /* * Make sure we don't thread this for the time being */ if( THREADS >= 1 ) { Mjoin( PATL, hescal )( UPLO, M, N, ((TYPE *)(ALPHA))[0], (TYPE *)(A), LDA ); return( root ); } nbm1 = ( nb = Mjoin( PATL, GetNB )() ) - 1; mn = Mmin( M, N ); tmnblks = (double)( (mn+nbm1) / nb ); tblks = tmnblks * tmnblks; if( UPLO == AtlasLower ) { tblks += (double)( (N+nbm1) / nb ) * (double)( (M-mn+nbm1) / nb ); } else { tblks += (double)( (M+nbm1) / nb ) * (double)( (N-mn+nbm1) / nb ); } if( ( tblks <= (double)(ATL_XOVER_MI_DEFAULT) ) || ( THREADS <= 1 ) ) { Mjoin( PATL, hescal )( UPLO, M, N, ((TYPE *)(ALPHA))[0], (TYPE *)(A), LDA ); return( root ); } if( tblks >= (double)(THREADS) ) { nthreads = THREADS; } else { nthreads = (unsigned int)floor( tblks + 0.5 ); } type.size = sizeof( TYPE[2] ); type.fun = Mjoin( PATL, pthescal0 ); if( UPLO == AtlasLower ) { root = ATL_Stzscal( &type, 0, nthreads, ATTR, nb, AtlasLower, M - mn, 0, mn, ALPHA, A, LDA ); } else { root = ATL_Stzscal( &type, 0, nthreads, ATTR, nb, AtlasUpper, 0, N - mn, mn, ALPHA, A, LDA ); } ATL_thread_tree( root, ATTR ); return( root ); /* * End of Mjoin( PATL, pthescal_nt ) */ }
void Mjoin( PATL, hpmv ) ( const enum ATLAS_UPLO UPLO, const int N, const SCALAR ALPHA, const TYPE * A, const TYPE * X, const int INCX, const SCALAR BETA, TYPE * Y, const int INCY ) { /* * Purpose * ======= * * Mjoin( PATL, hpmv ) performs the matrix-vector operation * * y := alpha * A * x + beta * y, * * where alpha and beta are scalars, x and y are n-element vectors and A * is an n by n Hermitian matrix, supplied in packed form. * * This is a blocked version of the algorithm. For a more detailed des- * cription of the arguments of this function, see the reference imple- * mentation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ void (*gpmv0)( const int, const int, const SCALAR, const TYPE *, const int, const TYPE *, const int, const SCALAR, TYPE *, const int ); void (*gpmv1)( const int, const int, const SCALAR, const TYPE *, const int, const TYPE *, const int, const SCALAR, TYPE *, const int ); void (*gpmvN)( const int, const int, const SCALAR, const TYPE *, const int, const TYPE *, const int, const SCALAR, TYPE *, const int ); #ifdef TREAL TYPE alphaY, beta0; #define one ATL_rone #define zero ATL_rzero #else const TYPE * alphaY, * beta0; const TYPE one [2] = { ATL_rone, ATL_rzero }, zero[2] = { ATL_rzero, ATL_rzero }; #endif void * vx = NULL, * vy = NULL; TYPE * A0, * A1, * x, * x0, * x1, * y, * y00, * y0, * y1; int incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1, n, nb; /* .. * .. Executable Statements .. * */ if( N == 0 ) return; if( SCALAR_IS_ZERO( ALPHA ) ) { if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY ); return; } if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) ) { vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) ); ATL_assert( vx ); x = ATL_AlignPtr( vx ); Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 ); alphaY = one; } else { x = (TYPE *)(X); alphaY = ALPHA; } if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) ) { vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) ); ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy ); beta0 = zero; } else { y00 = y = (TYPE *)(Y); beta0 = BETA; } ATL_GetPartSPMV( A, N, &mb, &nb ); mb1 = N - ( ( N - 1 ) / mb ) * mb; incXY1 = (nb SHIFT); if( UPLO == AtlasUpper ) { if( SCALAR_IS_ZERO( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 ); else if( SCALAR_IS_ONE ( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 ); else gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 ); gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 ); gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 ); lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 ); incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY; for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY, y0 += incXY, y += incXY ) { Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y ); for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0; j < n; j += nb, x1 += incXY1, y1 += incXY1 ) { jb = n - j; jb = Mmin( jb, nb ); gpmv0( jb, mb, one, A1, lda1, x, 1, beta0, y1, 1 ); gpmvN( mb, jb, one, A1, lda1, x1, 1, one, y, 1 ); MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT); } beta0 = one; gpmv0 = gpmv1; lda = lda0; A = A0; MUpnext( mb, A0, lda0 ); } Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y ); } else { if( SCALAR_IS_ZERO( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 ); else if( SCALAR_IS_ONE ( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 ); else gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 ); gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 ); gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 ); lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda ); incXY = (mb SHIFT); x0 = x; y0 = y; for( n = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0; n -= mb, x -= incXY, y -= incXY ) { MLpprev( mb, A, lda ); Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y ); for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0; j < n; j += nb, x1 += incXY1, y1 += incXY1 ) { jb = n - j; jb = Mmin( jb, nb ); gpmv0( jb, mb, one, A1, lda1, x, 1, beta0, y1, 1 ); gpmvN( mb, jb, one, A1, lda1, x1, 1, one, y, 1 ); MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT); } beta0 = one; gpmv0 = gpmv1; } Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 ); } if( vx ) free( vx ); if( vy ) { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); } /* * End of Mjoin( PATL, hpmv ) */ }
static int ATL_trmvUT ( const enum ATLAS_DIAG Diag, const int nb, ATL_CINT N, const TYPE *A, ATL_CINT lda, TYPE *X, ATL_CINT incX ) /* * RETURNS: 0 if TRMV was performed, non-zero if nothing done */ { static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*); void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT, const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT); void *vp; TYPE *x, *y; const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT; size_t t0; #ifdef TCPLX size_t N2=N+N, lda2 = lda+lda; TYPE one[2] = {ATL_rone, ATL_rzero}; #else #define N2 N #define lda2 lda #define one ATL_rone #endif const size_t incA = ((size_t)lda2)*nb; ATL_INT j; if (N < nb+nb) return(1); if (opsize > MY_CE) gemv = Mjoin(PATL,gemvT); else gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) : Mjoin(PATL,gemvT_L2); trmvK = (Diag == AtlasNonUnit) ? ATL_trmvUTNk : ATL_trmvUTUk; /* * If X is aligned to Cachelen wt inc=1, use it as y */ t0 = (size_t) X; if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0)) { ATL_INT i; vp = malloc(ATL_Cachelen+ATL_MulBySize(N)); if (!vp) return(2); x = ATL_AlignPtr(vp); y = X; for (i=0; i < N2; i++) { x[i] = X[i]; X[i] = ATL_rzero; } } else /* allocate both X and Y */ { vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1); if (!vp) return(3); x = ATL_AlignPtr(vp); y = x + N2; y = ATL_AlignPtr(y); Mjoin(PATL,copy)(N, X, incX, x, 1); Mjoin(PATL,zero)(N, y, 1); } trmvK(nb, A, lda, x, y); A += incA; for (j=nb; j < N; j += nb, A += incA) { int kb = N-j; #ifdef TCPLX const register size_t j2 = j + j; #else #define j2 j #endif kb = Mmin(nb, kb); gemv(j, kb, one, A, lda, x, 1, one, y+j2, 1); trmvK(kb, A+j2, lda, x+j2, y+j2); #ifndef TCPLX #undef j2 #endif } if (y != X) Mjoin(PATL,copy)(N, y, 1, X, incX); free(vp); return(0); }
main(int nargs, char *args[]) /* * tst <tst> <# TA> <TA's> <# TB's> <TB's> <M0> <MN> <incM> <N0> <NN> <incN> * <K0> <KN> <incK> <# alphas> <alphas> <# betas> <betas> * */ { int M0, MN, incM, N0, NN, incN, K0, KN, incK, lda, ldb, ldc, MFLOP; int i, k, m, n, im, in, ik, ita, itb, ia, ib, nTA, nTB, nalph, nbeta; int itst=0, ipass=0, TEST, LDA_IS_M, MSAME=0, KSAME=0; int ndiag, nuplo, nside; TYPE *alph, *beta, *A, *B, *C, *D=NULL; #ifdef TREAL TYPE bet1 = 1.0, alp1 = -1.0; #else TYPE bet1[2] = {1.0, 0.0}, alp1[2] = {-1.0, 0.0}; #endif char TA, TB; enum ATLAS_SIDE *Side; enum ATLAS_UPLO *Uplo; enum ATLAS_TRANS *TransA, *TransB, TAc, TBc; enum ATLAS_DIAG *Diag; int CACHESIZE; GetFlags(nargs, args, &TEST, &nside, &Side, &nuplo, &Uplo, &nTA, &TransA, &nTB, &TransB, &ndiag, &Diag, &M0, &MN, &incM, &N0, &NN, &incN, &K0, &KN, &incK, &nalph, &alph, &nbeta, &beta, &LDA_IS_M, &MFLOP,&CACHESIZE); if (M0 == -1) { MSAME = 1; M0 = MN = incM = NN; } if (K0 == -1) { KSAME = 1; K0 = KN = incK = NN; } if (!MFLOP) { A = malloc(MN*KN*ATL_sizeof); B = malloc(NN*KN*ATL_sizeof); C = malloc(MN*NN*ATL_sizeof); if (TEST) D = malloc(MN*NN*ATL_sizeof); else D = NULL; if (!A || !B || !C || (TEST && !D)) { fprintf(stderr, "Not enough memory to run tests!!\n"); exit(-1); } } /* * Page the code in from disk, so first timing doesn't blow */ if (MFLOP) { mmcase0(10, 1, 'n', 'n', 100, 100, 100, alp1, 100, 100, bet1, 100); mmcase0(10, 1, 'n', 't', 100, 100, 100, alp1, 100, 100, bet1, 100); mmcase0(10, 1, 't', 'n', 100, 100, 100, alp1, 100, 100, bet1, 100); mmcase0(10, 1, 't', 't', 100, 100, 100, alp1, 100, 100, bet1, 100); } else { m = Mmin(100, MN); k = Mmin(100, KN); n = Mmin(100, NN); matgen(m, k, A, m, m*k); matgen(k, n, B, k, n*k); matgen(m, n, C, m, m*n); TA = TB = 'N'; TAc = TBc = AtlasNoTrans; trusted_gemm(TAc, TBc, m, n, k, alp1, A, m, B, k, bet1, C, m); test_gemm(TAc, TBc, m, n, k, alp1, A, m, B, k, bet1, C, m); } #ifdef TREAL printf("\nTEST TA TB M N K alpha beta Time Mflop SpUp PASS\n"); printf("==== == == === === === ===== ===== ====== ===== ==== ====\n\n"); #else printf("\nTEST TA TB M N K alpha beta Time Mflop SpUp PASS\n"); printf("==== == == === === === ===== ===== ===== ===== ====== ===== ==== ====\n\n"); #endif for (im=M0; im <= MN; im += incM) { for (n=N0; n <= NN; n += incN) { if (MSAME) m = n; else m = im; for (ik=K0; ik <= KN; ik += incK) { if (KSAME) k = n; else k = ik; for (ita=0; ita != nTA; ita++) { if (TransA[ita] == AtlasNoTrans) TA = 'N'; else if (TransA[ita] == AtlasTrans) TA = 'T'; else if (TransA[ita] == AtlasConjTrans) TA = 'C'; for (itb=0; itb != nTB; itb++) { if (TransB[itb] == AtlasNoTrans) TB = 'N'; else if (TransB[itb] == AtlasTrans) TB = 'T'; else if (TransB[itb] == AtlasConjTrans) TB = 'C'; for (ia=0; ia != nalph; ia++) { for (ib=0; ib != nbeta; ib++) { itst++; if (LDA_IS_M) { if (TA == 'n' || TA == 'N') lda = m; else lda = k; if (TB == 'n' || TB == 'N') ldb = k; else ldb = n; ldc = m; } else { if (TA == 'n' || TA == 'N') lda = MN; else lda = KN; if (TB == 'n' || TB == 'N') ldb = KN; else ldb = NN; ldc = MN; } if (MFLOP) { ipass++; #ifdef TREAL mmcase0(MFLOP, CACHESIZE, TA, TB, m, n, k, alph[ia], lda, ldb, beta[ib], ldc); #else mmcase0(MFLOP, CACHESIZE, TA, TB, m, n, k, alph+(ia SHIFT), lda, ldb, beta+(ib SHIFT), ldc); #endif } else { #ifdef TREAL ipass += mmcase(TEST, CACHESIZE, TA, TB, m, n, k, alph[ia], A, lda, B, ldb, beta[ib], C, ldc, D,ldc); #else ipass += mmcase(TEST, CACHESIZE, TA, TB, m, n, k, alph+(ia SHIFT), A, lda, B, ldb, beta+(ib SHIFT), C, ldc, D,ldc); #endif } } } } } } } } if (TEST && !MFLOP) printf("\nNTEST=%d, NUMBER PASSED=%d, NUMBER FAILURES=%d\n", itst, ipass, itst-ipass); else printf("\nDone with %d timing runs\n",itst); free(Side); free(Uplo); free(TransA); free(TransB); free(Diag); free(alph); free(beta); if (!MFLOP) { free(A); free(B); free(C); if (D) free(D); } exit(0); }
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order, int M, int N, int lda) { char *cord = (Order == AtlasColMajor ? "Col" : "Row"); const double maxMN = Mmax(M,N), minMN = Mmin(M,N); unsigned long nreps=0; int npiv=(-1), *ipiv; const int incA = (Order == AtlasColMajor ? N*lda : M*lda); double mflops, mflop, resid, tim=(-1.0), t0; TYPE *A, *a; int i; #ifdef TREAL mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) - (minMN*minMN) / 2.0; #else mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) + (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN; #endif mflops /= 1000000.0; if (thresh > ATL_rzero) { if (Order == AtlasColMajor) resid = lutestC(CacheSize, M, N, lda, &npiv, &tim); else resid = lutestR(CacheSize, M, N, lda, &npiv, &tim); } else resid = -1.0; if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */ { nreps = (mflops*1000000); nreps = (MFLOP*1000000 + nreps-1) / nreps; if (nreps < 1) nreps = 1; i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL; i = (i + M*N) / (M*N); if (i < nreps) i = nreps; /* don't reuse mem or no pivoting */ a = A = malloc(i * ATL_MulBySize(incA)); if (A != NULL) { ipiv = malloc(Mmin(M,N)*sizeof(int)); /* what the hell - reuse ipiv */ if (ipiv) { Mjoin(PATL,gegen)(i*incA, 1, A, i*incA, incA+M+3012); t0 = time00(); for (i=nreps; i; i--, a += incA) test_getrf(Order, M, N, a, lda, ipiv); tim = time00() - t0; tim /= nreps; if (npiv == 0) npiv = findnpvt(Mmin(M,N), ipiv); free(ipiv); } else fprintf(stderr, " WARNING: not enough mem to run timings!\n"); free(A); } else fprintf(stderr, " WARNING: not enough mem to run timings!\n"); } if (tim > 0.0) mflop = mflops / tim; else mflop = 0.0; fprintf(stdout, "%5d %3s %6d %6d %6d %6d %9.3f %9.3f %9.3e\n", nreps, cord, M, N, lda, npiv, tim, mflop, resid); return(resid <= thresh); }