int ATL_potrfRL(const int N, TYPE *A, const int lda) { TYPE *An, *Ar; int Nleft, Nright, ierr; static const TYPE ONE[2] = {ATL_rone, ATL_rzero}; const int lda2=lda+lda; if (N > 1) { Nleft = N >> 1; #ifdef NB if (Nleft > NB<<1) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; ierr = ATL_potrfRL(Nleft, A, lda); if (!ierr) { Ar = A + Nleft * lda2; An = Ar + Nleft+Nleft; cblas_trsm(CblasRowMajor, CblasRight, CblasLower, CblasConjTrans, CblasNonUnit, Nright, Nleft, ONE, A, lda, Ar, lda); cblas_herk(CblasRowMajor, CblasLower, CblasNoTrans, Nright, Nleft, ATL_rnone, Ar, lda, ATL_rone, An, lda); ierr = ATL_potrfRL(Nright, An, lda); if (ierr) return(ierr+Nleft); } else return(ierr); }
int ATL_getrfC(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Column-major factorization of form * A = P * L * U * where P is a row-permutation matrix, L is lower triangular with unit diagonal * elements (lower trapazoidal if M > N), and U is upper triangular (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nleft, Nright, k, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ac, *An; if (((size_t)M)*N <= ATL_L1elts) return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv)); #if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA) if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin && ((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS) { if (N >= 16) ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv); else ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv); return(ierr); } #endif if (MN > ATL_luMmin) { Nleft = MN >> 1; #ifdef NB if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; i = ATL_getrfC(M, Nleft, A, lda, ipiv); /* factor left to L & U */ if (i) if (!ierr) ierr = i; /* * Update trailing submatrix */ Ac = A + (Nleft * lda SHIFT); An = Ac + (Nleft SHIFT); ATL_laswp(Nright, Ac, lda, 0, Nleft, ipiv, 1); cblas_trsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, Nleft, Nright, one, A, lda, Ac, lda); cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M-Nleft, Nright, Nleft, none, A+(Nleft SHIFT), lda, Ac, lda, one, An, lda); i = ATL_getrfC(M-Nleft, Nright, An, lda, ipiv+Nleft); if (i) if (!ierr) ierr = i + Nleft; for (i=Nleft; i != MN; i++) ipiv[i] += Nleft; ATL_laswp(Nleft, A, lda, Nleft, MN, ipiv, 1); }
int ATL_trtriRL(const enum ATLAS_DIAG Diag, const int N, TYPE *A, const int lda) { int ierr = 0; TYPE *Age, *Atr; TYPE tmp; int Nleft, Nright; #ifdef TREAL #define one ATL_rone #define mone -ATL_rone #define none ATL_rnone #else static const TYPE one[2] = {ATL_rone, ATL_rzero}; static const TYPE mone[2] = {-ATL_rone, ATL_rzero}; static const TYPE none[2] = {ATL_rnone, ATL_rzero}; #endif #ifdef TREAL if (N > REAL_RECURSE_LIMIT) #else if (N > 1) #endif { Nleft = N >> 1; #ifdef NB if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; Age = A + ((Nleft*lda) SHIFT); Atr = A + (Nleft * (lda+1) SHIFT); cblas_trsm(AtlasRowMajor, AtlasRight, AtlasLower, AtlasNoTrans, Diag, Nright, Nleft, one, A, lda, Age, lda); cblas_trsm(AtlasRowMajor, AtlasLeft, AtlasLower, AtlasNoTrans, Diag, Nright, Nleft, mone, Atr, lda, Age, lda); ierr = ATL_trtriRL(Diag, Nleft, A, lda); if (ierr!=0) return(ierr); ierr = ATL_trtriRL(Diag, Nright, Atr, lda); if (ierr!=0) return(ierr+Nleft); }
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Row-major factorization of form * A = L * U * P * where P is a column-permutation matrix, L is lower triangular (lower * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nup, Ndown, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ar, *Ac, *An; if (MN > 1) { Nup = MN >> 1; #ifdef NB if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup)); #endif Ndown = M - Nup; i = ATL_getrfR(Nup, N, A, lda, ipiv); if (i) if (!ierr) ierr = i; Ar = A + (Nup * lda SHIFT); Ac = A + (Nup SHIFT); An = Ar + (Nup SHIFT); ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1); /* apply pivots */ cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, Ndown, Nup, one, A, lda, Ar, lda); cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup, none, Ar, lda, Ac, lda, one, An, lda); i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup); if (i) if (!ierr) ierr = Nup + i; for (i=Nup; i != MN; i++) ipiv[i] += Nup; ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1); /* apply pivots */ }
int Mjoin(PATL,trtrs) (const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TA, const enum ATLAS_DIAG Diag, ATL_CINT N, ATL_CINT NRHS, const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb) /* * Checks for singularity, and then solves system: * A * X = B or A^T * X = B * where A is a triangular matrix (as indicated by Uplo). * RETURNS : * 0 : successful exit * <0 : argument #(-return) had illegal value (start counting from 1) * >0 : diag elt # (1st elt 1) was zero, so A is singular */ { #ifdef TCPLX TYPE one[2] = {ATL_rone, ATL_rzero}; ATL_CINT N2=N+N; #else #define one ATL_rone #endif ATL_CINT ldap1 = (lda+1)SHIFT; ATL_INT i; /* * Zero on diagonal means singular triangular matrix */ if (Diag != AtlasUnit) { #ifdef TCPLX for (i=0; i < N2; i += 2, A += ldap1) if (SCALAR_IS_ZERO(A)) return((i>>1)+1); #else for (i=0; i < N; i++, A += ldap1) if (*A == ATL_rzero) return(i+1); #endif A -= ldap1*N; } cblas_trsm(CblasColMajor, AtlasLeft, Uplo, TA, Diag, N, NRHS, one, A, lda, B, ldb); return(0); }
int ATL_potrfL(const int N, TYPE *A, const int lda) { TYPE *An, *Ar; const size_t lda2=(lda SHIFT); int Nleft, Nright, ierr; #ifdef TREAL #define lda2 lda #define ONE ATL_rone #else static const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif #ifdef TREAL if (N > 4) #else if (N > 1) #endif { Nleft = N >> 1; #ifdef NB if (Nleft > NB<<1) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; ierr = ATL_potrfL(Nleft, A, lda); if (!ierr) { Ar = A + (Nleft SHIFT); An = Ar + lda2 * Nleft; cblas_trsm(CblasColMajor, CblasRight, CblasLower, llt_trans, CblasNonUnit, Nright, Nleft, ONE, A, lda, Ar, lda); llt_syrk(CblasColMajor, CblasLower, CblasNoTrans, Nright, Nleft, ATL_rnone, Ar, lda, ATL_rone, An, lda); ierr = ATL_potrfL(Nright, An, lda); if (ierr) return(ierr+Nleft); } else return(ierr); }
void ATL_getrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, const int N, const int NRHS, const TYPE *A, const int lda, const int *ipiv, TYPE *B, const int ldb) /* * OK, this pivoting crap is tricky. The trick is, when we pivot columns * of the matrix, this effects X but not B, and when we pivot rows, this * effects B, but not X. So, must never attempt to apply a Pr * (row permutation matrix) to X or a Pc to B. */ { enum CBLAS_DIAG Lunit, Uunit; #ifdef TREAL #define one ATL_rone #else const TYPE one[2] = {ATL_rone, ATL_rzero}; #endif if (!N || !NRHS) return; if (Order == CblasColMajor) { /* * A*X = B. Since we have pivoted A by Pr (PA=LU), we pivot B by Pr, * **and this does not effect X at all**, so we solve * X = inv(U)*inv(L)*(Pr * B) */ if (Trans == CblasNoTrans) { ATL_laswp(NRHS, B, ldb, 0, N, ipiv, 1); cblas_trsm(Order, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); } /* * trans(L*U = PA) ==> U' L' = A' P, so P is Pc, and does not effect B, * U' L' Pc X = B ==> Pc X = inv(L') * inv(U') * B, but we want * X, not Pc X, so we apply inv(Pc) after doing these steps. */ else { cblas_trsm(Order, CblasLeft, CblasUpper, Trans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasLower, Trans, CblasUnit, N, NRHS, one, A, lda, B, ldb); ATL_laswp(NRHS, B, ldb, 0, N, ipiv, -1); } } /* * For row-major arrays, we actually have X^T and B^T, so must tranpose * both sides of equation, so what we are solving is: X' * A' = B' */ else { /* * A = LU*inv(Pc), X' * (LU*inv(Pc))' = B' ==> X' * inv(Pc) * U' * L' = B' * X' * inv(Pc) = U' * L' * B', so apply inv(Pc) after solves. */ if (Trans == CblasNoTrans) { cblas_trsm(Order, CblasRight, CblasLower, CblasTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasUpper, CblasTrans, CblasUnit, NRHS, N, one, A, lda, B, ldb); ATL_laswp(NRHS, B, ldb, 0, N, ipiv, -1); } /* * A' = (LU*inv(Pc))', but Pc is on rows of non-trans matrix, so: * X' * (inv(Pr)*L*U) = B' * X' = (Pr * B') * inv(U) * inv(L) * NOTE: this case is untested */ else { ATL_laswp(NRHS, B, ldb, 0, N, ipiv, 1); cblas_trsm(Order, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); } } }
void ATL_potrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const int NRHS, const TYPE *A, const int lda, TYPE *B, const int ldb) { #ifdef TCPLX int j; const int ldb2 = ldb+ldb; const TYPE one[2] = {ATL_rone, ATL_rzero}; #else #define one ATL_rone #endif if (!N || !NRHS) return; if (Order == CblasColMajor) { /* * Solve X = inv(U) * inv(U') * B */ if (Uplo == AtlasUpper) { cblas_trsm(Order, CblasLeft, CblasUpper, MyTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); } /* * Solve X = inv(L') * inv(L) * B */ else { cblas_trsm(Order, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasLower, MyTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); } } /* * For row-major, remember we have x' and b', so we must transpose usual * equations */ else { #ifdef TCPLX for (j=0; j < NRHS; j++) Mjoin(PATLU,scal)(N, -1.0, B+j*ldb2+1, 2); #endif /* * solve x^T = b^T * inv(U) * inv(U^T) * conj( x^H = b^H * inv(U) * inv(U^H) ) (complex) */ if (Uplo == CblasUpper) { cblas_trsm(Order, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasUpper, MyTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); } /* * solve x^T = b^T * inv(L^T) * inv(L) * conj( x^H = b^H * inv(L^H) * inv(L) ) (complex) */ else { cblas_trsm(Order, CblasRight, CblasLower, MyTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); } #ifdef TCPLX for (j=0; j < NRHS; j++) Mjoin(PATLU,scal)(N, -1.0, B+j*ldb2+1, 2); #endif } }