int ATL_getrfC(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Column-major factorization of form * A = P * L * U * where P is a row-permutation matrix, L is lower triangular with unit diagonal * elements (lower trapazoidal if M > N), and U is upper triangular (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nleft, Nright, k, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ac, *An; if (((size_t)M)*N <= ATL_L1elts) return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv)); #if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA) if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin && ((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS) { if (N >= 16) ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv); else ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv); return(ierr); } #endif if (MN > ATL_luMmin) { Nleft = MN >> 1; #ifdef NB if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; i = ATL_getrfC(M, Nleft, A, lda, ipiv); /* factor left to L & U */ if (i) if (!ierr) ierr = i; /* * Update trailing submatrix */ Ac = A + (Nleft * lda SHIFT); An = Ac + (Nleft SHIFT); ATL_laswp(Nright, Ac, lda, 0, Nleft, ipiv, 1); cblas_trsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, Nleft, Nright, one, A, lda, Ac, lda); cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M-Nleft, Nright, Nleft, none, A+(Nleft SHIFT), lda, Ac, lda, one, An, lda); i = ATL_getrfC(M-Nleft, Nright, An, lda, ipiv+Nleft); if (i) if (!ierr) ierr = i + Nleft; for (i=Nleft; i != MN; i++) ipiv[i] += Nleft; ATL_laswp(Nleft, A, lda, Nleft, MN, ipiv, 1); }
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Row-major factorization of form * A = L * U * P * where P is a column-permutation matrix, L is lower triangular (lower * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nup, Ndown, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ar, *Ac, *An; if (MN > 1) { Nup = MN >> 1; #ifdef NB if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup)); #endif Ndown = M - Nup; i = ATL_getrfR(Nup, N, A, lda, ipiv); if (i) if (!ierr) ierr = i; Ar = A + (Nup * lda SHIFT); Ac = A + (Nup SHIFT); An = Ar + (Nup SHIFT); ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1); /* apply pivots */ cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, Ndown, Nup, one, A, lda, Ar, lda); cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup, none, Ar, lda, Ac, lda, one, An, lda); i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup); if (i) if (!ierr) ierr = Nup + i; for (i=Nup; i != MN; i++) ipiv[i] += Nup; ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1); /* apply pivots */ }
static TYPE lutestR(int CacheSize, int M, int N, int lda, int *npiv, double *tim) { TYPE *A, *LmU; int *ipiv; const int MN = Mmin(M,N); int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*M); if (A == NULL) return(-1); ipiv = malloc( MN * sizeof(int) ); if (ipiv == NULL) { free(A); return(-1); } t0 = ATL_flushcache(CacheSize); Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, M, A, lda); #endif normA = Mjoin(PATL,genrm1)(N, M, A, lda); /* actually infnrm, but OK */ t0 = ATL_flushcache(-1); t0 = time00(); test_getrf(CblasRowMajor, M, N, A, lda, ipiv); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("LU", N, M, A, lda); #endif LmU = ATL_LmulUR(M, N, A, lda); /* LmU contains L * U */ #ifdef DEBUG Mjoin(PATL,geprint)("L*U", N, M, LmU, N); #endif Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); /* regenerate A, overwriting LU */ ATL_laswp(M, A, lda, 0, MN, ipiv, 1); /* apply swaps to A */ resid = Mjoin(PATL,gediffnrm1)(N, M, A, lda, LmU, N); resid /= (normA * eps * Mmin(M,N)); *npiv = findnpvt(MN, ipiv); free(LmU); free(A); free(ipiv); return(resid); }
void ATL_getrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, const int N, const int NRHS, const TYPE *A, const int lda, const int *ipiv, TYPE *B, const int ldb) /* * OK, this pivoting crap is tricky. The trick is, when we pivot columns * of the matrix, this effects X but not B, and when we pivot rows, this * effects B, but not X. So, must never attempt to apply a Pr * (row permutation matrix) to X or a Pc to B. */ { enum CBLAS_DIAG Lunit, Uunit; #ifdef TREAL #define one ATL_rone #else const TYPE one[2] = {ATL_rone, ATL_rzero}; #endif if (!N || !NRHS) return; if (Order == CblasColMajor) { /* * A*X = B. Since we have pivoted A by Pr (PA=LU), we pivot B by Pr, * **and this does not effect X at all**, so we solve * X = inv(U)*inv(L)*(Pr * B) */ if (Trans == CblasNoTrans) { ATL_laswp(NRHS, B, ldb, 0, N, ipiv, 1); cblas_trsm(Order, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); } /* * trans(L*U = PA) ==> U' L' = A' P, so P is Pc, and does not effect B, * U' L' Pc X = B ==> Pc X = inv(L') * inv(U') * B, but we want * X, not Pc X, so we apply inv(Pc) after doing these steps. */ else { cblas_trsm(Order, CblasLeft, CblasUpper, Trans, CblasNonUnit, N, NRHS, one, A, lda, B, ldb); cblas_trsm(Order, CblasLeft, CblasLower, Trans, CblasUnit, N, NRHS, one, A, lda, B, ldb); ATL_laswp(NRHS, B, ldb, 0, N, ipiv, -1); } } /* * For row-major arrays, we actually have X^T and B^T, so must tranpose * both sides of equation, so what we are solving is: X' * A' = B' */ else { /* * A = LU*inv(Pc), X' * (LU*inv(Pc))' = B' ==> X' * inv(Pc) * U' * L' = B' * X' * inv(Pc) = U' * L' * B', so apply inv(Pc) after solves. */ if (Trans == CblasNoTrans) { cblas_trsm(Order, CblasRight, CblasLower, CblasTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasUpper, CblasTrans, CblasUnit, NRHS, N, one, A, lda, B, ldb); ATL_laswp(NRHS, B, ldb, 0, N, ipiv, -1); } /* * A' = (LU*inv(Pc))', but Pc is on rows of non-trans matrix, so: * X' * (inv(Pr)*L*U) = B' * X' = (Pr * B') * inv(U) * inv(L) * NOTE: this case is untested */ else { ATL_laswp(NRHS, B, ldb, 0, N, ipiv, 1); cblas_trsm(Order, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, NRHS, N, one, A, lda, B, ldb); cblas_trsm(Order, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, NRHS, N, one, A, lda, B, ldb); } } }