static TYPE geresid(enum CBLAS_ORDER Order, int N, TYPE *A, int lda, TYPE *AI, int ldi) /* * returns ||A - AI|| / (N * eps * ||A|| * ||AI||); * for row-major, we are not using 1-norm, since we are adding rows instead * of cols, but it should be an equally good norm, so don't worry about it. */ { TYPE numer, denom, eps; const int ldcp1 = (N+1)SHIFT; TYPE *C; int i; #ifdef TREAL TYPE one = ATL_rone, zero = ATL_rzero; #else TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero}; #endif eps = Mjoin(PATL,epsilon)(); C = malloc(N*ATL_MulBySize(N)); ATL_assert(C); cblas_gemm(Order, CblasNoTrans, CblasNoTrans, N, N, N, one, A, lda, AI, ldi, zero, C, N); /* C now has A*inv(A) */ for (i=0; i != N; i++) C[i*ldcp1] -= ATL_rone; /* C now has A*inv(A)-I */ numer = Mjoin(PATL,genrm1)(N, N, C, N); denom = Mjoin(PATL,genrm1)(N, N, A, lda) * Mjoin(PATL,genrm1)(N, N, AI, ldi) * N * eps; free(C); return(numer/denom); }
int ATL_getriC(const int N, TYPE *A, const int lda, const int *ipiv, TYPE *wrk, const int lwrk) { const int lda2 = lda SHIFT; int J, jb, nb, nright, iret; TYPE *A0 = A; #ifdef TREAL const TYPE one=ATL_rone, none=ATL_rnone; #else const TYPE one[2]={ATL_rone,ATL_rzero}, none[2]={ATL_rnone, ATL_rzero}; #endif iret = ATL_trtri(CblasColMajor, CblasUpper, CblasNonUnit, N, A, lda); if (!iret && N > 1) { /* * Find largest NB we can use with our provided workspace */ jb = lwrk / N; if (jb >= NB) nb = ATL_MulByNB(ATL_DivByNB(jb)); else if (jb >= ATL_mmNU) nb = (jb/ATL_mmNU)*ATL_mmNU; else nb = jb; if (!nb) return(-6); /* need at least 1 col of workspace */ /* * Only first iteration will have partial block, unroll it */ jb = N - (N/nb)*nb; if (!jb) jb = nb; J = N - jb; A += lda2*J; trcpzeroL(jb, jb, A+(J SHIFT), lda, wrk, jb); cblas_trsm(CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasUnit, N, jb, one, wrk, jb, A, lda); if (J) { do { J -= nb; A -= nb*lda2; nright = N-J; trcpzeroL(nright, nb, A+(J SHIFT), lda, wrk, nright); cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, nb, nright-nb, none, A+nb*lda2, lda, wrk+(nb SHIFT), nright, one, A, lda); cblas_trsm(CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasUnit, N, nb, one, wrk, nright, A, lda); } while(J); } /* * Apply column interchanges */ for (J=N-2; J >= 0; J--) { jb = ipiv[J]; if (jb != J) cblas_swap(N, A+J*lda2, 1, A+jb*lda2, 1); } } return(iret); }
int ATL_getrfC(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Column-major factorization of form * A = P * L * U * where P is a row-permutation matrix, L is lower triangular with unit diagonal * elements (lower trapazoidal if M > N), and U is upper triangular (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nleft, Nright, k, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ac, *An; if (((size_t)M)*N <= ATL_L1elts) return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv)); #if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA) if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin && ((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS) { if (N >= 16) ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv); else ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv); return(ierr); } #endif if (MN > ATL_luMmin) { Nleft = MN >> 1; #ifdef NB if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; i = ATL_getrfC(M, Nleft, A, lda, ipiv); /* factor left to L & U */ if (i) if (!ierr) ierr = i; /* * Update trailing submatrix */ Ac = A + (Nleft * lda SHIFT); An = Ac + (Nleft SHIFT); ATL_laswp(Nright, Ac, lda, 0, Nleft, ipiv, 1); cblas_trsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, Nleft, Nright, one, A, lda, Ac, lda); cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M-Nleft, Nright, Nleft, none, A+(Nleft SHIFT), lda, Ac, lda, one, An, lda); i = ATL_getrfC(M-Nleft, Nright, An, lda, ipiv+Nleft); if (i) if (!ierr) ierr = i + Nleft; for (i=Nleft; i != MN; i++) ipiv[i] += Nleft; ATL_laswp(Nleft, A, lda, Nleft, MN, ipiv, 1); }
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Row-major factorization of form * A = L * U * P * where P is a column-permutation matrix, L is lower triangular (lower * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nup, Ndown, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ar, *Ac, *An; if (MN > 1) { Nup = MN >> 1; #ifdef NB if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup)); #endif Ndown = M - Nup; i = ATL_getrfR(Nup, N, A, lda, ipiv); if (i) if (!ierr) ierr = i; Ar = A + (Nup * lda SHIFT); Ac = A + (Nup SHIFT); An = Ar + (Nup SHIFT); ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1); /* apply pivots */ cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, Ndown, Nup, one, A, lda, Ar, lda); cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup, none, Ar, lda, Ac, lda, one, An, lda); i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup); if (i) if (!ierr) ierr = Nup + i; for (i=Nup; i != MN; i++) ipiv[i] += Nup; ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1); /* apply pivots */ }