int CORE_zgetrf_incpiv(int M, int N, int IB, PLASMA_Complex64_t *A, int LDA, int *IPIV, int *INFO) { int i, j, k, sb; int iinfo; /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(5, "Illegal value of LDA"); return -5; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; k = min(M, N); for(i =0 ; i < k; i += IB) { sb = min(IB, k-i); /* * Factor diagonal and subdiagonal blocks and test for exact singularity. */ iinfo = LAPACKE_zgetf2_work(LAPACK_COL_MAJOR, M-i, sb, &A[LDA*i+i], LDA, &IPIV[i]); /* * Adjust INFO and the pivot indices. */ if((*INFO == 0) && (iinfo > 0)) *INFO = iinfo + i; if (i+sb < N) { CORE_zgessm( M-i, N-(i+sb), sb, sb, &IPIV[i], &A[LDA*i+i], LDA, &A[LDA*(i+sb)+i], LDA); } for(j = i; j < i+sb; j++) { IPIV[j] = i + IPIV[j]; } } return PLASMA_SUCCESS; }
/***************************************************************************//** * Parallel tile LU factorization - static scheduling **/ void plasma_pzgetrf_incpiv(plasma_context_t *plasma) { PLASMA_desc A; PLASMA_desc L; int *IPIV; PLASMA_sequence *sequence; PLASMA_request *request; int k, m, n; int next_k; int next_m; int next_n; int ldak, ldam; int info; int tempkn, tempkm, tempmm, tempnn; int ib = PLASMA_IB; PLASMA_Complex64_t *work; plasma_unpack_args_5(A, L, IPIV, sequence, request); if (sequence->status != PLASMA_SUCCESS) return; work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp); ss_init(A.mt, A.nt, -1); k = 0; n = PLASMA_RANK; while (n >= A.nt) { k++; n = n-A.nt+k; } m = k; while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) { next_n = n; next_m = m; next_k = k; next_m++; if (next_m == A.mt) { next_n += PLASMA_SIZE; while (next_n >= A.nt && next_k < min(A.mt, A.nt)) { next_k++; next_n = next_n-A.nt+next_k; } next_m = next_k; } tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldak = BLKLDD(A, k); ldam = BLKLDD(A, m); if (n == k) { if (m == k) { ss_cond_wait(k, k, k-1); CORE_zgetrf_incpiv( tempkm, tempkn, ib, A(k, k), ldak, IPIV(k, k), &info); if (info != 0 && m == A.mt-1) { plasma_request_fail(sequence, request, info + A.nb*k); ss_abort(); } ss_cond_set(k, k, k); } else { ss_cond_wait(m, k, k-1); CORE_ztstrf( tempmm, tempkn, ib, A.nb, A(k, k), ldak, A(m, k), ldam, L(m, k), L.mb, IPIV(m, k), work, L.nb, &info); if (info != 0 && m == A.mt-1) { plasma_request_fail(sequence, request, info + A.nb*k); ss_abort(); } ss_cond_set(m, k, k); } } else { if (m == k) { ss_cond_wait(k, k, k); ss_cond_wait(k, n, k-1); CORE_zgessm( tempkm, tempnn, tempkm, ib, IPIV(k, k), A(k, k), ldak, A(k, n), ldak); } else { ss_cond_wait(m, k, k); ss_cond_wait(m, n, k-1); CORE_zssssm( A.nb, tempnn, tempmm, tempnn, A.nb, ib, A(k, n), ldak, A(m, n), ldam, L(m, k), L.mb, A(m, k), ldam, IPIV(m, k)); ss_cond_set(m, n, k); } } n = next_n; m = next_m; k = next_k; } plasma_private_free(plasma, work); ss_finalize(); }
/***************************************************************************//** * Parallel forward substitution for tile LU - static scheduling **/ void plasma_pztrsmpl(plasma_context_t *plasma) { PLASMA_desc A; PLASMA_desc B; PLASMA_desc L; int *IPIV; PLASMA_sequence *sequence; PLASMA_request *request; int k, m, n; int next_k; int next_m; int next_n; int ldak, ldbk, ldam, ldbm; int tempkm, tempnn, tempkmin, tempmm, tempkn; int ib; plasma_unpack_args_6(A, B, L, IPIV, sequence, request); if (sequence->status != PLASMA_SUCCESS) return; ss_init(B.mt, B.nt, -1); ib = PLASMA_IB; k = 0; n = PLASMA_RANK; while (n >= B.nt) { k++; n = n-B.nt; } m = k; while (k < min(A.mt, A.nt) && n < B.nt) { next_n = n; next_m = m; next_k = k; next_m++; if (next_m == A.mt) { next_n += PLASMA_SIZE; while (next_n >= B.nt && next_k < min(A.mt, A.nt)) { next_k++; next_n = next_n-B.nt; } next_m = next_k; } tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; tempkmin = k == min(A.mt, A.nt)-1 ? min(A.m, A.n)-k*A.mb : A.mb; tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); if (m == k) { ss_cond_wait(k, n, k-1); CORE_zgessm( tempkm, tempnn, tempkmin, ib, IPIV(k, k), A(k, k), ldak, B(k, n), ldbk); ss_cond_set(k, n, k); } else { ss_cond_wait(m, n, k-1); CORE_zssssm( A.nb, tempnn, tempmm, tempnn, tempkn, ib, B(k, n), ldbk, B(m, n), ldbm, L(m, k), L.mb, A(m, k), ldam, IPIV(m, k)); ss_cond_set(m, n, k); } n = next_n; m = next_m; k = next_k; } ss_finalize(); }