/***************************************************************************//** * Parallel tile QR factorization - dynamic scheduling **/ void plasma_pdgeqrf_quark(PLASMA_desc A, PLASMA_desc T, int ib) { int k, m, n; int ldak, ldam; int tempkm, tempkn, tempnn, tempmm; for (k = 0; k < min(A.mt, A.nt); k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; ldak = BLKLDD(A, k); double *dA = A(k, k); double *dT = T(k, k); #if defined(USE_OMPEXT) omp_set_task_priority(1); #endif #pragma omp task depend(inout: dA[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb]) { double tau[T.nb]; double work[ib * T.nb]; CORE_dgeqrt(tempkm, tempkn, ib, dA, ldak, dT, T.mb, &tau[0], &work[0]); } for (n = k+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; double *dA = A(k, k); double *dT = T(k, k); double *dC = A(k, n); #pragma omp task depend(in: dA[0:T.nb*T.nb], dT[0:ib*T.nb]) depend(inout:dC[0:T.nb*T.nb]) { double work[T.nb * ib]; CORE_dormqr(PlasmaLeft, PlasmaTrans, tempkm, tempnn, tempkm, ib, dA, ldak, dT, T.mb, dC, ldak, &work[0], T.nb); } } for (m = k+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); double *dA = A(k, k); double *dB = A(m, k); double *dT = T(m, k); #pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb]) { double tau[T.nb]; double work[ib * T.nb]; CORE_dtsqrt(tempmm, tempkn, ib, dA, ldak, dB, ldam, dT, T.mb, &tau[0], &work[0]); } for (n = k+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; double *dA = A(k, n); double *dB = A(m, n); double *dV = A(m, k); double *dT = T(m, k); #pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(in:dV[0:T.nb*T.nb], dT[0:ib*T.nb]) { double work[ib * T.nb]; CORE_dtsmqr(PlasmaLeft, PlasmaTrans, A.mb, tempnn, tempmm, tempnn, A.nb, ib, dA, ldak, dB, ldam, dV, ldam, dT, T.mb, &work[0], ib); } } } } }
/***************************************************************************//** * Parallel tile QR factorization - static scheduling **/ void plasma_pdgeqrf(plasma_context_t *plasma) { PLASMA_desc A; PLASMA_desc T; PLASMA_sequence *sequence; PLASMA_request *request; int k, m, n; int next_k; int next_m; int next_n; int ldak, ldam; int tempkm, tempkn, tempnn, tempmm; int ib = PLASMA_IB; double *work, *tau; plasma_unpack_args_4(A, T, sequence, request); if (sequence->status != PLASMA_SUCCESS) return; work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp); tau = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp); ss_init(A.mt, A.nt, -1); k = 0; n = PLASMA_RANK; while (n >= A.nt) { k++; n = n-A.nt+k; } m = k; while (k < min(A.mt, A.nt) && n < A.nt) { next_n = n; next_m = m; next_k = k; next_m++; if (next_m == A.mt) { next_n += PLASMA_SIZE; while (next_n >= A.nt && next_k < min(A.mt, A.nt)) { next_k++; next_n = next_n-A.nt+next_k; } next_m = next_k; } tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldak = BLKLDD(A, k); ldam = BLKLDD(A, m); if (n == k) { if (m == k) { ss_cond_wait(k, k, k-1); CORE_dgeqrt( tempkm, tempkn, ib, A(k, k), ldak, T(k, k), T.mb, tau, work); ss_cond_set(k, k, k); } else { ss_cond_wait(m, k, k-1); CORE_dtsqrt( tempmm, tempkn, ib, A(k, k), ldak, A(m, k), ldam, T(m, k), T.mb, tau, work); ss_cond_set(m, k, k); } } else { if (m == k) { ss_cond_wait(k, k, k); ss_cond_wait(k, n, k-1); CORE_dormqr( PlasmaLeft, PlasmaTrans, tempkm, tempnn, tempkm, ib, A(k, k), ldak, T(k, k), T.mb, A(k, n), ldak, work, T.nb); } else { ss_cond_wait(m, k, k); ss_cond_wait(m, n, k-1); CORE_dtsmqr( PlasmaLeft, PlasmaTrans, A.nb, tempnn, tempmm, tempnn, A.nb, ib, A(k, n), ldak, A(m, n), ldam, A(m, k), ldam, T(m, k), T.mb, work, ib); ss_cond_set(m, n, k); } } n = next_n; m = next_m; k = next_k; } plasma_private_free(plasma, work); plasma_private_free(plasma, tau); ss_finalize(); }