/***************************************************************************//** * Parallel tile QR factorization - static scheduling **/ void plasma_pdgeqrf(plasma_context_t *plasma) { PLASMA_desc A; PLASMA_desc T; PLASMA_sequence *sequence; PLASMA_request *request; int k, m, n; int next_k; int next_m; int next_n; int ldak, ldam; int tempkm, tempkn, tempnn, tempmm; int ib = PLASMA_IB; double *work, *tau; plasma_unpack_args_4(A, T, sequence, request); if (sequence->status != PLASMA_SUCCESS) return; work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp); tau = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp); ss_init(A.mt, A.nt, -1); k = 0; n = PLASMA_RANK; while (n >= A.nt) { k++; n = n-A.nt+k; } m = k; while (k < min(A.mt, A.nt) && n < A.nt) { next_n = n; next_m = m; next_k = k; next_m++; if (next_m == A.mt) { next_n += PLASMA_SIZE; while (next_n >= A.nt && next_k < min(A.mt, A.nt)) { next_k++; next_n = next_n-A.nt+next_k; } next_m = next_k; } tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldak = BLKLDD(A, k); ldam = BLKLDD(A, m); if (n == k) { if (m == k) { ss_cond_wait(k, k, k-1); CORE_dgeqrt( tempkm, tempkn, ib, A(k, k), ldak, T(k, k), T.mb, tau, work); ss_cond_set(k, k, k); } else { ss_cond_wait(m, k, k-1); CORE_dtsqrt( tempmm, tempkn, ib, A(k, k), ldak, A(m, k), ldam, T(m, k), T.mb, tau, work); ss_cond_set(m, k, k); } } else { if (m == k) { ss_cond_wait(k, k, k); ss_cond_wait(k, n, k-1); CORE_dormqr( PlasmaLeft, PlasmaTrans, tempkm, tempnn, tempkm, ib, A(k, k), ldak, T(k, k), T.mb, A(k, n), ldak, work, T.nb); } else { ss_cond_wait(m, k, k); ss_cond_wait(m, n, k-1); CORE_dtsmqr( PlasmaLeft, PlasmaTrans, A.nb, tempnn, tempmm, tempnn, A.nb, ib, A(k, n), ldak, A(m, n), ldam, A(m, k), ldam, T(m, k), T.mb, work, ib); ss_cond_set(m, n, k); } } n = next_n; m = next_m; k = next_k; } plasma_private_free(plasma, work); plasma_private_free(plasma, tau); ss_finalize(); }
/***************************************************************************//** * Parallel tile QR factorization - dynamic scheduling **/ void plasma_pdgeqrf_quark(PLASMA_desc A, PLASMA_desc T, int ib) { int k, m, n; int ldak, ldam; int tempkm, tempkn, tempnn, tempmm; for (k = 0; k < min(A.mt, A.nt); k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; ldak = BLKLDD(A, k); double *dA = A(k, k); double *dT = T(k, k); #if defined(USE_OMPEXT) omp_set_task_priority(1); #endif #pragma omp task depend(inout: dA[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb]) { double tau[T.nb]; double work[ib * T.nb]; CORE_dgeqrt(tempkm, tempkn, ib, dA, ldak, dT, T.mb, &tau[0], &work[0]); } for (n = k+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; double *dA = A(k, k); double *dT = T(k, k); double *dC = A(k, n); #pragma omp task depend(in: dA[0:T.nb*T.nb], dT[0:ib*T.nb]) depend(inout:dC[0:T.nb*T.nb]) { double work[T.nb * ib]; CORE_dormqr(PlasmaLeft, PlasmaTrans, tempkm, tempnn, tempkm, ib, dA, ldak, dT, T.mb, dC, ldak, &work[0], T.nb); } } for (m = k+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); double *dA = A(k, k); double *dB = A(m, k); double *dT = T(m, k); #pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb]) { double tau[T.nb]; double work[ib * T.nb]; CORE_dtsqrt(tempmm, tempkn, ib, dA, ldak, dB, ldam, dT, T.mb, &tau[0], &work[0]); } for (n = k+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; double *dA = A(k, n); double *dB = A(m, n); double *dV = A(m, k); double *dT = T(m, k); #pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(in:dV[0:T.nb*T.nb], dT[0:ib*T.nb]) { double work[ib * T.nb]; CORE_dtsmqr(PlasmaLeft, PlasmaTrans, A.mb, tempnn, tempmm, tempnn, A.nb, ib, dA, ldak, dB, ldam, dV, ldam, dT, T.mb, &work[0], ib); } } } } }
/***************************************************************************//** * Parallel application of Q using tile V - QR factorization - dynamic scheduling **/ void plasma_pdormqr_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int ib) { int k, m, n; int ldak, ldbk, ldam, ldan, ldbm; int tempkm, tempnn, tempkmin, tempmm, tempkn; int minMT, minM; if (A.m > A.n) { minM = A.n; minMT = A.nt; } else { minM = A.m; minMT = A.mt; } double *work = (double *)alloca(sizeof(double) * T.nb * ib); /* * PlasmaLeft / PlasmaTrans */ if (side == PlasmaLeft ) { if (trans == PlasmaTrans) { for (k = 0; k < minMT; k++) { tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; double *dA = A(k, k); double *dT = T(k, k); double *dB = B(k, n); { CORE_dormqr(side, trans, tempkm, tempnn, tempkmin, ib, dA, ldak, dT, T.mb, dB, ldbk, work, T.nb); } } for (m = k+1; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; double *dA = B(k, n); double *dB = B(m, n); double *dV = A(m, k); double *dT = T(m, k); { CORE_dtsmqr(side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, dA, ldbk, dB, ldbm, dV, ldam, dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb); } } } } } /* * PlasmaLeft / PlasmaNoTrans */ else { for (k = minMT-1; k >= 0; k--) { tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (m = B.mt-1; m > k; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; double *dA = B(k, n); double *dB = B(m, n); double *dV = A(m, k); double *dT = T(m, k); { CORE_dtsmqr(side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, dA, ldbk, dB, ldbm, dV, ldam, dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb); } } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; double *dA = A(k, k); double *dT = T(k, k); double *dB = B(k, n); { CORE_dormqr(side, trans, tempkm, tempnn, tempkmin, ib, dA, ldak, dT, T.mb, dB, ldbk, work, T.nb); } } } } } /* * PlasmaRight / PlasmaTrans */ else { if (trans == PlasmaTrans) { for (k = minMT-1; k >= 0; k--) { tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (n = B.nt-1; n > k; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); double *dA = B(m, k); double *dB = B(m, n); double *dV = A(n, k); double *dT = T(n, k); { CORE_dtsmqr(side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, dA, ldbm, dB, ldbm, dV, ldan, dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb); } } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); double *dA = A(k, k); double *dT = T(k, k); double *dB = B(m, k); { CORE_dormqr(side, trans, tempmm, tempkn, tempkmin, ib, dA, ldak, dT, T.mb, dB, ldbm, work, T.nb); } } } } /* * PlasmaRight / PlasmaNoTrans */ else { for (k = 0; k < minMT; k++) { tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); double *dA = A(k, k); double *dT = T(k, k); double *dB = B(m, k); { CORE_dormqr(side, trans, tempmm, tempkn, tempkmin, ib, dA, ldak, dT, T.mb, dB, ldbm, work, T.nb); } } for (n = k+1; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); double *dA = B(m, k); double *dB = B(m, n); double *dV = A(n, k); double *dT = T(n, k); { CORE_dtsmqr(side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, dA, ldbm, dB, ldbm, dV, ldan, dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb); } } } } } } }