/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
 *
 *  Visits the diagonal steps of A from the last one back to the first. For each
 *  step the routine inserts, in this order:
 *    - one CORE_ztsmlq task per tile pair (Q(row, step), Q(row, col)), for the
 *      tile columns col = Q.nt-1 down to step+1;
 *    - one CORE_zunmlq task per tile Q(row, step).
 *  All tasks are submitted with PlasmaRight / PlasmaNoTrans.
 *
 *  @param A         Descriptor whose tiles A(step, col) are passed to the kernels.
 *  @param Q         Descriptor whose tiles are passed to every inserted task.
 *  @param T         Descriptor of the T tiles passed alongside the A tiles.
 *  @param sequence  Nothing is submitted unless sequence->status is
 *                   PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param request   Not referenced by this routine.
 **/
void plasma_pzunglq_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int step, row, col;
    int last_step;
    int inner_blk;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    inner_blk = PLASMA_IB;
    last_step = min(A.mt, A.nt) - 1;

    for (step = last_step; step >= 0; step--) {
        /* Extents of the diagonal tile of A; the last tile row/column may be partial. */
        const int a_rows_k = (step == A.mt-1) ? A.m - step*A.mb : A.mb;
        const int a_cols_k = (step == A.nt-1) ? A.n - step*A.nb : A.nb;
        const int refl_k   = min( a_cols_k, a_rows_k );
        const int q_cols_k = (step == Q.nt-1) ? Q.n - step*Q.nb : Q.nb;
        const int ld_a     = BLKLDD(A, step);

        /* Couple tile column `step` of Q with each tile column to its right, rightmost first. */
        for (col = Q.nt-1; col > step; col--) {
            const int q_cols_n = (col == Q.nt-1) ? Q.n - col*Q.nb : Q.nb;

            for (row = 0; row < Q.mt; row++) {
                const int q_rows_m = (row == Q.mt-1) ? Q.m - row*Q.mb : Q.mb;
                const int ld_q     = BLKLDD(Q, row);

                QUARK_CORE_ztsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaNoTrans,
                    q_rows_m, Q.nb, q_rows_m, q_cols_n, a_rows_k, inner_blk, T.nb,
                    Q(row, step), ld_q,
                    Q(row, col), ld_q,
                    A(step, col), ld_a,
                    T(step, col), T.mb);
            }
        }

        /* Finish the step with the diagonal tile of A applied to tile column `step` of Q. */
        for (row = 0; row < Q.mt; row++) {
            const int q_rows_m = (row == Q.mt-1) ? Q.m - row*Q.mb : Q.mb;
            const int ld_q     = BLKLDD(Q, row);

            QUARK_CORE_zunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaNoTrans,
                q_rows_m, q_cols_k, refl_k, inner_blk, T.nb,
                A(step, step), ld_a,
                T(step, step), T.mb,
                Q(row, step), ld_q);
        }
    }
}
/***************************************************************************//** * Parallel tile BAND Tridiagonal Reduction - dynamic scheduler **/ void plasma_pzherbt_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n, i, j; int ldak, ldam, ldan, ldaj, ldai; int tempkn, tempmm, tempnn, tempjj; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; if (uplo == PlasmaLower) { for (k = 0; k < A.nt-1; k++){ tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb; ldak = BLKLDD(A, k+1); QUARK_CORE_zgeqrt( plasma->quark, &task_flags, tempkn, A.nb, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb); /* LEFT and RIGHT on the symmetric diagonal block */ QUARK_CORE_zherfb( plasma->quark, &task_flags, PlasmaLower, tempkn, tempkn, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb, A(k+1, k+1), ldak); /* RIGHT on the remaining tiles until the bottom */ for (m = k+2; m < A.mt ; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_zunmqr( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, A.nb, tempkn, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb, A(m , k+1), ldam); } for (m = k+2; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_ztsqrt( plasma->quark, &task_flags, tempmm, A.nb, ib, T.nb, A(k+1, k), ldak, A(m , k), ldam, T(m , k), T.mb); /* LEFT */ for (i = k+2; i < m; i++) { ldai = BLKLDD(A, i); QUARK_CORE_ztsmqr_hetra1( plasma->quark, &task_flags, PlasmaLeft, PlasmaConjTrans, A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb, A(i, k+1), ldai, A(m, i), ldam, A(m, k), ldam, T(m, k), T.mb); } /* RIGHT */ for (j = m+1; j < A.mt ; j++) { tempjj = j == A.mt-1 ? 
A.m-j*A.mb : A.mb; ldaj = BLKLDD(A, j); QUARK_CORE_ztsmqr( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb, A(j, k+1), ldaj, A(j, m), ldaj, A(m, k), ldam, T(m, k), T.mb); } /* LEFT->RIGHT */ QUARK_CORE_ztsmqr_corner( plasma->quark, &task_flags, A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb, A(k+1, k+1), ldak, A(m , k+1), ldam, A(m , m), ldam, A(m , k), ldam, T(m , k), T.mb); } } } else { for (k = 0; k < A.nt-1; k++){ tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb; ldak = BLKLDD(A, k+1); QUARK_CORE_zgelqt( plasma->quark, &task_flags, A.nb, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb); /* RIGHT and LEFT on the symmetric diagonal block */ QUARK_CORE_zherfb( plasma->quark, &task_flags, PlasmaUpper, tempkn, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb, A(k+1, k+1), ldak); /* LEFT on the remaining tiles until the left side */ for (n = k+2; n < A.nt ; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, PlasmaLeft, PlasmaNoTrans, A.nb, tempnn, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb, A(k+1, n), ldak); } for (n = k+2; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); QUARK_CORE_ztslqt( plasma->quark, &task_flags, A.nb, tempnn, ib, T.nb, A(k, k+1), A.nb, A(k, n), A.nb, T(k, n), T.mb); /* RIGHT */ for (i = k+2; i < n; i++) { ldai = BLKLDD(A, i); QUARK_CORE_ztsmlq_hetra1( plasma->quark, &task_flags, PlasmaRight, PlasmaConjTrans, A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb, A(k+1, i), ldak, A(i, n), ldai, A(k, n), A.nb, T(k, n), T.mb); } /* LEFT */ for (j = n+1; j < A.nt ; j++) { tempjj = j == A.nt-1 ? 
A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, PlasmaLeft, PlasmaNoTrans, A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb, A(k+1, j), ldak, A(n, j), ldan, A(k, n), A.nb, T(k, n), T.mb); } /* RIGHT->LEFT */ QUARK_CORE_ztsmlq_corner( plasma->quark, &task_flags, A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb, A(k+1, k+1), ldak, A(k+1, n), ldak, A(n , n), ldan, A(k , n), A.nb, T(k , n), T.mb); } } } }
/***************************************************************************//** * Parallel application of Q using tile V - LQ factorization (reduction * Householder) - dynamic scheduling **/ void plasma_pzunmlqrh_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldaN, ldak; int ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); if (side == PlasmaLeft ) { if (trans == PlasmaNoTrans) { /* * PlasmaLeft / PlasmaNoTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } for (m = N+1; m < min(N+BS, A.nt); m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } else { /* * PlasmaLeft / PlasmaConjTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (m = min(N+BS, A.nt)-1; m > N; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } } } } } else { if (trans == PlasmaNoTrans) { /* * PlasmaRight / PlasmaNoTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); for (n = min(N+BS, A.nt)-1; n > N; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(m, N), ldbm); } } } } else { /* * PlasmaRight / PlasmaConjTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldaN, T(k, N), T.mb, B(m, N), ldbm); } for (n = N+1; n < min(N+BS, A.nt); n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } } }
/***************************************************************************//** * Parallel application of Q using tile V - LQ factorization - dynamic scheduling **/ void plasma_pzunmlq_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int ldak, ldan, ldbk, ldbm; int tempmm, tempnn, tempkn, tempkm, tempkmin; int ib, minMT, minM; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; if (A.m > A.n) { minM = A.n; minMT = A.nt; } else { minM = A.m; minMT = A.mt; } if (side == PlasmaLeft ) { if (trans == PlasmaNoTrans) { /* * PlasmaLeft / PlasmaNoTrans */ for (k = 0; k < minMT; k++) { tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempkm, tempnn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(k, n), ldbk); } for (m = k+1; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb, B(k, n), ldbk, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } } } else { /* * PlasmaLeft / PlasmaConjTrans */ for (k = minMT-1; k >= 0; k--) { tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (m = B.mt-1; m > k; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb, B(k, n), ldbk, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempkm, tempnn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(k, n), ldbk); } } } } else { if (trans == PlasmaNoTrans) { /* * PlasmaRight / PlasmaNoTrans */ for (k = minMT-1; k >= 0; k--) { tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); for (n = B.nt-1; n > k; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb, B(m, k), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempkn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(m, k), ldbm); } } } else { /* * PlasmaRight / PlasmaConjTrans */ for (k = 0; k < minMT; k++) { tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempkn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(m, k), ldbm); } for (n = k+1; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb, B(m, k), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } } } } }
/***************************************************************************//** * Parallel construction of Q using tile V (application to identity; * reduction Householder) - dynamic scheduling **/ void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldak; int ldqm; int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m ); QUARK_CORE_zttmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempNRDn, tempkm, ib, T.nb, Q (m, N ), ldqm, Q (m, N+RD), ldqm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm, tempNn); for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ n >= N+1; n--) { tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? 
Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempnn, tempkm, ib, T.nb, Q(m, N), ldqm, Q(m, n), ldqm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, Q(m, N), ldqm); } } } }
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 *
 *  For every step k < min(A.mt, A.nt) the tile columns k..A.nt-1 are cut into
 *  domains whose head tiles are BS tile columns apart. Per step the routine
 *  inserts:
 *    1. per domain: CORE_zgelqt on the head tile, CORE_zunmlq on the tiles
 *       below it, then for every other tile of the domain CORE_ztslqt followed
 *       by CORE_ztsmlq on the tiles below;
 *    2. for distances BS, 2*BS, 4*BS, ...: CORE_zttlqt coupling two domain
 *       heads, followed by CORE_zttmlq on the tiles below.
 *  A rightmost domain consisting of a single tile column is never a head of
 *  its own: it is appended to the preceding domain in phase 1 and excluded
 *  from phase 2.
 *
 *  @param A         Descriptor of the tiles being factorized.
 *  @param T         Descriptor of the T / T2 tiles passed alongside the A tiles.
 *  @param BS        Distance, in tile columns, between consecutive domain heads.
 *  @param sequence  Nothing is submitted unless sequence->status is
 *                   PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param request   Not referenced by this routine.
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int step, row, col;
    int nsteps, head, dist;
    int inner_blk;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    inner_blk = PLASMA_IB;
    nsteps = min(A.mt, A.nt);

    for (step = 0; step < nsteps; step++) {
        const int rows_k = (step == A.mt-1) ? A.m - step*A.mb : A.mb;
        const int ld_k   = BLKLDD(A, step);

        /* No rightmost single-column subdomain as a head, except when it is the step tile itself. */
        for (head = step; head < A.nt-1 || head == step; head += BS) {
            const int head_cols = (head == A.nt-1) ? A.n - head*A.nb : A.nb;

            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                rows_k, head_cols, inner_blk, T.nb,
                A(step, head), ld_k,
                T(step, head), T.mb);

            for (row = step + 1; row < A.mt; row++) {
                const int rows_m = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
                const int ld_m   = BLKLDD(A, row);

                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    rows_m, head_cols, head_cols, inner_blk, T.nb,
                    A(step, head), ld_k,
                    T(step, head), T.mb,
                    A(row, head), ld_m);
            }

            /* Suck in rightmost single-column domain */
            for (col = head + 1;
                 (col < head + BS && col < A.nt) || col == A.nt-1;
                 col++) {
                const int cols_n = (col == A.nt-1) ? A.n - col*A.nb : A.nb;

                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    rows_k, cols_n, inner_blk, T.nb,
                    A(step, head), ld_k,
                    A(step, col), ld_k,
                    T(step, col), T.mb);

                for (row = step + 1; row < A.mt; row++) {
                    const int rows_m = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
                    const int ld_m   = BLKLDD(A, row);

                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        rows_m, A.nb, rows_m, cols_n, A.mb, inner_blk, T.nb,
                        A(row, head), ld_m,
                        A(row, col), ld_m,
                        A(step, col), ld_k,
                        T(step, col), T.mb);
                }
            }
        }

        for (dist = BS; dist < A.nt - step; dist *= 2) {
            /* No reduction with rightmost single-column subdomain */
            for (head = step; head + dist < A.nt-1; head += 2*dist) {
                const int far_cols = (head + dist == A.nt-1) ? A.n - (head + dist)*A.nb : A.nb;

                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    rows_k, far_cols, inner_blk, T.nb,
                    A (step, head), ld_k,
                    A (step, head + dist), ld_k,
                    T2(step, head + dist), T.mb);

                for (row = step + 1; row < A.mt; row++) {
                    const int rows_m = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
                    const int ld_m   = BLKLDD(A, row);

                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        rows_m, A.nb, rows_m, far_cols, A.mb, inner_blk, T.nb,
                        A (row, head), ld_m,
                        A (row, head + dist), ld_m,
                        A (step, head + dist), ld_k,
                        T2(step, head + dist), T.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile LQ factorization - dynamic scheduling
 *
 *  For every step k < min(A.mt, A.nt) the routine inserts, in this order:
 *    - CORE_zgelqt on the diagonal tile A(k, k);
 *    - CORE_zunmlq on every tile A(row, k) below it;
 *    - for each tile column col > k: CORE_ztslqt on the pair
 *      (A(k, k), A(k, col)), then CORE_ztsmlq on every tile pair
 *      (A(row, k), A(row, col)) below.
 *
 *  @param A         Descriptor of the tiles being factorized.
 *  @param T         Descriptor of the T tiles passed alongside the A tiles.
 *  @param sequence  Nothing is submitted unless sequence->status is
 *                   PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param request   Not referenced by this routine.
 **/
void plasma_pzgelqf_quark(PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int step, row, col;
    int inner_blk;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    inner_blk = PLASMA_IB;

    for (step = 0; step < min(A.mt, A.nt); step++) {
        /* Extents of the diagonal tile; the last tile row/column may be partial. */
        const int rows_k = (step == A.mt-1) ? A.m - step*A.mb : A.mb;
        const int cols_k = (step == A.nt-1) ? A.n - step*A.nb : A.nb;
        const int ld_k   = BLKLDD(A, step);

        QUARK_CORE_zgelqt(
            plasma->quark, &task_flags,
            rows_k, cols_k, inner_blk, T.nb,
            A(step, step), ld_k,
            T(step, step), T.mb);

        for (row = step + 1; row < A.mt; row++) {
            const int rows_m = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
            const int ld_m   = BLKLDD(A, row);

            QUARK_CORE_zunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaConjTrans,
                rows_m, cols_k, cols_k, inner_blk, T.nb,
                A(step, step), ld_k,
                T(step, step), T.mb,
                A(row, step), ld_m);
        }

        for (col = step + 1; col < A.nt; col++) {
            const int cols_n = (col == A.nt-1) ? A.n - col*A.nb : A.nb;

            QUARK_CORE_ztslqt(
                plasma->quark, &task_flags,
                rows_k, cols_n, inner_blk, T.nb,
                A(step, step), ld_k,
                A(step, col), ld_k,
                T(step, col), T.mb);

            for (row = step + 1; row < A.mt; row++) {
                const int rows_m = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
                const int ld_m   = BLKLDD(A, row);

                QUARK_CORE_ztsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    rows_m, A.nb, rows_m, cols_n, A.mb, inner_blk, T.nb,
                    A(row, step), ld_m,
                    A(row, col), ld_m,
                    A(step, col), ld_k,
                    T(step, col), T.mb);
            }
        }
    }
}