/***************************************************************************//** * Parallel construction of Q using tile V (application to identity; * reduction Householder) - dynamic scheduling **/ void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldak; int ldqm; int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m ); QUARK_CORE_zttmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempNRDn, tempkm, ib, T.nb, Q (m, N ), ldqm, Q (m, N+RD), ldqm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm, tempNn); for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ n >= N+1; n--) { tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? 
Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempnn, tempkm, ib, T.nb, Q(m, N), ldqm, Q(m, n), ldqm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, Q(m, N), ldqm); } } } }
/***************************************************************************//** * Parallel application of Q using tile V - LQ factorization (reduction * Householder) - dynamic scheduling **/ void plasma_pzunmlqrh_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldaN, ldak; int ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); if (side == PlasmaLeft ) { if (trans == PlasmaNoTrans) { /* * PlasmaLeft / PlasmaNoTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } for (m = N+1; m < min(N+BS, A.nt); m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } else { /* * PlasmaLeft / PlasmaConjTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (m = min(N+BS, A.nt)-1; m > N; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } } } } } else { if (trans == PlasmaNoTrans) { /* * PlasmaRight / PlasmaNoTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); for (n = min(N+BS, A.nt)-1; n > N; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(m, N), ldbm); } } } } else { /* * PlasmaRight / PlasmaConjTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldaN, T(k, N), T.mb, B(m, N), ldbm); } for (n = N+1; n < min(N+BS, A.nt); n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } } }
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 *
 *  Inserts QUARK tasks only. Tile rows k of A are visited from 0 up to
 *  min(A.mt, A.nt)-1. For each k:
 *    1. for each domain head column (k, k+BS, ...): a zgelqt task on the head
 *       tile and zunmlq tasks on the tile rows below k; then, for every other
 *       column of the domain, a ztslqt task followed by ztsmlq tasks on the
 *       tile rows below k;
 *    2. for reduction distances BS, 2*BS, 4*BS, ... below A.nt-k: a zttlqt
 *       task per pair of domain heads followed by zttmlq tasks on the tile
 *       rows below k.
 *  The rightmost tile column never heads a domain of its own unless it is
 *  column k; it is attached to the preceding domain instead and therefore
 *  takes no part in the reduction stage.
 *
 *  A        - descriptor of the tiles handed to the kernels
 *  T        - descriptor providing the T(k, *) and T2(k, *) tiles
 *  BS       - step between domain heads and smallest reduction distance
 *  sequence - tasks are tagged with sequence->quark_sequence; no task is
 *             inserted when sequence->status is not PLASMA_SUCCESS
 *  request  - not referenced by this routine
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    plasma_context_t *plasma = plasma_context_self();
    int ib;
    int K;
    int k;

    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = 0; k < K; k++) {
        int panel_rows;
        int lda_k;
        int head;
        int dist;

        if (k == A.mt-1) {
            panel_rows = A.m-k*A.mb;
        } else {
            panel_rows = A.mb;
        }
        lda_k = BLKLDD(A, k);

        /* Domain stage. No rightmost single-column subdomain (except k). */
        for (head = k; head == k || head < A.nt-1; head += BS) {
            int head_cols;
            int m;
            int n;

            if (head == A.nt-1) {
                head_cols = A.n-head*A.nb;
            } else {
                head_cols = A.nb;
            }

            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                panel_rows, head_cols, ib, T.nb,
                A(k, head), lda_k,
                T(k, head), T.mb);

            for (m = k+1; m < A.mt; m++) {
                int m_rows;
                int lda_m;

                if (m == A.mt-1) {
                    m_rows = A.m-m*A.mb;
                } else {
                    m_rows = A.mb;
                }
                lda_m = BLKLDD(A, m);

                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    m_rows, head_cols, head_cols, ib, T.nb,
                    A(k, head), lda_k,
                    T(k, head), T.mb,
                    A(m, head), lda_m);
            }

            /* Suck in rightmost single-column domain: the walk continues onto
             * column A.nt-1 even when it lies just past the BS-wide window. */
            for (n = head+1;
                 (n < head+BS && n < A.nt) || n == A.nt-1;
                 n++) {
                int n_cols;

                if (n == A.nt-1) {
                    n_cols = A.n-n*A.nb;
                } else {
                    n_cols = A.nb;
                }

                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    panel_rows, n_cols, ib, T.nb,
                    A(k, head), lda_k,
                    A(k, n), lda_k,
                    T(k, n), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    int m_rows;
                    int lda_m;

                    if (m == A.mt-1) {
                        m_rows = A.m-m*A.mb;
                    } else {
                        m_rows = A.mb;
                    }
                    lda_m = BLKLDD(A, m);

                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        m_rows, A.nb, m_rows, n_cols, A.mb, ib, T.nb,
                        A(m, head), lda_m,
                        A(m, n), lda_m,
                        A(k, n), lda_k,
                        T(k, n), T.mb);
                }
            }
        }

        /* Reduction stage, smallest distance first. */
        for (dist = BS; dist < A.nt-k; dist *= 2) {
            /* No reduction with rightmost single-column subdomain */
            for (head = k; head+dist < A.nt-1; head += 2*dist) {
                int partner_cols;
                int m;

                if (head+dist == A.nt-1) {
                    partner_cols = A.n-(head+dist)*A.nb;
                } else {
                    partner_cols = A.nb;
                }

                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    panel_rows, partner_cols, ib, T.nb,
                    A (k, head     ), lda_k,
                    A (k, head+dist), lda_k,
                    T2(k, head+dist), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    int m_rows;
                    int lda_m;

                    if (m == A.mt-1) {
                        m_rows = A.m-m*A.mb;
                    } else {
                        m_rows = A.mb;
                    }
                    lda_m = BLKLDD(A, m);

                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        m_rows, A.nb, m_rows, partner_cols, A.mb, ib, T.nb,
                        A (m, head     ), lda_m,
                        A (m, head+dist), lda_m,
                        A (k, head+dist), lda_k,
                        T2(k, head+dist), T.mb);
                }
            }
        }
    }
}