/***************************************************************************//** * Parallel tile matrix-matrix multiplication - dynamic scheduling **/ void plasma_pzgemm_quark(PLASMA_enum transA, PLASMA_enum transB, PLASMA_Complex64_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex64_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int m, n, k; int ldam, ldak, ldbn, ldbk, ldcm; int tempmm, tempnn, tempkn, tempkm; PLASMA_Complex64_t zbeta; PLASMA_Complex64_t zone = (PLASMA_Complex64_t)1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (m = 0; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldcm = BLKLDD(C, m); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; /* * A: PlasmaNoTrans / B: PlasmaNoTrans */ if (transA == PlasmaNoTrans) { ldam = BLKLDD(A, m); if (transB == PlasmaNoTrans) { for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* lda * Z */ B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } /* * A: PlasmaNoTrans / B: Plasma[Conj]Trans */ else { ldbn = BLKLDD(B, n); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* lda * Z */ B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } } /* * A: Plasma[Conj]Trans / B: PlasmaNoTrans */ else { if (transB == PlasmaNoTrans) { for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * X */ B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } /* * A: Plasma[Conj]Trans / B: Plasma[Conj]Trans */ else { ldbn = BLKLDD(B, n); for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * X */ B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } } } } }
/***************************************************************************//** * Parallel tile triangular matrix inverse - dynamic scheduling **/ void plasma_pztrtri_quark(PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int ldam, ldan; int tempkn, tempmm, tempnn; PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0; PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); /* * PlasmaLower */ if (uplo == PlasmaLower) { for (n = 0; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); for (m = n+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaRight, uplo, PlasmaNoTrans, diag, tempmm, tempnn, A.mb, mzone, A(n, n), ldan, A(m, n), ldam); } for (m = n+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); for (k = 0; k < n; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; QUARK_CORE_zgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempkn, tempnn, A.mb, zone, A(m, n), ldam, A(n, k), ldan, zone, A(m, k), ldam); } } for (m = 0; m < n; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaLeft, uplo, PlasmaNoTrans, diag, tempnn, tempmm, A.mb, zone, A(n, n), ldan, A(n, m), ldan); } QUARK_CORE_ztrtri( plasma->quark, &task_flags, uplo, diag, tempnn, A.mb, A(n, n), ldan, sequence, request, A.nb*n); } } /* * PlasmaUpper */ else { for (m = 0; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); for (n = m+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? 
A.n-n*A.nb : A.nb; QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaLeft, uplo, PlasmaNoTrans, diag, tempmm, tempnn, A.mb, mzone, A(m, m), ldam, A(m, n), ldam); } for (n = 0; n < m; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); for (k = m+1; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; QUARK_CORE_zgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempnn, tempkn, tempmm, A.mb, zone, A(n, m), ldan, A(m, k), ldam, zone, A(n, k), ldan); } QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaRight, uplo, PlasmaNoTrans, diag, tempnn, tempmm, A.mb, zone, A(m, m), ldam, A(n, m), ldan); } QUARK_CORE_ztrtri( plasma->quark, &task_flags, uplo, diag, tempmm, A.mb, A(m, m), ldam, sequence, request, A.mb*m); } } }
/***************************************************************************//** * Parallel tile Hermitian rank-k update - dynamic scheduling **/ void plasma_pzherk_quark(PLASMA_enum uplo, PLASMA_enum trans, double alpha, PLASMA_desc A, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int m, n, k; int ldak, ldam, ldan, ldcm, ldcn; int tempnn, tempmm, tempkn, tempkm; PLASMA_Complex64_t zone = (PLASMA_Complex64_t)1.0; PLASMA_Complex64_t zalpha = (PLASMA_Complex64_t)alpha; PLASMA_Complex64_t zbeta; double dbeta; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; ldan = BLKLDD(A, n); ldcn = BLKLDD(C, n); /* * PlasmaNoTrans */ if (trans == PlasmaNoTrans) { for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; dbeta = k == 0 ? beta : 1.0; QUARK_CORE_zherk( plasma->quark, &task_flags, uplo, trans, tempnn, tempkn, A.mb, alpha, A(n, k), ldan, /* ldan * K */ dbeta, C(n, n), ldcn); /* ldc * N */ } /* * PlasmaNoTrans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); ldcm = BLKLDD(C, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, trans, PlasmaConjTrans, tempmm, tempnn, tempkn, A.mb, zalpha, A(m, k), ldam, /* ldam * K */ A(n, k), ldan, /* ldan * K */ zbeta, C(m, n), ldcm); /* ldc * N */ } } } /* * PlasmaNoTrans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? 
(PLASMA_Complex64_t)beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, trans, PlasmaConjTrans, tempnn, tempmm, tempkn, A.mb, zalpha, A(n, k), ldan, /* ldan * K */ A(m, k), ldam, /* ldam * M */ zbeta, C(n, m), ldcn); /* ldc * M */ } } } } /* * Plasma[Conj]Trans */ else { for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); dbeta = k == 0 ? beta : 1.0; QUARK_CORE_zherk( plasma->quark, &task_flags, uplo, trans, tempnn, tempkm, A.mb, alpha, A(k, n), ldak, /* lda * N */ dbeta, C(n, n), ldcn); /* ldc * N */ } /* * Plasma[Conj]Trans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldcm = BLKLDD(C, m); for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, zalpha, A(k, m), ldak, /* lda * M */ A(k, n), ldak, /* lda * N */ zbeta, C(m, n), ldcm); /* ldc * N */ } } } /* * Plasma[Conj]Trans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempnn, tempmm, tempkm, A.mb, zalpha, A(k, n), ldak, /* lda * K */ A(k, m), ldak, /* lda * M */ zbeta, C(n, m), ldcn); /* ldc * M */ } } } } } }
/***************************************************************************//**
 *  Parallel tile LU factorization with no pivoting - dynamic scheduling
 *
 *  For each diagonal step k < min(A.mt, A.nt) the routine submits, in order:
 *    1. QUARK_CORE_zgetrf_nopiv on the diagonal tile A(k, k),
 *    2. a ztrsm (PlasmaRight, PlasmaUpper, PlasmaNonUnit) on every tile
 *       A(m, k) with m > k,
 *    3. for every tile column n > k: a ztrsm (PlasmaLeft, PlasmaLower,
 *       PlasmaUnit) on A(k, n), followed by a zgemm with alpha = -1 and
 *       beta = 1 updating each A(m, n) with m > k.
 *
 *  A         tile descriptor of the matrix, updated in place by the tasks.
 *  sequence  nothing is submitted when its status is not PLASMA_SUCCESS;
 *            otherwise every task is flagged with its quark_sequence.
 *  request   forwarded, with sequence, to QUARK_CORE_zgetrf_nopiv.
 **/
void plasma_pzgetrf_nopiv_quark(PLASMA_desc A,
                                PLASMA_sequence *sequence,
                                PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    const PLASMA_Complex64_t zpos = (PLASMA_Complex64_t) 1.0;
    const PLASMA_Complex64_t zneg = (PLASMA_Complex64_t)-1.0;
    int k, m, n, ib;
    int diag_steps;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;

    /* A is a by-value descriptor that is never modified here, so the number
     * of diagonal steps can be computed once. */
    diag_steps = min(A.mt, A.nt);

    for (k = 0; k < diag_steps; k++) {
        const int k_rows = (k == A.mt - 1) ? (A.m - k * A.mb) : A.mb;
        const int k_cols = (k == A.nt - 1) ? (A.n - k * A.nb) : A.nb;
        const int ld_k   = BLKLDD(A, k);

        /* Diagonal tile. */
        QUARK_CORE_zgetrf_nopiv(
            plasma->quark, &task_flags,
            k_rows, k_cols, ib, A.mb,
            A(k, k), ld_k,
            sequence, request, A.mb * k);

        /* Tiles below the diagonal tile in tile column k. */
        for (m = k + 1; m < A.mt; m++) {
            const int m_rows = (m == A.mt - 1) ? (A.m - m * A.mb) : A.mb;
            const int ld_m   = BLKLDD(A, m);

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit,
                m_rows, k_cols, A.mb,
                zpos, A(k, k), ld_k,
                      A(m, k), ld_m);
        }

        /* Tile columns to the right of the diagonal tile. */
        for (n = k + 1; n < A.nt; n++) {
            const int n_cols = (n == A.nt - 1) ? (A.n - n * A.nb) : A.nb;

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaLower, PlasmaNoTrans, PlasmaUnit,
                k_rows, n_cols, A.mb,
                zpos, A(k, k), ld_k,
                      A(k, n), ld_k);

            for (m = k + 1; m < A.mt; m++) {
                const int m_rows = (m == A.mt - 1) ? (A.m - m * A.mb) : A.mb;
                const int ld_m   = BLKLDD(A, m);

                /* The inner dimension is passed as a full A.mb. */
                QUARK_CORE_zgemm(
                    plasma->quark, &task_flags,
                    PlasmaNoTrans, PlasmaNoTrans,
                    m_rows, n_cols, A.mb, A.mb,
                    zneg, A(m, k), ld_m,
                          A(k, n), ld_k,
                    zpos, A(m, n), ld_m);
            }
        }
    }
}