/***************************************************************************//**
 *  Parallel tile LQ factorization - static scheduling
 *
 *  Every thread starts on the tile selected by its rank and then walks the
 *  tiles (m, n) of step k along a tile row; at the end of a row it jumps
 *  PLASMA_SIZE tile rows further.  Completion of step k on tile (m, n) is
 *  published with ss_cond_set() and awaited with ss_cond_wait().
 **/
void plasma_pcgelqf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    int ib = PLASMA_IB;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Thread-private scratch buffers handed to the CORE kernels. */
    PLASMA_Complex32_t *work =
        (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    PLASMA_Complex32_t *tau =
        (PLASMA_Complex32_t*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    /* Locate the first (k, m, n) triple owned by this thread. */
    int k = 0;
    int m = PLASMA_RANK;
    while (m >= A.mt) {
        k++;
        m = m - A.mt + k;
    }
    int n = k;

    while (k < min(A.mt, A.nt) && m < A.mt) {
        /* Successor of (k, m, n): next tile column, or a later tile row. */
        int next_k = k;
        int next_m = m;
        int next_n = n + 1;
        if (next_n == A.nt) {
            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_k < min(A.nt, A.mt)) {
                next_k++;
                next_m = next_m - A.mt + next_k;
            }
            next_n = next_k;
        }

        /* Tiles in the last tile row/column may be smaller than mb x nb. */
        const int rows_k = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        const int cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
        const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;

        const int ld_k = BLKLDD(A, k);
        const int ld_m = BLKLDD(A, m);

        /*
         * NOTE: the ss_cond_wait() calls must stay directly inside this
         * while loop (no nested loop or switch around them).
         */
        if (m != k) {
            if (n != k) {
                /* Update of tile (m, n) by the reflectors of tile (k, n). */
                ss_cond_wait(k, n, k);
                ss_cond_wait(m, n, k-1);
                CORE_ctsmlq(
                    PlasmaRight, PlasmaConjTrans,
                    rows_m, A.nb, rows_m, cols_n, A.nb, ib,
                    A(m, k), ld_m,
                    A(m, n), ld_m,
                    A(k, n), ld_k,
                    T(k, n), T.mb,
                    work, T.nb);
                ss_cond_set(m, n, k);
            }
            else {
                /* Update of tile (m, k) by the reflectors of tile (k, k). */
                ss_cond_wait(k, k, k);
                ss_cond_wait(m, k, k-1);
                CORE_cunmlq(
                    PlasmaRight, PlasmaConjTrans,
                    rows_m, cols_k, cols_k, ib,
                    A(k, k), ld_k,
                    T(k, k), T.mb,
                    A(m, k), ld_m,
                    work, T.nb);
            }
        }
        else if (n != k) {
            /* Panel row: couple the diagonal tile with tile (k, n). */
            ss_cond_wait(k, n, k-1);
            CORE_ctslqt(
                rows_k, cols_n, ib,
                A(k, k), ld_k,
                A(k, n), ld_k,
                T(k, n), T.mb,
                tau, work);
            ss_cond_set(k, n, k);
        }
        else {
            /* Diagonal tile of step k. */
            ss_cond_wait(k, k, k-1);
            CORE_cgelqt(
                rows_k, cols_k, ib,
                A(k, k), ld_k,
                T(k, k), T.mb,
                tau, work);
            ss_cond_set(k, k, k);
        }

        m = next_m;
        n = next_n;
        k = next_k;
    }

    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling
 *
 *  For every step k this routine submits, in order:
 *    - sgetrf_incpiv on the diagonal tile A(k, k),
 *    - sgessm on every tile A(k, n) to the right of it,
 *    - ststrf on every tile A(m, k) below it, each followed by
 *      sssssm on the tiles A(k, n) / A(m, n) of the trailing matrix.
 **/
void plasma_psgetrf_incpiv_quark(PLASMA_desc A, PLASMA_desc L, int *IPIV,
                                 PLASMA_sequence *sequence, PLASMA_request *request)
{
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    plasma_context_t *plasma = plasma_context_self();

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    const int ib = PLASMA_IB;
    const int nsteps = min(A.mt, A.nt);

    for (int k = 0; k < nsteps; k++) {
        /* The last tile row/column may be smaller than a full tile. */
        const int rows_k = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        const int cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        const int ld_k = BLKLDD(A, k);

        QUARK_CORE_sgetrf_incpiv(
            plasma->quark, &task_flags,
            rows_k, cols_k, ib, L.nb,
            A(k, k), ld_k, IPIV(k, k),
            sequence, request,
            k == A.mt-1, A.nb*k);

        /* Tile row k, right of the diagonal. */
        for (int n = k+1; n < A.nt; n++) {
            const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;

            QUARK_CORE_sgessm(
                plasma->quark, &task_flags,
                rows_k, cols_n, rows_k, ib, L.nb,
                IPIV(k, k),
                A(k, k), ld_k,
                A(k, n), ld_k);
        }

        /* Tile column k below the diagonal, plus the trailing update. */
        for (int m = k+1; m < A.mt; m++) {
            const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
            const int ld_m = BLKLDD(A, m);

            QUARK_CORE_ststrf(
                plasma->quark, &task_flags,
                rows_m, cols_k, ib, L.nb,
                A(k, k), ld_k,
                A(m, k), ld_m,
                L(m, k), L.mb,
                IPIV(m, k),
                sequence, request,
                m == A.mt-1, A.nb*k);

            for (int n = k+1; n < A.nt; n++) {
                const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;

                QUARK_CORE_sssssm(
                    plasma->quark, &task_flags,
                    A.nb, cols_n, rows_m, cols_n, A.nb, ib, L.nb,
                    A(k, n), ld_k,
                    A(m, n), ld_m,
                    L(m, k), L.mb,
                    A(m, k), ld_m,
                    IPIV(m, k));
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - static / sequential
 *
 *  Only the thread with PLASMA_RANK == 0 does any work; all other threads
 *  return immediately.  The arguments (A, T, BS, sequence, request) are
 *  unpacked from the PLASMA context.
 *
 *  For each step k the tile row is processed in subdomains of BS tile
 *  columns (CORE_zgelqt / CORE_ztslqt plus the matching updates of the rows
 *  below), and the subdomains are then combined pairwise at distances
 *  BS, 2*BS, 4*BS, ... with CORE_zttlqt / CORE_zttmlq.
 *
 *  The private work buffers are allocated only once the sequence is known
 *  to be healthy, and are released before returning.
 **/
void plasma_pzgelqfrh(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    int BS;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;
    PLASMA_Complex64_t *work, *tau;

    if (PLASMA_RANK != 0)
        return;

    plasma_unpack_args_5(A, T, BS, sequence, request);
    /* Check the sequence BEFORE allocating, so an early exit leaks nothing. */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    ib = PLASMA_IB;
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, A.nb, A.dtyp);

    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);

        /* Factor each subdomain of BS tile columns independently. */
        for (N = k;
             N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            CORE_zgelqt(
                tempkm, tempNn, ib,
                A(k, N), ldak,
                T(k, N), T.mb,
                tau, work);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                CORE_zunmlq(
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam,
                    work , A.nb);
            }

            for (n = N+1;
                 (n < N+BS && n < A.nt) || n == A.nt-1;  /* Suck in rightmost single-column domain */
                 n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                CORE_ztslqt(
                    tempkm, tempnn, ib,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb,
                    tau, work);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    CORE_ztsmlq(
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.mb, tempmm, tempnn, A.mb, ib,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb,
                        work , A.nb);
                }
            }
        }

        /* Combine the subdomain heads pairwise at doubling distances RD. */
        for (RD = BS; RD < A.nt-k; RD *= 2) {
            for (N = k;
                 N+RD < A.nt-1;  /* No reduction with rightmost single-column subdomain */
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                CORE_zttlqt(
                    tempkm, tempNRDn, ib,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb,
                    tau, work);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    CORE_zttmlq(
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, tempkm, ib,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb,
                        work , A.nb);
                }
            }
        }
    }

    /* Release the scratch buffers obtained from plasma_private_alloc(). */
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
}
/***************************************************************************//** * Parallel construction of Q using tile V (application to identity; * reduction Householder) - dynamic scheduling **/ void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldak; int ldqm; int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m ); QUARK_CORE_zttmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempNRDn, tempkm, ib, T.nb, Q (m, N ), ldqm, Q (m, N+RD), ldqm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm, tempNn); for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ n >= N+1; n--) { tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? 
Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempnn, tempkm, ib, T.nb, Q(m, N), ldqm, Q(m, n), ldqm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, Q(m, N), ldqm); } } } }
/***************************************************************************//**
 *  Parallel tile QR factorization - dynamic scheduling
 *
 *  Every tile kernel is spawned as an OpenMP task; ordering between tasks is
 *  expressed only through the depend() clauses on the tile pointers.
 *  Scalars captured by the tasks are deliberately kept as plain (non-const)
 *  locals.
 **/
void plasma_pdgeqrf_quark(PLASMA_desc A, PLASMA_desc T, int ib)
{
    int k, m, n;
    int rows_k, cols_k, rows_m, cols_n;
    int ld_k, ld_m;

    for (k = 0; k < min(A.mt, A.nt); k++) {
        rows_k = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        ld_k = BLKLDD(A, k);

        /* Diagonal tile of this step and its T factor. */
        double *diagA = A(k, k);
        double *diagT = T(k, k);
#if defined(USE_OMPEXT)
        omp_set_task_priority(1);
#endif
#pragma omp task depend(inout: diagA[0:T.nb*T.nb]) depend(out: diagT[0:ib*T.nb])
        {
            double tau[T.nb];
            double work[ib * T.nb];
            CORE_dgeqrt(rows_k, cols_k, ib,
                        diagA, ld_k,
                        diagT, T.mb,
                        &tau[0], &work[0]);
        }

        /* Apply the diagonal reflectors to the rest of tile row k. */
        for (n = k+1; n < A.nt; n++) {
            cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
            double *rowC = A(k, n);
#pragma omp task depend(in: diagA[0:T.nb*T.nb], diagT[0:ib*T.nb]) depend(inout: rowC[0:T.nb*T.nb])
            {
                double work[T.nb * ib];
                CORE_dormqr(PlasmaLeft, PlasmaTrans,
                            rows_k, cols_n, rows_k, ib,
                            diagA, ld_k,
                            diagT, T.mb,
                            rowC, ld_k,
                            &work[0], T.nb);
            }
        }

        /* Couple every tile below the diagonal with the diagonal tile. */
        for (m = k+1; m < A.mt; m++) {
            rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
            ld_m = BLKLDD(A, m);

            double *belowA = A(m, k);
            double *belowT = T(m, k);
#pragma omp task depend(inout: diagA[0:T.nb*T.nb], belowA[0:T.nb*T.nb]) depend(out: belowT[0:ib*T.nb])
            {
                double tau[T.nb];
                double work[ib * T.nb];
                CORE_dtsqrt(rows_m, cols_k, ib,
                            diagA, ld_k,
                            belowA, ld_m,
                            belowT, T.mb,
                            &tau[0], &work[0]);
            }

            /* Trailing update of the tile pair (k, n) / (m, n). */
            for (n = k+1; n < A.nt; n++) {
                cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
                double *upper = A(k, n);
                double *lower = A(m, n);
#pragma omp task depend(inout: upper[0:T.nb*T.nb], lower[0:T.nb*T.nb]) depend(in: belowA[0:T.nb*T.nb], belowT[0:ib*T.nb])
                {
                    double work[ib * T.nb];
                    CORE_dtsmqr(PlasmaLeft, PlasmaTrans,
                                A.mb, cols_n, rows_m, cols_n, A.nb, ib,
                                upper, ld_k,
                                lower, ld_m,
                                belowA, ld_m,
                                belowT, T.mb,
                                &work[0], ib);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization - static scheduling
 *
 *  Only side == PlasmaLeft with trans == PlasmaConjTrans is implemented;
 *  any other combination fails the request with PLASMA_ERR_NOT_SUPPORTED.
 *  Each thread owns tile columns of B (selected by rank, stride PLASMA_SIZE)
 *  and walks down each of them for every step k.
 **/
void plasma_pcunmqr(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum trans;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    int ib = PLASMA_IB;

    plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    if (side != PlasmaLeft || trans != PlasmaConjTrans) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }

    PLASMA_Complex32_t *work =
        (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    ss_init(B.mt, B.nt, -1);

    /* Number of reflector blocks is bounded by the smaller dimension of A. */
    const int minM  = (A.m > A.n) ? A.n  : A.m;
    const int minMT = (A.m > A.n) ? A.nt : A.mt;

    /* Locate the first (k, m, n) triple owned by this thread. */
    int k = 0;
    int n = PLASMA_RANK;
    while (n >= B.nt) {
        k++;
        n = n - B.nt;
    }
    int m = k;

    while (k < minMT && n < B.nt) {
        /* Successor of (k, m, n): next tile row, or a later tile column. */
        int next_k = k;
        int next_n = n;
        int next_m = m + 1;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= B.nt && next_k < minMT) {
                next_k++;
                next_n = next_n - B.nt;
            }
            next_m = next_k;
        }

        const int refl_k = (k == minMT-1) ? minM - k*A.nb : A.nb;
        const int rows_k = (k == B.mt-1) ? B.m - k*B.mb : B.mb;
        const int cols_n = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
        const int rows_m = (m == B.mt-1) ? B.m - m*B.mb : B.mb;

        const int lda_k = BLKLDD(A, k);
        const int ldb_k = BLKLDD(B, k);
        const int lda_m = BLKLDD(A, m);
        const int ldb_m = BLKLDD(B, m);

        /*
         * NOTE: the ss_cond_wait() calls must stay directly inside this
         * while loop (no nested loop or switch around them).
         */
        if (m != k) {
            ss_cond_wait(m, n, k-1);
            CORE_ctsmqr(
                side, trans,
                A.mb, cols_n, rows_m, cols_n, refl_k, ib,
                B(k, n), ldb_k,
                B(m, n), ldb_m,
                A(m, k), lda_m,
                T(m, k), T.mb,
                work, ib);
            ss_cond_set(m, n, k);
        }
        else {
            ss_cond_wait(k, n, k-1);
            CORE_cunmqr(
                side, trans,
                rows_k, cols_n, refl_k, ib,
                A(k, k), lda_k,
                T(k, k), T.mb,
                B(k, n), ldb_k,
                work, T.nb);
            ss_cond_set(k, n, k);
        }

        n = next_n;
        m = next_m;
        k = next_k;
    }

    plasma_private_free(plasma, work);
    ss_finalize();
}
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling - Right looking
 *
 *  A    : tile descriptor of the matrix; its tiles are updated in place.
 *  IPIV : pivot array; the slice IPIV(k) is passed to the panel kernel of
 *         step k and later to the row-interchange kernels.
 *
 *  The routine runs two passes over the steps k:
 *    1. panel kernel (CORE_dgetrf_rectil) on the sub-matrix starting at tile
 *       (k, k), then, for every tile column n > k, CORE_dswptr_ontile
 *       followed by dgemm updates C := C - A*B of the tiles below row k;
 *    2. CORE_dlaswp_ontile on the tile columns n < k (left of the panel).
 *
 *  Each hclib_pragma_marker() call carries, as strings, an OpenMP-style
 *  "task depend(...)" annotation that refers to local variables by name.
 *  NOTE(review): the statement following a marker is presumably the task
 *  body consumed by the hclib tooling - confirm before reordering anything.
 **/
void plasma_pdgetrf_rectil_quark(PLASMA_desc A, int *IPIV)
{
    int k, m, n;
    int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
    int ldak, ldam;

    /* dgemm scalars: beta = 1 and alpha = -1, i.e. C := C - A*B. */
    double zone = (double)1.0;
    double mzone = (double)-1.0;

    /* Integer tag stored in a pointer; only used inside depend annotations. */
    void * fakedep;
    /* How many threads per panel? Probably needs to be adjusted during factorization. */
    CORE_dgetrf_rectil_init();

    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        /* First row of the panel, and number of rows from there to the bottom. */
        tempk = k * A.mb;
        tempm = A.m - tempk;
        /* Edge tiles in the last tile row/column may be smaller. */
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);

        double *dA = A(k, k);
        int *dB = IPIV(k);
        /* Panel: all rows from tempk down, tile column k. */
        PLASMA_desc pDesc = plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn);
hclib_pragma_marker("omp", "task depend(inout:dA[0:A.mb*A.nb]) depend(out:dB[0:pDesc.n])", "pragma59_omp_task");
        {
            /* NOTE(review): meaning of info[1]/info[2] is defined by
             * CORE_dgetrf_rectil, which is not visible here. */
            int info[3];
            info[1] = 0;
            info[2] = 1;

            CORE_dgetrf_rectil(
                    pDesc,
                    dB,
                    info );
        }

        /*
         * Update the trailing submatrix
         */
        fakedep = (void *)(intptr_t)(k+1);
        for (n = k+1; n < A.nt; n++)
        {
            /*
             * Apply row interchange after the panel (work on the panel)
             */
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            /* These shadow the outer dA/dB for the scope of this tile column. */
            double *dA = A(k, n);
            double *dB = A(k, k);
            int *dipiv = IPIV(k);
hclib_pragma_marker("omp", "task depend(inout:dA[0:1]) depend(in:dB[0:ldak], dipiv[0:tempkm])", "pragma82_omp_task");
            CORE_dswptr_ontile(descA, 1, tempkm, dipiv, 1, dB, ldak);

            /* First tile row below the panel head is handled separately
             * from the remaining rows (different depend annotation). */
            m = k+1;
            if ( m < A.mt ) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);

                double *dA = A(m , k);
                double *dB = A(k , n);
                double *dC = A(m , n);
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb]) depend(inout:dC[0:A.mb*A.mb])", "pragma93_omp_task");
                cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        tempmm, tempnn, A.nb,
                        mzone, dA, ldam,
                        dB, ldak,
                        zone, dC, ldam);

                for (m = k+2; m < A.mt; m++)
                {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);

                    double *dA = A(m , k);
                    double *dB = A(k , n);
                    double *dC = A(m , n);
                    /* fake1/fake2 appear only in the depend annotation below. */
                    double *fake1 = A(k+1, n);
                    double *fake2 = (double *)fakedep;
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb], fake2[0:1]) depend(inout:dC[0:A.mb*A.mb], fake1[0:A.mb*A.nb])", "pragma110_omp_task");
                    cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                            tempmm, tempnn, A.nb,
                            mzone, dA, ldam,
                            dB, ldak,
                            zone, dC, ldam);
                }
            }
        }
    }

    /* Second pass: row interchanges on the tile columns left of each panel. */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        int mintmp;
        tempk = k * A.mb;
        tempm = A.m - tempk;
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
        /* Upper bound of the pivot range handed to CORE_dlaswp_ontile. */
        mintmp = min(tempkm, tempkn);
        ldak = BLKLDD(A, k);

        /*
         * Apply row interchange behind the panel (work on the panel)
         */
        fakedep = (void*)(intptr_t)k;
        for (n = 0; n < k; n++)
        {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            /* Aij and prevSwap appear only in the depend annotation below. */
            double *Aij = A(k, n);
            double *prevSwap = A(k-1, n);
            int *dipiv = IPIV(k);
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
hclib_pragma_marker("omp", "task depend(inout:Aij[0:1],fakedep) depend(in:dipiv[0:mintmp], prevSwap[0:A.lm*A.nb])", "pragma142_omp_task");
            CORE_dlaswp_ontile(descA, 1, mintmp, dipiv, 1);
        }
    }
}
/***************************************************************************//** * Parallel tile Hermitian rank-k update - dynamic scheduling **/ void plasma_pcsyr2k_quark(PLASMA_enum uplo, PLASMA_enum trans, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int m, n, k; int ldak, ldam, ldan, ldcm, ldcn; int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0; PLASMA_Complex32_t zbeta; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; ldan = BLKLDD(A, n); ldbn = BLKLDD(B, n); ldcn = BLKLDD(C, n); /* * PlasmaNoTrans */ if (trans == PlasmaNoTrans) { for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_csyr2k( plasma->quark, &task_flags, uplo, trans, tempnn, tempkn, A.mb, alpha, A(n, k), ldan, /* ldan * K */ B(n, k), ldbn, zbeta, C(n, n), ldcn); /* ldc * N */ } /* * PlasmaNoTrans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); ldcm = BLKLDD(C, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? 
beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* ldam * K */ B(n, k), ldbn, /* ldan * K */ zbeta, C(m, n), ldcm); /* ldc * N */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldbm, /* ldam * K */ A(n, k), ldan, /* ldan * K */ zone, C(m, n), ldcm); /* ldc * N */ } } } /* * PlasmaNoTrans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempnn, tempmm, tempkn, A.mb, alpha, A(n, k), ldan, /* ldan * K */ B(m, k), ldbm, /* ldam * M */ zbeta, C(n, m), ldcn); /* ldc * M */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempnn, tempmm, tempkn, A.mb, alpha, B(n, k), ldan, /* ldan * K */ A(m, k), ldam, /* ldam * M */ zone, C(n, m), ldcn); /* ldc * M */ } } } } /* * Plasma[Conj]Trans */ else { for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_csyr2k( plasma->quark, &task_flags, uplo, trans, tempnn, tempkm, A.mb, alpha, A(k, n), ldak, /* lda * N */ B(k, n), ldbk, zbeta, C(n, n), ldcn); /* ldc * N */ } /* * Plasma[Conj]Trans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldcm = BLKLDD(C, m); for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * M */ B(k, n), ldbk, /* lda * N */ zbeta, C(m, n), ldcm); /* ldc * N */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, B(k, m), ldbk, /* lda * M */ A(k, n), ldak, /* lda * N */ zone, C(m, n), ldcm); /* ldc * N */ } } } /* * Plasma[Conj]Trans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempnn, tempmm, tempkm, A.mb, alpha, A(k, n), ldak, /* lda * K */ B(k, m), ldbk, /* lda * M */ zbeta, C(n, m), ldcn); /* ldc * M */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempnn, tempmm, tempkm, A.mb, alpha, B(k, n), ldbk, /* lda * K */ A(k, m), ldak, /* lda * M */ zone, C(n, m), ldcn); /* ldc * M */ } } } } } }
/***************************************************************************//** * Parallel tile BAND Tridiagonal Reduction - dynamic scheduler **/ void plasma_pssyrbt_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n, i, j; int ldak, ldam, ldan, ldaj, ldai; int tempkn, tempmm, tempnn, tempjj; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; if (uplo == PlasmaLower) { for (k = 0; k < A.nt-1; k++){ tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb; ldak = BLKLDD(A, k+1); QUARK_CORE_sgeqrt( plasma->quark, &task_flags, tempkn, A.nb, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb); /* LEFT and RIGHT on the symmetric diagonal block */ QUARK_CORE_ssyrfb( plasma->quark, &task_flags, PlasmaLower, tempkn, tempkn, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb, A(k+1, k+1), ldak); /* RIGHT on the remaining tiles until the bottom */ for (m = k+2; m < A.mt ; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_sormqr( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, A.nb, tempkn, ib, T.nb, A(k+1, k), ldak, T(k+1, k), T.mb, A(m , k+1), ldam); } for (m = k+2; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_stsqrt( plasma->quark, &task_flags, tempmm, A.nb, ib, T.nb, A(k+1, k), ldak, A(m , k), ldam, T(m , k), T.mb); /* LEFT */ for (i = k+2; i < m; i++) { ldai = BLKLDD(A, i); QUARK_CORE_stsmqr_sytra1( plasma->quark, &task_flags, PlasmaLeft, PlasmaTrans, A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb, A(i, k+1), ldai, A(m, i), ldam, A(m, k), ldam, T(m, k), T.mb); } /* RIGHT */ for (j = m+1; j < A.mt ; j++) { tempjj = j == A.mt-1 ? 
A.m-j*A.mb : A.mb; ldaj = BLKLDD(A, j); QUARK_CORE_stsmqr( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb, A(j, k+1), ldaj, A(j, m), ldaj, A(m, k), ldam, T(m, k), T.mb); } /* LEFT->RIGHT */ QUARK_CORE_stsmqr_corner( plasma->quark, &task_flags, A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb, A(k+1, k+1), ldak, A(m , k+1), ldam, A(m , m), ldam, A(m , k), ldam, T(m , k), T.mb); } } } else { for (k = 0; k < A.nt-1; k++){ tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb; ldak = BLKLDD(A, k+1); QUARK_CORE_sgelqt( plasma->quark, &task_flags, A.nb, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb); /* RIGHT and LEFT on the symmetric diagonal block */ QUARK_CORE_ssyrfb( plasma->quark, &task_flags, PlasmaUpper, tempkn, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb, A(k+1, k+1), ldak); /* LEFT on the remaining tiles until the left side */ for (n = k+2; n < A.nt ; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_sormlq( plasma->quark, &task_flags, PlasmaLeft, PlasmaNoTrans, A.nb, tempnn, tempkn, ib, T.nb, A(k, k+1), A.nb, T(k, k+1), T.mb, A(k+1, n), ldak); } for (n = k+2; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); QUARK_CORE_stslqt( plasma->quark, &task_flags, A.nb, tempnn, ib, T.nb, A(k, k+1), A.nb, A(k, n), A.nb, T(k, n), T.mb); /* RIGHT */ for (i = k+2; i < n; i++) { ldai = BLKLDD(A, i); QUARK_CORE_stsmlq_sytra1( plasma->quark, &task_flags, PlasmaRight, PlasmaTrans, A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb, A(k+1, i), ldak, A(i, n), ldai, A(k, n), A.nb, T(k, n), T.mb); } /* LEFT */ for (j = n+1; j < A.nt ; j++) { tempjj = j == A.nt-1 ? 
A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_stsmlq( plasma->quark, &task_flags, PlasmaLeft, PlasmaNoTrans, A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb, A(k+1, j), ldak, A(n, j), ldan, A(k, n), A.nb, T(k, n), T.mb); } /* RIGHT->LEFT */ QUARK_CORE_stsmlq_corner( plasma->quark, &task_flags, A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb, A(k+1, k+1), ldak, A(k+1, n), ldak, A(n , n), ldan, A(k , n), A.nb, T(k , n), T.mb); } } } }
/***************************************************************************//** * Parallel tile triangular matrix inverse - dynamic scheduling **/ void plasma_pztrtri_quark(PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int ldam, ldan; int tempkn, tempmm, tempnn; PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0; PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); /* * PlasmaLower */ if (uplo == PlasmaLower) { for (n = 0; n < A.nt; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); for (m = n+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaRight, uplo, PlasmaNoTrans, diag, tempmm, tempnn, A.mb, mzone, A(n, n), ldan, A(m, n), ldam); } for (m = n+1; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); for (k = 0; k < n; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; QUARK_CORE_zgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempkn, tempnn, A.mb, zone, A(m, n), ldam, A(n, k), ldan, zone, A(m, k), ldam); } } for (m = 0; m < n; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaLeft, uplo, PlasmaNoTrans, diag, tempnn, tempmm, A.mb, zone, A(n, n), ldan, A(n, m), ldan); } QUARK_CORE_ztrtri( plasma->quark, &task_flags, uplo, diag, tempnn, A.mb, A(n, n), ldan, sequence, request, A.nb*n); } } /* * PlasmaUpper */ else { for (m = 0; m < A.mt; m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); for (n = m+1; n < A.nt; n++) { tempnn = n == A.nt-1 ? 
A.n-n*A.nb : A.nb; QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaLeft, uplo, PlasmaNoTrans, diag, tempmm, tempnn, A.mb, mzone, A(m, m), ldam, A(m, n), ldam); } for (n = 0; n < m; n++) { tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldan = BLKLDD(A, n); for (k = m+1; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; QUARK_CORE_zgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempnn, tempkn, tempmm, A.mb, zone, A(n, m), ldan, A(m, k), ldam, zone, A(n, k), ldan); } QUARK_CORE_ztrsm( plasma->quark, &task_flags, PlasmaRight, uplo, PlasmaNoTrans, diag, tempnn, tempmm, A.mb, zone, A(m, m), ldam, A(n, m), ldan); } QUARK_CORE_ztrtri( plasma->quark, &task_flags, uplo, diag, tempmm, A.mb, A(m, m), ldam, sequence, request, A.mb*m); } } }
/***************************************************************************//** * Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling **/ void plasma_pdsymm_quark(PLASMA_enum side, PLASMA_enum uplo, double alpha, PLASMA_desc A, PLASMA_desc B, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int lda, ldak, ldb, ldc; int tempmm, tempnn, tempkn, tempkm; double zbeta; double zone = (double)1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (m = 0; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldc = BLKLDD(C, m); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; /* * PlasmaLeft / PlasmaLower */ if (side == PlasmaLeft) { lda = BLKLDD(A, m); if (uplo == PlasmaLower) { for (k = 0; k < C.mt; k++) { tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb; ldak = BLKLDD(A, k); ldb = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; if (k < m) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(m, k), lda, /* lda * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == m) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } /* * PlasmaLeft / PlasmaUpper */ else { for (k = 0; k < C.mt; k++) { tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb; ldak = BLKLDD(A, k); ldb = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == m) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(m, k), lda, /* lda * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } } /* * PlasmaRight / PlasmaLower */ else { lda = BLKLDD(A, n); ldb = BLKLDD(B, m); if (uplo == PlasmaLower) { for (k = 0; k < C.nt; k++) { tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb; ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; if (k < n) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(n, k), lda, /* lda * K */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == n) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * Y */ B(m, k), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(k, n), ldak, /* ldak * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } /* * PlasmaRight / PlasmaUpper */ else { for (k = 0; k < C.nt; k++) { tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb; ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(k, n), ldak, /* ldak * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == n) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * Y */ B(m, k), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(n, k), lda, /* lda * K */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } } } } }
/***************************************************************************//**
 *  Parallel tile symmetric matrix-matrix multiplication - static scheduling
 *
 *  The tiles of C are numbered column by column; a thread starts at tile
 *  number PLASMA_RANK and advances by PLASMA_SIZE.  Each owned tile C(m, n)
 *  is computed entirely by its owner, looping over the inner index k:
 *    - k on the diagonal of A  -> CORE_dsymm with the diagonal tile A(k, k);
 *    - k off the diagonal      -> CORE_dgemm with the tile of A stored in
 *      the referenced triangle, transposed when only its mirror is stored.
 *  beta is applied for k == 0 only; later products accumulate.
 *  Any uplo other than PlasmaLower is handled as the upper case, and any
 *  side other than PlasmaLeft as the right case.
 **/
void plasma_pdsymm(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum uplo;
    double alpha;
    PLASMA_desc A;
    PLASMA_desc B;
    double beta;
    PLASMA_desc C;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    double zone = (double)1.0;

    plasma_unpack_args_9(side, uplo, alpha, A, B, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    const int lower = (uplo == PlasmaLower);

    /* First tile of C owned by this thread. */
    int n = 0;
    int m = PLASMA_RANK;
    while (m >= C.mt && n < C.nt) {
        n++;
        m = m - C.mt;
    }

    while (n < C.nt) {
        /* Next owned tile: PLASMA_SIZE tiles further, wrapping over columns. */
        int next_n = n;
        int next_m = m + PLASMA_SIZE;
        while (next_m >= C.mt && next_n < C.nt) {
            next_n++;
            next_m = next_m - C.mt;
        }

        const int rows_m = (m == C.mt-1) ? C.m - m*C.mb : C.mb;
        const int cols_n = (n == C.nt-1) ? C.n - n*C.nb : C.nb;
        const int ldc_m = BLKLDD(C, m);

        if (side == PlasmaLeft) {
            /*
             *  PlasmaLeft: accumulate over the tile rows k of B
             */
            const int lda_m = BLKLDD(A, m);

            for (int k = 0; k < C.mt; k++) {
                const int depth = (k == C.mt-1) ? C.m - k*C.mb : C.mb;
                const int lda_k = BLKLDD(A, k);
                const int ldb_k = BLKLDD(B, k);
                const double zbeta = (k == 0) ? beta : zone;

                if (k == m) {
                    CORE_dsymm(
                        side, uplo,
                        rows_m, cols_n,
                        alpha, A(k, k), lda_k,
                               B(k, n), ldb_k,
                        zbeta, C(m, n), ldc_m);
                }
                else if (lower == (k < m)) {
                    /* Tile (m, k) lies in the stored triangle. */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaNoTrans,
                        rows_m, cols_n, depth,
                        alpha, A(m, k), lda_m,
                               B(k, n), ldb_k,
                        zbeta, C(m, n), ldc_m);
                }
                else {
                    /* Only the mirror tile (k, m) is stored. */
                    CORE_dgemm(
                        PlasmaTrans, PlasmaNoTrans,
                        rows_m, cols_n, depth,
                        alpha, A(k, m), lda_k,
                               B(k, n), ldb_k,
                        zbeta, C(m, n), ldc_m);
                }
            }
        }
        else {
            /*
             *  PlasmaRight: accumulate over the tile columns k of B
             */
            const int lda_n = BLKLDD(A, n);
            const int ldb_m = BLKLDD(B, m);

            for (int k = 0; k < C.nt; k++) {
                const int depth = (k == C.nt-1) ? C.n - k*C.nb : C.nb;
                const int lda_k = BLKLDD(A, k);
                const double zbeta = (k == 0) ? beta : zone;

                if (n == k) {
                    CORE_dsymm(
                        side, uplo,
                        rows_m, cols_n,
                        alpha, A(k, k), lda_k,
                               B(m, k), ldb_m,
                        zbeta, C(m, n), ldc_m);
                }
                else if (lower == (k < n)) {
                    /* Only the mirror tile (n, k) is stored. */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaTrans,
                        rows_m, cols_n, depth,
                        alpha, B(m, k), ldb_m,
                               A(n, k), lda_n,
                        zbeta, C(m, n), ldc_m);
                }
                else {
                    /* Tile (k, n) lies in the stored triangle. */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaNoTrans,
                        rows_m, cols_n, depth,
                        alpha, B(m, k), ldb_m,
                               A(k, n), lda_k,
                        zbeta, C(m, n), ldc_m);
                }
            }
        }

        m = next_m;
        n = next_n;
    }
}
/***************************************************************************//**
 *  Parallel tile LU factorization - static scheduling
 *
 *  Unpacks (A, L, IPIV, sequence, request) from the context and returns at
 *  once when the sequence status is not PLASMA_SUCCESS.  A private workspace
 *  of ib*L.nb elements is allocated for CORE_ststrf and released at the end.
 *
 *  Each rank walks (k, m, n) triples: for a given step k and tile column n it
 *  visits rows m = k .. A.mt-1, then jumps PLASMA_SIZE columns ahead, wrapping
 *  into the next step when it runs past A.nt.
 *
 *  A nonzero info reported while processing the last tile row (m == A.mt-1)
 *  of a panel is recorded with plasma_request_fail() and followed by
 *  ss_abort().
 *
 *  NOTE(review): ss_cond_wait / ss_cond_set / ss_abort are macros defined
 *  elsewhere; presumably they poll and publish per-tile progress.  Keep
 *  ss_cond_wait directly inside the scheduling loop in case its expansion
 *  contains a loop-control statement - confirm against its definition.
 **/
void plasma_psgetrf_incpiv(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int ldak, ldam;
    int info;
    int tempkn, tempkm, tempmm, tempnn;
    int ib = PLASMA_IB;
    float *work;

    plasma_unpack_args_5(A, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    work = (float*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
    ss_init(A.mt, A.nt, -1);

    /* Initial (k, m, n) for this rank */
    k = 0;
    for (n = PLASMA_RANK; n >= A.nt; n = n-A.nt+k)
        k++;
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
        tempmm = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
        tempkm = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        tempkn = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        tempnn = (n == A.nt-1) ? A.n - n*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            /* Panel column: tile (m, k) must have finished step k-1 */
            ss_cond_wait(m, k, k-1);
            if (m == k) {
                CORE_sgetrf_incpiv(
                    tempkm, tempkn, ib,
                    A(k, k), ldak, IPIV(k, k), &info);
            }
            else {
                CORE_ststrf(
                    tempmm, tempkn, ib, A.nb,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    L(m, k), L.mb,
                    IPIV(m, k),
                    work, L.nb, &info);
            }
            /* info is only acted upon for the last tile row of the panel */
            if (info != 0 && m == A.mt-1) {
                plasma_request_fail(sequence, request, info + A.nb*k);
                ss_abort();
            }
            ss_cond_set(m, k, k);
        }
        else {
            /* Trailing column: needs panel tile (m, k) at step k and
               tile (m, n) at step k-1 */
            ss_cond_wait(m, k, k);
            ss_cond_wait(m, n, k-1);
            if (m == k) {
                /* No progress value is published after this kernel */
                CORE_sgessm(
                    tempkm, tempnn, tempkm, ib,
                    IPIV(k, k),
                    A(k, k), ldak,
                    A(k, n), ldak);
            }
            else {
                CORE_sssssm(
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
                ss_cond_set(m, n, k);
            }
        }

        /* Advance to the next (k, m, n) handled by this rank */
        m++;
        if (m == A.mt) {
            n += PLASMA_SIZE;
            while (n >= A.nt && k < min(A.mt, A.nt)) {
                k++;
                n = n-A.nt+k;
            }
            m = k;
        }
    }

    plasma_private_free(plasma, work);
    ss_finalize();
}
/***************************************************************************//**
 *  Parallel forward substitution for tile LU - static scheduling
 *
 *  Unpacks (A, B, L, IPIV, sequence, request) from the context and returns
 *  immediately when the sequence status is not PLASMA_SUCCESS.
 *
 *  For each step k and each tile column n of B handled by this rank, rows
 *  m = k .. A.mt-1 are visited in order: the row m == k goes through
 *  CORE_zgessm, every row below through CORE_zssssm.  Columns are handed out
 *  PLASMA_SIZE apart; running past B.nt wraps into the next step.
 *
 *  NOTE(review): ss_cond_wait / ss_cond_set are macros defined elsewhere;
 *  keep ss_cond_wait directly inside the scheduling loop in case its
 *  expansion contains a loop-control statement - confirm.
 **/
void plasma_pztrsmpl(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int ldak, ldbk, ldam, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int ib;

    plasma_unpack_args_6(A, B, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(B.mt, B.nt, -1);

    ib = PLASMA_IB;

    /* Initial (k, m, n) for this rank */
    k = 0;
    for (n = PLASMA_RANK; n >= B.nt; n -= B.nt)
        k++;
    m = k;

    while (k < min(A.mt, A.nt) && n < B.nt) {
        tempkm   = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        tempkn   = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        tempkmin = (k == min(A.mt, A.nt)-1) ? min(A.m, A.n) - k*A.mb : A.mb;
        tempnn   = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
        tempmm   = (m == A.mt-1) ? A.m - m*A.mb : A.mb;

        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        /* Tile (m, n) of B must have completed step k-1 */
        ss_cond_wait(m, n, k-1);
        if (m == k) {
            CORE_zgessm(
                tempkm, tempnn, tempkmin, ib,
                IPIV(k, k),
                A(k, k), ldak,
                B(k, n), ldbk);
        }
        else {
            CORE_zssssm(
                A.nb, tempnn, tempmm, tempnn, tempkn, ib,
                B(k, n), ldbk,
                B(m, n), ldbm,
                L(m, k), L.mb,
                A(m, k), ldam,
                IPIV(m, k));
        }
        ss_cond_set(m, n, k);

        /* Advance to the next (k, m, n) handled by this rank */
        m++;
        if (m == A.mt) {
            n += PLASMA_SIZE;
            while (n >= B.nt && k < min(A.mt, A.nt)) {
                k++;
                n -= B.nt;
            }
            m = k;
        }
    }

    ss_finalize();
}
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
 *
 *  side, trans : select one of four traversals (see the branch comments)
 *  A           : descriptor of the tiles passed as V to the kernels
 *  B           : descriptor of the matrix updated in place
 *  T           : descriptor of the tiles passed as T to the kernels
 *  ib          : inner blocking size forwarded to the kernels
 *
 *  In this variant the CORE_dormqr / CORE_dtsmqr kernels are called directly,
 *  sharing one workspace of T.nb*ib doubles obtained from alloca().
 *
 *  The step index k runs forward for Left/Trans and Right/NoTrans and
 *  backward for the other two combinations; within a step the diagonal
 *  kernel and the tsmqr sweep are ordered accordingly.
 **/
void plasma_pdormqr_quark(PLASMA_enum side, PLASMA_enum trans,
                          PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int ib)
{
    int k, m, n;
    int ldak, ldbk, ldam, ldan, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int minMT, minM;

    /* Work on the smaller dimension of A */
    minM  = (A.m > A.n) ? A.n  : A.m;
    minMT = (A.m > A.n) ? A.nt : A.mt;

    double *work = (double *)alloca(sizeof(double) * T.nb * ib);

    if (side == PlasmaLeft && trans == PlasmaTrans) {
        /*
         *  PlasmaLeft / PlasmaTrans
         */
        for (k = 0; k < minMT; k++) {
            tempkm   = (k == B.mt-1)  ? B.m - k*B.mb  : B.mb;
            tempkmin = (k == minMT-1) ? minM - k*A.nb : A.nb;
            ldak = BLKLDD(A, k);
            ldbk = BLKLDD(B, k);
            for (n = 0; n < B.nt; n++) {
                tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                double *vkk = A(k, k);
                double *tkk = T(k, k);
                double *bkn = B(k, n);
                CORE_dormqr(side, trans,
                            tempkm, tempnn, tempkmin, ib,
                            vkk, ldak,
                            tkk, T.mb,
                            bkn, ldbk,
                            work, T.nb);
            }
            for (m = k+1; m < B.mt; m++) {
                tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                ldam = BLKLDD(A, m);
                ldbm = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                    double *bkn = B(k, n);
                    double *bmn = B(m, n);
                    double *vmk = A(m, k);
                    double *tmk = T(m, k);
                    /* side is PlasmaLeft here, so the work leading dimension is ib */
                    CORE_dtsmqr(side, trans,
                                B.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                                bkn, ldbk,
                                bmn, ldbm,
                                vmk, ldam,
                                tmk, T.mb,
                                work, ib);
                }
            }
        }
    }
    else if (side == PlasmaLeft) {
        /*
         *  PlasmaLeft / PlasmaNoTrans
         */
        for (k = minMT-1; k >= 0; k--) {
            tempkm   = (k == B.mt-1)  ? B.m - k*B.mb  : B.mb;
            tempkmin = (k == minMT-1) ? minM - k*A.nb : A.nb;
            ldak = BLKLDD(A, k);
            ldbk = BLKLDD(B, k);
            for (m = B.mt-1; m > k; m--) {
                tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                ldam = BLKLDD(A, m);
                ldbm = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                    double *bkn = B(k, n);
                    double *bmn = B(m, n);
                    double *vmk = A(m, k);
                    double *tmk = T(m, k);
                    /* side is PlasmaLeft here, so the work leading dimension is ib */
                    CORE_dtsmqr(side, trans,
                                B.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                                bkn, ldbk,
                                bmn, ldbm,
                                vmk, ldam,
                                tmk, T.mb,
                                work, ib);
                }
            }
            for (n = 0; n < B.nt; n++) {
                tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                double *vkk = A(k, k);
                double *tkk = T(k, k);
                double *bkn = B(k, n);
                CORE_dormqr(side, trans,
                            tempkm, tempnn, tempkmin, ib,
                            vkk, ldak,
                            tkk, T.mb,
                            bkn, ldbk,
                            work, T.nb);
            }
        }
    }
    else if (trans == PlasmaTrans) {
        /*
         *  PlasmaRight / PlasmaTrans
         */
        for (k = minMT-1; k >= 0; k--) {
            tempkn   = (k == B.nt-1)  ? B.n - k*B.nb  : B.nb;
            tempkmin = (k == minMT-1) ? minM - k*A.nb : A.nb;
            ldak = BLKLDD(A, k);
            for (n = B.nt-1; n > k; n--) {
                tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                ldan = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    double *bmk = B(m, k);
                    double *bmn = B(m, n);
                    double *vnk = A(n, k);
                    double *tnk = T(n, k);
                    /* side is not PlasmaLeft here, so the work leading dimension is T.nb */
                    CORE_dtsmqr(side, trans,
                                tempmm, B.nb, tempmm, tempnn, tempkmin, ib,
                                bmk, ldbm,
                                bmn, ldbm,
                                vnk, ldan,
                                tnk, T.mb,
                                work, T.nb);
                }
            }
            for (m = 0; m < B.mt; m++) {
                tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                ldbm = BLKLDD(B, m);
                double *vkk = A(k, k);
                double *tkk = T(k, k);
                double *bmk = B(m, k);
                CORE_dormqr(side, trans,
                            tempmm, tempkn, tempkmin, ib,
                            vkk, ldak,
                            tkk, T.mb,
                            bmk, ldbm,
                            work, T.nb);
            }
        }
    }
    else {
        /*
         *  PlasmaRight / PlasmaNoTrans
         */
        for (k = 0; k < minMT; k++) {
            tempkn   = (k == B.nt-1)  ? B.n - k*B.nb  : B.nb;
            tempkmin = (k == minMT-1) ? minM - k*A.nb : A.nb;
            ldak = BLKLDD(A, k);
            for (m = 0; m < B.mt; m++) {
                tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                ldbm = BLKLDD(B, m);
                double *vkk = A(k, k);
                double *tkk = T(k, k);
                double *bmk = B(m, k);
                CORE_dormqr(side, trans,
                            tempmm, tempkn, tempkmin, ib,
                            vkk, ldak,
                            tkk, T.mb,
                            bmk, ldbm,
                            work, T.nb);
            }
            for (n = k+1; n < B.nt; n++) {
                tempnn = (n == B.nt-1) ? B.n - n*B.nb : B.nb;
                ldan = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    tempmm = (m == B.mt-1) ? B.m - m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    double *bmk = B(m, k);
                    double *bmn = B(m, n);
                    double *vnk = A(n, k);
                    double *tnk = T(n, k);
                    /* side is not PlasmaLeft here, so the work leading dimension is T.nb */
                    CORE_dtsmqr(side, trans,
                                tempmm, B.nb, tempmm, tempnn, tempkmin, ib,
                                bmk, ldbm,
                                bmn, ldbm,
                                vnk, ldan,
                                tnk, T.mb,
                                work, T.nb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile QR factorization - static scheduling
 *
 *  Unpacks (A, T, sequence, request) from the context and returns at once
 *  when the sequence status is not PLASMA_SUCCESS.  Two private buffers are
 *  allocated (work: ib*T.nb elements, tau: A.nb elements) and released
 *  before returning.
 *
 *  Each rank walks (k, m, n) triples: for a step k and a tile column n it
 *  visits rows m = k .. A.mt-1, then moves PLASMA_SIZE columns ahead,
 *  wrapping into the next step when it runs past A.nt.
 *
 *  NOTE(review): ss_cond_wait / ss_cond_set are macros defined elsewhere;
 *  keep ss_cond_wait directly inside the scheduling loop in case its
 *  expansion contains a loop-control statement - confirm.
 **/
void plasma_pdgeqrf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempkn, tempnn, tempmm;
    int ib = PLASMA_IB;
    double *work, *tau;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    /* Initial (k, m, n) for this rank */
    k = 0;
    for (n = PLASMA_RANK; n >= A.nt; n = n-A.nt+k)
        k++;
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt) {
        tempkm = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        tempkn = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        tempnn = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
        tempmm = (m == A.mt-1) ? A.m - m*A.mb : A.mb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            /* Panel column: tile (m, k) must have finished step k-1 */
            ss_cond_wait(m, k, k-1);
            if (m == k) {
                CORE_dgeqrt(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    tau, work);
            }
            else {
                CORE_dtsqrt(
                    tempmm, tempkn, ib,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    tau, work);
            }
            ss_cond_set(m, k, k);
        }
        else {
            /* Trailing column: needs panel tile (m, k) at step k and
               tile (m, n) at step k-1 */
            ss_cond_wait(m, k, k);
            ss_cond_wait(m, n, k-1);
            if (m == k) {
                /* No progress value is published after this kernel */
                CORE_dormqr(
                    PlasmaLeft, PlasmaTrans,
                    tempkm, tempnn, tempkm, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    A(k, n), ldak,
                    work, T.nb);
            }
            else {
                CORE_dtsmqr(
                    PlasmaLeft, PlasmaTrans,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    work, ib);
                ss_cond_set(m, n, k);
            }
        }

        /* Advance to the next (k, m, n) handled by this rank */
        m++;
        if (m == A.mt) {
            n += PLASMA_SIZE;
            while (n >= A.nt && k < min(A.mt, A.nt)) {
                k++;
                n = n-A.nt+k;
            }
            m = k;
        }
    }

    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
/***************************************************************************//**
 *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
 *
 *  A        : descriptor of the matrix being factorized
 *  T        : descriptor of the tiles passed as T / T2 to the kernels
 *  BS       : number of tile rows per subdomain
 *  sequence : nothing is issued when its status is not PLASMA_SUCCESS
 *  request  : not referenced in this routine
 *
 *  For every step k the tile rows k .. A.mt-1 are split into subdomains of
 *  BS rows.  Each subdomain is factorized with geqrt on its top tile followed
 *  by tsqrt on the tiles below; the subdomain heads are then combined
 *  pairwise with ttqrt at distances RD = BS, 2*BS, 4*BS, ...  A lone bottom
 *  tile row never forms a subdomain of its own: it is absorbed by the
 *  subdomain above it.  Every factorization kernel is followed by the
 *  matching update of the columns to the right of k.
 **/
void plasma_psgeqrfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, M, RD;
    int ldaM, ldam, ldaMRD;
    int tempkn, tempMm, tempnn, tempmm, tempMRDm;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = 0; k < K; k++) {
        tempkn = A.nb;
        if (k == A.nt-1)
            tempkn = A.n - k*A.nb;

        /* Subdomain factorizations; no bottom single-row subdomain */
        for (M = k; M < A.mt-1 || M == k; M += BS) {
            tempMm = A.mb;
            if (M == A.mt-1)
                tempMm = A.m - M*A.mb;
            ldaM = BLKLDD(A, M);

            QUARK_CORE_sgeqrt(
                plasma->quark, &task_flags,
                tempMm, tempkn, ib, T.nb,
                A(M, k), ldaM,
                T(M, k), T.mb);

            for (n = k+1; n < A.nt; n++) {
                tempnn = A.nb;
                if (n == A.nt-1)
                    tempnn = A.n - n*A.nb;
                QUARK_CORE_sormqr(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaTrans,
                    tempMm, tempnn, tempMm, ib, T.nb,
                    A(M, k), ldaM,
                    T(M, k), T.mb,
                    A(M, n), ldaM);
            }

            /* Rows of the subdomain; also sucks in a bottom single-row domain */
            for (m = M+1; (m < M+BS && m < A.mt) || m == A.mt-1; m++) {
                tempmm = A.mb;
                if (m == A.mt-1)
                    tempmm = A.m - m*A.mb;
                ldam = BLKLDD(A, m);

                QUARK_CORE_stsqrt(
                    plasma->quark, &task_flags,
                    tempmm, tempkn, ib, T.nb,
                    A(M, k), ldaM,
                    A(m, k), ldam,
                    T(m, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = A.nb;
                    if (n == A.nt-1)
                        tempnn = A.n - n*A.nb;
                    QUARK_CORE_stsmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
                        A(M, n), ldaM,
                        A(m, n), ldam,
                        A(m, k), ldam,
                        T(m, k), T.mb);
                }
            }
        }

        /* Pairwise reduction of the subdomain heads */
        for (RD = BS; RD < A.mt-k; RD *= 2) {
            /* No reduction with a bottom single-row subdomain */
            for (M = k; M+RD < A.mt-1; M += 2*RD) {
                tempMRDm = A.mb;
                if (M+RD == A.mt-1)
                    tempMRDm = A.m - (M+RD)*A.mb;
                ldaM   = BLKLDD(A, M   );
                ldaMRD = BLKLDD(A, M+RD);

                QUARK_CORE_sttqrt(
                    plasma->quark, &task_flags,
                    tempMRDm, tempkn, ib, T.nb,
                    A (M   , k), ldaM,
                    A (M+RD, k), ldaMRD,
                    T2(M+RD, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = A.nb;
                    if (n == A.nt-1)
                        tempnn = A.n - n*A.nb;
                    QUARK_CORE_sttmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempMRDm, tempnn, A.nb, ib, T.nb,
                        A (M,    n), ldaM,
                        A (M+RD, n), ldaMRD,
                        A (M+RD, k), ldaMRD,
                        T2(M+RD, k), T.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile symmetric rank-2k update - static scheduling
 *
 *  Unpacks (uplo, trans, alpha, A, B, beta, C, sequence, request) from the
 *  context and returns at once when the sequence status is not
 *  PLASMA_SUCCESS.
 *
 *  The walk covers tile pairs (m, n) with m >= n, column by column, starting
 *  at linear position PLASMA_RANK and stepping by PLASMA_SIZE.  A pair with
 *  m == n updates the diagonal tile C(m, m) through CORE_csyr2k.  Any other
 *  pair updates C(m, n) when uplo is PlasmaLower and the mirrored tile
 *  C(n, m) otherwise, through two CORE_cgemm calls per k.
 *
 *  beta is applied only by the first update (k == 0) of a tile; every later
 *  update accumulates with a factor of one.
 *
 *  The leading dimension handed to a kernel must be the one of the tile row
 *  that is actually addressed: ldcm for C(m, .) and ldcn for C(n, .).
 **/
void plasma_pcsyr2k(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_enum trans;
    PLASMA_Complex32_t alpha;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_Complex32_t beta;
    PLASMA_desc C;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int m, n, k;
    int next_m;
    int next_n;
    int ldam, ldan, ldak;
    int ldbm, ldbn, ldbk;
    int ldcm, ldcn;
    int tempkn, tempkm, tempmm, tempnn;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;
    PLASMA_Complex32_t zbeta;

    plasma_unpack_args_9(uplo, trans, alpha, A, B, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Find the first (m, n) pair handled by this rank */
    n = 0;
    m = PLASMA_RANK;
    while (m >= C.mt && n < C.nt) {
        n++;
        m = m-C.mt+n;
    }

    while (n < C.nt) {
        /* Next pair handled by this rank */
        next_n = n;
        next_m = m + PLASMA_SIZE;
        while (next_m >= C.mt && next_n < C.nt) {
            next_n++;
            next_m = next_m - C.mt + next_n;
        }

        tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
        tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;

        ldcn = BLKLDD(C, n);
        ldcm = BLKLDD(C, m);

        if (m == n) {
            /*
             *  Diagonal tile / PlasmaNoTrans
             */
            if (trans == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                ldbm = BLKLDD(B, m);
                for (k = 0; k < A.nt; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    zbeta = k == 0 ? beta : zone;
                    CORE_csyr2k(
                        uplo, trans,
                        tempnn, tempkn,
                        alpha, A(m, k), ldam,
                               B(m, k), ldbm,
                        zbeta, C(m, m), ldcm);
                }
            }
            /*
             *  Diagonal tile / Plasma[Conj]Trans
             */
            else {
                for (k = 0; k < A.mt; k++) {
                    tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                    ldak = BLKLDD(A, k);
                    ldbk = BLKLDD(B, k);
                    zbeta = k == 0 ? beta : zone;
                    CORE_csyr2k(
                        uplo, trans,
                        tempnn, tempkm,
                        alpha, A(k, m), ldak,
                               B(k, m), ldbk,
                        zbeta, C(m, m), ldcm);
                }
            }
        }
        else {
            if (trans == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                ldan = BLKLDD(A, n);
                ldbm = BLKLDD(B, m);
                ldbn = BLKLDD(B, n);
                /*
                 *  PlasmaNoTrans / PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn,
                            alpha, A(m, k), ldam,
                                   B(n, k), ldbn,
                            zbeta, C(m, n), ldcm);

                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn,
                            alpha, B(m, k), ldbm,
                                   A(n, k), ldan,
                            zone,  C(m, n), ldcm);
                    }
                }
                /*
                 *  PlasmaNoTrans / PlasmaUpper
                 */
                else {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn,
                            alpha, A(n, k), ldan,
                                   B(m, k), ldbm,
                            zbeta, C(n, m), ldcn);

                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn,
                            alpha, B(n, k), ldbn,
                                   A(m, k), ldam,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
            else {
                /*
                 *  Plasma[Conj]Trans / PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    for (k = 0; k < A.mt; k++) {
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm,
                            alpha, A(k, m), ldak,
                                   B(k, n), ldbk,
                            zbeta, C(m, n), ldcm);

                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm,
                            alpha, B(k, m), ldbk,
                                   A(k, n), ldak,
                            zone,  C(m, n), ldcm);
                    }
                }
                /*
                 *  Plasma[Conj]Trans / PlasmaUpper
                 */
                else {
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        /* C(n, m) lives in tile row n: its leading dimension
                           is ldcn (the original passed ldcm here, which
                           differs when m is a partial last tile row) */
                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm,
                            alpha, A(k, n), ldak,
                                   B(k, m), ldbk,
                            zbeta, C(n, m), ldcn);

                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm,
                            alpha, B(k, n), ldbk,
                                   A(k, m), ldak,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
        }
        m = next_m;
        n = next_n;
    }
}
/***************************************************************************//** * Parallel initializztion a 2-D array A to * ALPHA on the offdiagonals. **/ void plasma_pzlaset2_quark(PLASMA_enum uplo, PLASMA_Complex64_t alpha, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int i, j; int ldai, ldaj; int tempim; int tempjm, tempjn; int minmn = min(A.mt, A.nt); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); if (uplo == PlasmaLower) { for (j = 0; j < minmn; j++){ tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb; tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaLower, tempjm, tempjn, alpha, A(j, j), ldaj); for (i = j+1; i < A.mt; i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } } else if (uplo == PlasmaUpper) { for (j = 1; j < A.nt; j++){ tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; for (i = 0; i < min(j, A.mt); i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } for (j = 0; j < minmn; j++){ tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb; tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpper, tempjm, tempjn, alpha, A(j, j), ldaj); } } else { for (i = 0; i < A.mt; i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); for (j = 0; j < A.nt; j++){ tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } } }
/***************************************************************************//** * Parallel application of Q using tile V - QR factorization - dynamic scheduling **/ void plasma_pcunmqr_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int ldak, ldbk, ldam, ldan, ldbm; int tempkm, tempnn, tempkmin, tempmm, tempkn; int ib, minMT, minM; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; if (A.m > A.n) { minM = A.n; minMT = A.nt; } else { minM = A.m; minMT = A.mt; } /* * PlasmaLeft / PlasmaConjTrans */ if (side == PlasmaLeft ) { if (trans == PlasmaConjTrans) { for (k = 0; k < minMT; k++) { tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempkm, tempnn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(k, n), ldbk); } for (m = k+1; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb, B(k, n), ldbk, B(m, n), ldbm, A(m, k), ldam, T(m, k), T.mb); } } } } /* * PlasmaLeft / PlasmaNoTrans */ else { for (k = minMT-1; k >= 0; k--) { tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (m = B.mt-1; m > k; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb, B(k, n), ldbk, B(m, n), ldbm, A(m, k), ldam, T(m, k), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempkm, tempnn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(k, n), ldbk); } } } } /* * PlasmaRight / PlasmaConjTrans */ else { if (trans == PlasmaConjTrans) { for (k = minMT-1; k >= 0; k--) { tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); for (n = B.nt-1; n > k; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb, B(m, k), ldbm, B(m, n), ldbm, A(n, k), ldan, T(n, k), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempmm, tempkn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(m, k), ldbm); } } } /* * PlasmaRight / PlasmaNoTrans */ else { for (k = 0; k < minMT; k++) { tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb; tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb; ldak = BLKLDD(A, k); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempmm, tempkn, tempkmin, ib, T.nb, A(k, k), ldak, T(k, k), T.mb, B(m, k), ldbm); } for (n = k+1; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb, B(m, k), ldbm, B(m, n), ldbm, A(n, k), ldan, T(n, k), T.mb); } } } } } }
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - dynamic scheduling
 *
 *  uplo     : PlasmaLower selects the lower-triangular sweep, any other value
 *             the upper-triangular one
 *  A        : descriptor of the matrix being factorized
 *  sequence : nothing is issued when its status is not PLASMA_SUCCESS;
 *             forwarded, with request, to QUARK_CORE_spotrf
 *  request  : forwarded to QUARK_CORE_spotrf
 *
 *  For every step k the calls are issued in this order: potrf on the
 *  diagonal tile, trsm on every tile of the panel, then for each remaining
 *  diagonal tile a syrk followed by the gemm updates of the off-diagonal
 *  tiles between the panel and that diagonal tile.  A.nb*k is handed to
 *  QUARK_CORE_spotrf as its last argument.
 **/
void plasma_pspotrf_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempmm;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    if (uplo == PlasmaLower) {
        /*
         *  PlasmaLower
         */
        for (k = 0; k < A.mt; k++) {
            tempkm = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
            ldak = BLKLDD(A, k);

            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaLower, tempkm, A.mb,
                A(k, k), ldak,
                sequence, request, A.nb*k);

            /* Panel solve */
            for (m = k+1; m < A.mt; m++) {
                tempmm = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                    tempmm, A.mb, A.mb,
                    zone, A(k, k), ldak,
                          A(m, k), ldam);
            }

            /* Trailing update */
            for (m = k+1; m < A.mt; m++) {
                tempmm = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaLower, PlasmaNoTrans,
                    tempmm, A.mb, A.mb,
                    -1.0, A(m, k), ldam,
                     1.0, A(m, m), ldam);

                for (n = k+1; n < m; n++) {
                    /* A.mb is passed as the leading dimension of A(n, k) */
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaTrans,
                        tempmm, A.mb, A.mb, A.mb,
                        mzone, A(m, k), ldam,
                               A(n, k), A.mb,
                        zone,  A(m, n), ldam);
                }
            }
        }
    }
    else {
        /*
         *  PlasmaUpper
         */
        for (k = 0; k < A.nt; k++) {
            tempkm = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
            ldak = BLKLDD(A, k);

            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaUpper, tempkm, A.mb,
                A(k, k), ldak,
                sequence, request, A.nb*k);

            /* Panel solve */
            for (m = k+1; m < A.nt; m++) {
                tempmm = (m == A.nt-1) ? A.n - m*A.nb : A.nb;
                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                    A.nb, tempmm, A.mb,
                    zone, A(k, k), ldak,
                          A(k, m), ldak);
            }

            /* Trailing update */
            for (m = k+1; m < A.nt; m++) {
                tempmm = (m == A.nt-1) ? A.n - m*A.nb : A.nb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaUpper, PlasmaTrans,
                    tempmm, A.mb, A.mb,
                    -1.0, A(k, m), ldak,
                     1.0, A(m, m), ldam);

                for (n = k+1; n < m; n++) {
                    /* A.mb is passed as the leading dimension of A(n, m) */
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaTrans, PlasmaNoTrans,
                        A.mb, tempmm, A.mb, A.mb,
                        mzone, A(k, n), ldak,
                               A(k, m), ldak,
                        zone,  A(n, m), A.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 *
 *  A        : descriptor of the matrix being factorized
 *  T        : descriptor of the tiles passed as T / T2 to the kernels
 *  BS       : number of tile columns per subdomain
 *  sequence : nothing is issued when its status is not PLASMA_SUCCESS
 *  request  : not referenced in this routine
 *
 *  For every step k the tile columns k .. A.nt-1 are split into subdomains
 *  of BS columns.  Each subdomain is factorized with gelqt on its leftmost
 *  tile followed by tslqt on the tiles to its right; the subdomain heads are
 *  then combined pairwise with ttlqt at distances RD = BS, 2*BS, 4*BS, ...
 *  A lone rightmost tile column never forms a subdomain of its own: it is
 *  absorbed by the subdomain on its left.  Every factorization kernel is
 *  followed by the matching update of the tile rows below k.
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = 0; k < K; k++) {
        tempkm = A.mb;
        if (k == A.mt-1)
            tempkm = A.m - k*A.mb;
        ldak = BLKLDD(A, k);

        /* Subdomain factorizations; no rightmost single-column subdomain */
        for (N = k; N < A.nt-1 || N == k; N += BS) {
            tempNn = A.nb;
            if (N == A.nt-1)
                tempNn = A.n - N*A.nb;

            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                tempkm, tempNn, ib, T.nb,
                A(k, N), ldak,
                T(k, N), T.mb);

            for (m = k+1; m < A.mt; m++) {
                tempmm = A.mb;
                if (m == A.mt-1)
                    tempmm = A.m - m*A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam);
            }

            /* Columns of the subdomain; also sucks in a rightmost
               single-column domain */
            for (n = N+1; (n < N+BS && n < A.nt) || n == A.nt-1; n++) {
                tempnn = A.nb;
                if (n == A.nt-1)
                    tempnn = A.n - n*A.nb;

                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    tempkm, tempnn, ib, T.nb,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = A.mb;
                    if (m == A.mt-1)
                        tempmm = A.m - m*A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
            }
        }

        /* Pairwise reduction of the subdomain heads */
        for (RD = BS; RD < A.nt-k; RD *= 2) {
            /* No reduction with a rightmost single-column subdomain */
            for (N = k; N+RD < A.nt-1; N += 2*RD) {
                tempNRDn = A.nb;
                if (N+RD == A.nt-1)
                    tempNRDn = A.n - (N+RD)*A.nb;

                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    tempkm, tempNRDn, ib, T.nb,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = A.mb;
                    if (m == A.mt-1)
                        tempmm = A.m - m*A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, A.mb, ib, T.nb,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - static scheduling
 *
 *  Unpacks (uplo, A, sequence, request) from the context and returns at once
 *  when the sequence status is not PLASMA_SUCCESS.
 *
 *  Each rank walks (k, m, n) triples: for a tile (m, k) it runs n = 0 .. k,
 *  applying the updates for n < k before the final kernel at n == k, then
 *  moves PLASMA_SIZE rows ahead, wrapping into the next k when it runs past
 *  A.nt.  uplo == PlasmaLower addresses tile (m, k); any other value
 *  addresses the mirrored tile (k, m).
 *
 *  A nonzero info from CORE_spotrf is recorded with plasma_request_fail()
 *  and followed by ss_abort(); the main loop also stops once ss_aborted()
 *  is true.
 *
 *  NOTE(review): ss_cond_wait / ss_cond_set / ss_abort are macros defined
 *  elsewhere; keep ss_cond_wait directly inside the scheduling loop in case
 *  its expansion contains a loop-control statement - confirm.
 **/
void plasma_pspotrf(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int ldak, ldam, ldan;
    int info;
    int tempkn, tempmn;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma_unpack_args_4(uplo, A, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(A.nt, A.nt, 0);

    /* Initial (k, m, n) for this rank */
    k = 0;
    for (m = PLASMA_RANK; m >= A.nt; m = m-A.nt+k)
        k++;
    n = 0;

    while (k < A.nt && m < A.nt && !ss_aborted()) {
        tempkn = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        tempmn = (m == A.nt-1) ? A.n - m*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldan = BLKLDD(A, n);
        ldam = BLKLDD(A, m);

        if (m == k && n == k) {
            /* Factorize the diagonal tile */
            if (uplo == PlasmaLower) {
                CORE_spotrf(
                    PlasmaLower,
                    tempkn,
                    A(k, k), ldak,
                    &info);
            }
            else {
                CORE_spotrf(
                    PlasmaUpper,
                    tempkn,
                    A(k, k), ldak,
                    &info);
            }
            if (info != 0) {
                plasma_request_fail(sequence, request, info + A.nb*k);
                ss_abort();
            }
            ss_cond_set(k, k, 1);
        }
        else if (m == k) {
            /* Update the diagonal tile with column n */
            ss_cond_wait(k, n, 1);
            if (uplo == PlasmaLower) {
                CORE_ssyrk(
                    PlasmaLower, PlasmaNoTrans,
                    tempkn, A.nb,
                    -1.0, A(k, n), ldak,
                     1.0, A(k, k), ldak);
            }
            else {
                CORE_ssyrk(
                    PlasmaUpper, PlasmaTrans,
                    tempkn, A.nb,
                    -1.0, A(n, k), ldan,
                     1.0, A(k, k), ldak);
            }
        }
        else if (n == k) {
            /* Solve against the factorized diagonal tile */
            ss_cond_wait(k, k, 1);
            if (uplo == PlasmaLower) {
                CORE_strsm(
                    PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                    tempmn, A.nb,
                    zone, A(k, k), ldak,
                          A(m, k), ldam);
            }
            else {
                CORE_strsm(
                    PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                    A.nb, tempmn,
                    zone, A(k, k), ldak,
                          A(k, m), ldak);
            }
            ss_cond_set(m, k, 1);
        }
        else {
            /* Update the off-diagonal tile with column n */
            ss_cond_wait(k, n, 1);
            ss_cond_wait(m, n, 1);
            if (uplo == PlasmaLower) {
                CORE_sgemm(
                    PlasmaNoTrans, PlasmaTrans,
                    tempmn, A.nb, A.nb,
                    mzone, A(m, n), ldam,
                           A(k, n), ldak,
                    zone,  A(m, k), ldam);
            }
            else {
                CORE_sgemm(
                    PlasmaTrans, PlasmaNoTrans,
                    A.nb, tempmn, A.nb,
                    mzone, A(n, k), ldan,
                           A(n, m), ldan,
                    zone,  A(k, m), ldak);
            }
        }

        /* Advance to the next (k, m, n) handled by this rank */
        n++;
        if (n > k) {
            m += PLASMA_SIZE;
            while (m >= A.nt && k < A.nt) {
                k++;
                m = m-A.nt+k;
            }
            n = 0;
        }
    }

    ss_finalize();
}
/***************************************************************************//** * Parallel application of Q using tile V - QR factorization (reduction Householder) * - dynamic scheduling **/ void plasma_psormlqrh_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldaN, ldam, ldak; int ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); if (side == PlasmaLeft ) { if (trans == PlasmaNoTrans) { /* * PlasmaLeft / PlasmaNoTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column subdomain */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_sormlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } for (m = N+1; (m < N+BS && m < A.nt) || m == A.nt-1; /* Suck in rightmost single-column domain */ m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); ldam = BLKLDD(A, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_stsmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? 
A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_sttmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } else { /* * PlasmaLeft / PlasmaTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_sttmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column subdomain */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (m = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ m >= N+1; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); ldam = BLKLDD(A, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_stsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_sormlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } } } } } else { if (trans == PlasmaNoTrans) { /* * PlasmaRight / PlasmaNoTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_sttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column subdomain */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ n >= N+1; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_stsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_sormlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(m, N), ldbm); } } } } else { /* * PlasmaRight / PlasmaTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column subdomain */ N += BS) { tempNn = N == A.nt-1 ? 
A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_sormlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldaN, T(k, N), T.mb, B(m, N), ldbm); } for (n = N+1; (n < N+BS && n < A.nt) || n == A.nt-1; /* Suck in rightmost single-column domain */ n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_stsmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_sttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } } }
/***************************************************************************//** * **/ void plasma_pdlacpy_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int X, Y; int m, n; int ldam, ldbm; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); switch (uplo) { /* * PlasmaUpper */ case PlasmaUpper: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); if (m < A.nt) { Y = m == A.nt-1 ? A.n-m*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpper, X, Y, A.mb, A(m, m), ldam, B(m, m), ldbm); } for (n = m+1; n < A.nt; n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } break; /* * PlasmaLower */ case PlasmaLower: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); if (m < A.nt) { Y = m == A.nt-1 ? A.n-m*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaLower, X, Y, A.mb, A(m, m), ldam, B(m, m), ldbm); } for (n = 0; n < min(m, A.nt); n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } break; /* * PlasmaUpperLower */ case PlasmaUpperLower: default: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < A.nt; n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } } }
/***************************************************************************//**
 *  Parallel tile LQ factorization - dynamic scheduling
 *
 *  For every diagonal step k the routine queues, in this order: the cgelqt
 *  task on A(k, k); one cunmlq task per tile below it in tile column k; then,
 *  for each tile column n to the right, a ctslqt task on (A(k, k), A(k, n))
 *  followed by one ctsmlq task per tile row below k.
 **/
void plasma_pcgelqf_quark(PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int steps;
    int ld_k, ld_m;
    int rows_k, cols_k, rows_m, cols_n;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib    = PLASMA_IB;
    steps = min(A.mt, A.nt);

    for (k = 0; k < steps; k++) {
        /* Edge tiles may be smaller than a full tile. */
        if (k == A.mt-1)
            rows_k = A.m - k*A.mb;
        else
            rows_k = A.mb;
        if (k == A.nt-1)
            cols_k = A.n - k*A.nb;
        else
            cols_k = A.nb;
        ld_k = BLKLDD(A, k);

        QUARK_CORE_cgelqt(
            plasma->quark, &task_flags,
            rows_k, cols_k, ib, T.nb,
            A(k, k), ld_k,
            T(k, k), T.mb);

        for (m = k+1; m < A.mt; m++) {
            if (m == A.mt-1)
                rows_m = A.m - m*A.mb;
            else
                rows_m = A.mb;
            ld_m = BLKLDD(A, m);

            QUARK_CORE_cunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaConjTrans,
                rows_m, cols_k, cols_k, ib, T.nb,
                A(k, k), ld_k,
                T(k, k), T.mb,
                A(m, k), ld_m);
        }

        for (n = k+1; n < A.nt; n++) {
            if (n == A.nt-1)
                cols_n = A.n - n*A.nb;
            else
                cols_n = A.nb;

            QUARK_CORE_ctslqt(
                plasma->quark, &task_flags,
                rows_k, cols_n, ib, T.nb,
                A(k, k), ld_k,
                A(k, n), ld_k,
                T(k, n), T.mb);

            for (m = k+1; m < A.mt; m++) {
                if (m == A.mt-1)
                    rows_m = A.m - m*A.mb;
                else
                    rows_m = A.mb;
                ld_m = BLKLDD(A, m);

                QUARK_CORE_ctsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    rows_m, A.nb, rows_m, cols_n, A.mb, ib, T.nb,
                    A(m, k), ld_m,
                    A(m, n), ld_m,
                    A(k, n), ld_k,
                    T(k, n), T.mb);
            }
        }
    }
}
/***************************************************************************//**
 *  Tile-by-tile copy of A into B - static scheduling
 *
 *  Arguments (uplo, A, B, sequence, request) are unpacked from the context.
 *  Each thread starts at the tile whose linear position (within the selected
 *  triangle, or the whole matrix) equals PLASMA_RANK and then advances by
 *  PLASMA_SIZE tiles at a time, calling CORE_dlacpy on every tile it lands
 *  on. Diagonal tiles are copied with the requested uplo in the triangular
 *  modes; all other tiles are copied in full.
 *
 *  The start-up loops that translate the rank into a (m, n) tile position
 *  are bounded by the tile grid: when the rank is larger than the number of
 *  tiles available, the thread simply has nothing to do instead of walking
 *  the indices past the matrix.
 **/
void plasma_pdlacpy(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int X, Y;
    int m, n;
    int next_m;
    int next_n;
    int ldam, ldbm;

    plasma_unpack_args_5(uplo, A, B, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    switch (uplo) {
    /*
     *  PlasmaUpper
     */
    case PlasmaUpper:
        /* Tiles are visited row by row; row m starts at column m. */
        m = 0;
        n = PLASMA_RANK;
        while (n >= A.nt && m < A.mt) {
            m++;
            n = n - A.nt + m;
        }

        while (m < A.mt) {
            next_m = m;
            next_n = n;

            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_m < A.mt) {
                next_m++;
                next_n = next_n - A.nt + next_m;
            }

            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                m == n ? uplo : PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;

    /*
     *  PlasmaLower
     */
    case PlasmaLower:
        /* Tiles are visited column by column; column n starts at row n. */
        n = 0;
        m = PLASMA_RANK;
        while (m >= A.mt && n < A.nt) {
            n++;
            m = m - A.mt + n;
        }

        while (n < A.nt) {
            next_m = m;
            next_n = n;

            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_n < A.nt) {
                next_n++;
                next_m = next_m - A.mt + next_n;
            }

            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                m == n ? uplo : PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;

    /*
     *  PlasmaUpperLower
     */
    case PlasmaUpperLower:
    default:
        /* Tiles are visited column by column over the full grid. */
        n = 0;
        m = PLASMA_RANK;
        while (m >= A.mt && n < A.nt) {
            n++;
            m = m - A.mt;
        }

        while (n < A.nt) {
            next_m = m;
            next_n = n;

            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_n < A.nt) {
                next_n++;
                next_m = next_m - A.mt;
            }

            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;
    }
}
/***************************************************************************//**
 *  Parallel UU' or L'L operation - dynamic scheduling
 *
 *  For each diagonal index m the routine queues, in order: for every n < m a
 *  cherk task updating A(n, n) followed by cgemm tasks for n < k < m; then a
 *  ctrmm task for every n < m; and finally the clauum task on A(m, m).
 *  PlasmaLower works on the tiles of tile row m; any other uplo value takes
 *  the upper branch and works on the tiles of tile column m.
 **/
void plasma_pclauum_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ld_m;
    int dim_k, dim_m, dim_n;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        if (m == A.mt-1)
            dim_m = A.m - m*A.mb;
        else
            dim_m = A.mb;
        ld_m = BLKLDD(A, m);

        if (uplo == PlasmaLower) {
            /*
             *  PlasmaLower
             */
            for (n = 0; n < m; n++) {
                if (n == A.nt-1)
                    dim_n = A.n - n*A.nb;
                else
                    dim_n = A.nb;

                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaConjTrans,
                    dim_n, dim_m, A.mb,
                    1.0, A(m, n), ld_m,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    if (k == A.mt-1)
                        dim_k = A.m - k*A.mb;
                    else
                        dim_k = A.mb;

                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaConjTrans, PlasmaNoTrans,
                        dim_k, dim_n, dim_m, A.mb,
                        zone, A(m, k), ld_m,
                              A(m, n), ld_m,
                        zone, A(k, n), A.mb);
                }
            }
            for (n = 0; n < m; n++) {
                if (n == A.nt-1)
                    dim_n = A.n - n*A.nb;
                else
                    dim_n = A.nb;

                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    dim_m, dim_n, A.mb,
                    zone, A(m, m), ld_m,
                          A(m, n), ld_m);
            }
        }
        else {
            /*
             *  PlasmaUpper
             */
            for (n = 0; n < m; n++) {
                if (n == A.nt-1)
                    dim_n = A.n - n*A.nb;
                else
                    dim_n = A.nb;

                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaNoTrans,
                    dim_n, dim_m, A.mb,
                    1.0, A(n, m), A.mb,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    if (k == A.mt-1)
                        dim_k = A.m - k*A.mb;
                    else
                        dim_k = A.mb;

                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaConjTrans,
                        dim_n, dim_k, dim_m, A.mb,
                        zone, A(n, m), A.mb,
                              A(k, m), A.mb,
                        zone, A(n, k), A.mb);
                }
            }
            for (n = 0; n < m; n++) {
                if (n == A.nt-1)
                    dim_n = A.n - n*A.nb;
                else
                    dim_n = A.nb;

                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    dim_n, dim_m, A.mb,
                    zone, A(m, m), ld_m,
                          A(n, m), A.mb);
            }
        }

        /* The diagonal tile is processed last in both modes. */
        QUARK_CORE_clauum(
            plasma->quark, &task_flags,
            uplo,
            dim_m, A.mb,
            A(m, m), ld_m);
    }
}
/***************************************************************************//**
 *  Parallel tile LU factorization with no pivoting - dynamic scheduling
 *
 *  For every diagonal step k the routine queues, in this order: the
 *  zgetrf_nopiv task on A(k, k) (passing sequence, request and the offset
 *  A.mb*k); a right/upper ztrsm per tile below the diagonal in tile column k;
 *  then, for each tile column n to the right, a left/lower/unit ztrsm on
 *  A(k, n) followed by one zgemm update per tile row below k.
 **/
void plasma_pzgetrf_nopiv_quark(PLASMA_desc A,
                                PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n, ib;
    int steps;
    int ld_k, ld_m;
    int rows_k, cols_k, rows_m, cols_n;

    PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib    = PLASMA_IB;
    steps = min(A.mt, A.nt);

    for (k = 0; k < steps; k++) {
        /* Edge tiles may be smaller than a full tile. */
        if (k == A.mt-1)
            rows_k = A.m - k*A.mb;
        else
            rows_k = A.mb;
        if (k == A.nt-1)
            cols_k = A.n - k*A.nb;
        else
            cols_k = A.nb;
        ld_k = BLKLDD(A, k);

        QUARK_CORE_zgetrf_nopiv(
            plasma->quark, &task_flags,
            rows_k, cols_k, ib, A.mb,
            A(k, k), ld_k,
            sequence, request, A.mb*k);

        for (m = k+1; m < A.mt; m++) {
            if (m == A.mt-1)
                rows_m = A.m - m*A.mb;
            else
                rows_m = A.mb;
            ld_m = BLKLDD(A, m);

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit,
                rows_m, cols_k, A.mb,
                zone, A(k, k), ld_k,
                      A(m, k), ld_m);
        }

        for (n = k+1; n < A.nt; n++) {
            if (n == A.nt-1)
                cols_n = A.n - n*A.nb;
            else
                cols_n = A.nb;

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaLower, PlasmaNoTrans, PlasmaUnit,
                rows_k, cols_n, A.mb,
                zone, A(k, k), ld_k,
                      A(k, n), ld_k);

            for (m = k+1; m < A.mt; m++) {
                if (m == A.mt-1)
                    rows_m = A.m - m*A.mb;
                else
                    rows_m = A.mb;
                ld_m = BLKLDD(A, m);

                QUARK_CORE_zgemm(
                    plasma->quark, &task_flags,
                    PlasmaNoTrans, PlasmaNoTrans,
                    rows_m, cols_n, A.mb, A.mb,
                    mzone, A(m, k), ld_m,
                           A(k, n), ld_k,
                    zone,  A(m, n), ld_m);
            }
        }
    }
}