/***************************************************************************//**
 *  Tile-by-tile matrix generation - dynamic scheduling
 *
 *  Visits every tile of A and inserts one QUARK_CORE_dplrnt task for it.
 *  Each task receives the tile extent, the tile address and leading
 *  dimension, the global row count A.m, the global (row, column) offset of
 *  the tile and the caller-supplied seed.
 *  NOTE(review): the seed argument suggests pseudo-random generation -
 *  confirm against CORE_dplrnt.
 *
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pdplrnt_quark( PLASMA_desc A, unsigned long long int seed,
                           PLASMA_sequence *sequence, PLASMA_request *request )
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int row, col;

    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (row = 0; row < A.mt; row++) {
        int tile_rows = A.mb;
        int ld_tile;

        /* The last tile row may be only partially filled. */
        if (row == A.mt-1) {
            tile_rows = A.m - row*A.mb;
        }
        ld_tile = BLKLDD(A, row);

        for (col = 0; col < A.nt; col++) {
            int tile_cols = A.nb;

            /* The last tile column may be only partially filled. */
            if (col == A.nt-1) {
                tile_cols = A.n - col*A.nb;
            }

            QUARK_CORE_dplrnt(
                plasma->quark, &task_flags,
                tile_rows, tile_cols, A(row, col), ld_tile,
                A.m, row*A.mb, col*A.nb, seed );
        }
    }
}
/**
 * Warm-up pass: for threads 0 and 1, inserts one CORE_H2D task followed by
 * one CORE_D2H task on a scratch NB-by-NB slice, waits for all of them with
 * QUARK_Barrier, then releases the scratch buffers.
 *
 * The warm-up is best effort: if either scratch buffer cannot be obtained
 * the function returns without inserting any task.
 *
 * @param[in] q  QUARK instance the tasks are inserted into.
 */
void warmup(Quark *q){
    int NB = 200;
    double *H;
    double *D;

    /* Host scratch: one NB*NB slice per thread slot. */
    H = (double*) malloc(NB*NB*OOC_NTHREADS*sizeof(double));
    if (H == NULL) {
        return;
    }

    /* Second scratch buffer, obtained from the offload allocator.
     * NOTE(review): assumes offload_Alloc reports failure with NULL - confirm. */
    D = (double*) offload_Alloc(NB*NB*OOC_NTHREADS*sizeof(double), 0);
    if (D == NULL) {
        free(H);
        return;
    }

    {
        Quark_Task_Flags tflags = Quark_Task_Flags_Initializer;
        /* Only thread slots 0 and 1 are exercised; a loop over all
         * OOC_NTHREADS slots (and THREAD_SET_TO_MANUAL_SCHEDULING) used to be
         * here but was disabled.
         * NOTE(review): slices r = 0,1 are indexed below, so this presumes
         * OOC_NTHREADS >= 2 - confirm. */
        for(int r = 0; r < 2; r++){
            QUARK_Task_Flag_Set(&tflags, TASK_LOCK_TO_THREAD, r);
            QUARK_Insert_Task(q, CORE_H2D, &tflags,
                sizeof(int),    &NB,       VALUE,
                sizeof(int),    &NB,       VALUE,
                sizeof(double), H+r*NB*NB, INPUT,
                sizeof(int),    &NB,       VALUE,
                sizeof(double), D+r*NB*NB, OUTPUT,
                sizeof(int),    &NB,       VALUE,
                0);
            QUARK_Insert_Task(q, CORE_D2H, &tflags,
                sizeof(int),    &NB,       VALUE,
                sizeof(int),    &NB,       VALUE,
                sizeof(double), D+r*NB*NB, INPUT,
                sizeof(int),    &NB,       VALUE,
                sizeof(double), H+r*NB*NB, OUTPUT,
                sizeof(int),    &NB,       VALUE,
                0);
        }
    }

    /* All inserted tasks must finish before the buffers are released. */
    QUARK_Barrier(q);
    offload_Free(D, 0);
    free(H);
}
/***************************************************************************//**
 *  Tile-wise application of QUARK_CORE_slag2d from SA to B - dynamic scheduling
 *
 *  For every tile (row, col) of SA, one QUARK_CORE_slag2d task is inserted
 *  with the matching tile of B as its second operand.  Tile extents are
 *  taken from SA; leading dimensions are taken from each descriptor.
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pslag2d_quark(PLASMA_desc SA, PLASMA_desc B,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int row, col;

    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (row = 0; row < SA.mt; row++) {
        /* Rows held by this tile row (the last one may be partial). */
        int tile_rows = (row == SA.mt-1) ? SA.m - row*SA.mb : SA.mb;
        int ld_src = BLKLDD(SA, row);
        int ld_dst = BLKLDD(B, row);

        for (col = 0; col < SA.nt; col++) {
            /* Columns held by this tile column (the last one may be partial). */
            int tile_cols = (col == SA.nt-1) ? SA.n - col*SA.nb : SA.nb;

            QUARK_CORE_slag2d(
                plasma->quark, &task_flags,
                tile_rows, tile_cols, SA.mb,
                SA(row, col), ld_src,
                B(row, col), ld_dst);
        }
    }
}
/***************************************************************************//**
 *  Tile-wise application of QUARK_CORE_daxpy to A and B - dynamic scheduling
 *
 *  For every tile (row, col) of A, one QUARK_CORE_daxpy task is inserted
 *  with the scalar alpha, the tile of A and the matching tile of B.  Tile
 *  extents are taken from A; leading dimensions from each descriptor.
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pdaxpy_quark(double alpha, PLASMA_desc A, PLASMA_desc B,
                         PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int row, col;

    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (row = 0; row < A.mt; row++) {
        /* Rows held by this tile row (the last one may be partial). */
        int tile_rows = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
        int ld_a = BLKLDD(A, row);
        int ld_b = BLKLDD(B, row);

        for (col = 0; col < A.nt; col++) {
            /* Columns held by this tile column (the last one may be partial). */
            int tile_cols = (col == A.nt-1) ? A.n - col*A.nb : A.nb;

            QUARK_CORE_daxpy(
                plasma->quark, &task_flags,
                tile_rows, tile_cols, A.mb,
                alpha, A(row, col), ld_a,
                       B(row, col), ld_b);
        }
    }
}
/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
 *
 *  Walks the panels of A from the last one (min(A.mt, A.nt)-1) down to 0.
 *  For each panel, the tile rows of Q below the panel row are visited from
 *  the bottom up and one QUARK_CORE_dtsmqr task is inserted per tile column
 *  of Q; afterwards one QUARK_CORE_dormqr task per tile column of Q is
 *  inserted for the panel row itself.
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pdorgqr_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int panel, row, col;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (panel = min(A.mt, A.nt)-1; panel >= 0; panel--) {
        /* Extent of the diagonal tile of A for this panel. */
        int a_diag_rows = (panel == A.mt-1) ? A.m - panel*A.mb : A.mb;
        int a_diag_cols = (panel == A.nt-1) ? A.n - panel*A.nb : A.nb;
        int diag_min    = min( a_diag_cols, a_diag_rows );
        /* Rows of Q in the panel tile row. */
        int q_panel_rows = (panel == Q.mt-1) ? Q.m - panel*Q.mb : Q.mb;
        int ld_a_panel = BLKLDD(A, panel);
        int ld_q_panel = BLKLDD(Q, panel);

        for (row = Q.mt - 1; row > panel; row--) {
            int q_rows   = (row == Q.mt-1) ? Q.m - row*Q.mb : Q.mb;
            int ld_a_row = BLKLDD(A, row);
            int ld_q_row = BLKLDD(Q, row);

            for (col = 0; col < Q.nt; col++) {
                int q_cols = (col == Q.nt-1) ? Q.n - col*Q.nb : Q.nb;

                QUARK_CORE_dtsmqr(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaNoTrans,
                    Q.mb, q_cols, q_rows, q_cols, a_diag_cols, ib, T.nb,
                    Q(panel, col), ld_q_panel,
                    Q(row, col), ld_q_row,
                    A(row, panel), ld_a_row,
                    T(row, panel), T.mb);
            }
        }

        for (col = 0; col < Q.nt; col++) {
            int q_cols = (col == Q.nt-1) ? Q.n - col*Q.nb : Q.nb;

            QUARK_CORE_dormqr(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaNoTrans,
                q_panel_rows, q_cols, diag_min, ib, T.nb,
                A(panel, panel), ld_a_panel,
                T(panel, panel), T.mb,
                Q(panel, col), ld_q_panel);
        }
    }
}
/***************************************************************************//**
 *  Parallel forward substitution for tile LU - dynamic scheduling
 *
 *  For each panel k of A (k < min(A.mt, A.nt)), one QUARK_CORE_zgessm task is
 *  inserted per tile column of B on tile row k, followed by one
 *  QUARK_CORE_zssssm task per (m, n) with m ranging over the tile rows below
 *  k and n over the tile columns of B.
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pztrsmpl_quark(PLASMA_desc A, PLASMA_desc B, PLASMA_desc L, int *IPIV,
                           PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int panel, row, col;
    int npanels;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    npanels = min(A.mt, A.nt);
    for (panel = 0; panel < npanels; panel++) {
        int a_rows_k = (panel == A.mt-1) ? A.m - panel*A.mb : A.mb;
        int a_cols_k = (panel == A.nt-1) ? A.n - panel*A.nb : A.nb;
        /* Size of the diagonal part on the last panel, full tile otherwise. */
        int diag_k   = (panel == npanels-1) ? min(A.m, A.n) - panel*A.mb : A.mb;
        int ld_a_k = BLKLDD(A, panel);
        int ld_b_k = BLKLDD(B, panel);

        for (col = 0; col < B.nt; col++) {
            int b_cols = (col == B.nt-1) ? B.n - col*B.nb : B.nb;

            QUARK_CORE_zgessm(
                plasma->quark, &task_flags,
                a_rows_k, b_cols, diag_k, ib, L.nb,
                IPIV(panel, panel),
                A(panel, panel), ld_a_k,
                B(panel, col), ld_b_k);
        }

        for (row = panel+1; row < A.mt; row++) {
            int a_rows = (row == A.mt-1) ? A.m - row*A.mb : A.mb;
            int ld_a_row = BLKLDD(A, row);
            int ld_b_row = BLKLDD(B, row);

            for (col = 0; col < B.nt; col++) {
                int b_cols = (col == B.nt-1) ? B.n - col*B.nb : B.nb;

                QUARK_CORE_zssssm(
                    plasma->quark, &task_flags,
                    A.nb, b_cols, a_rows, b_cols, a_cols_k, ib, L.nb,
                    B(panel, col), ld_b_k,
                    B(row, col), ld_b_row,
                    L(row, panel), L.mb,
                    A(row, panel), ld_a_row,
                    IPIV(row, panel));
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile row interchanges - dynamic scheduling
 *
 *  Inserts one QUARK_CORE_claswp_ontile task per tile (m, n) of B.  Each
 *  task is handed the sub-descriptor of B that starts at the tile's first
 *  row and spans all remaining rows, restricted to the tile's columns.
 *
 *  When inc > 0 the tile rows are visited top to bottom and the last
 *  argument is the bottom tile B(B.mt-1, n); otherwise they are visited
 *  bottom to top and the last argument is the top tile B(0, n).
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pclaswp_quark(PLASMA_desc B, int *IPIV, int inc,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int row, col;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    if ( inc > 0 )
    {
        /* Forward sweep over the tile rows. */
        for (row = 0; row < B.mt; row++) {
            int first_row  = row * B.mb;
            int rows_below = B.m - first_row;
            int tile_rows  = (row == B.mt-1) ? rows_below : B.mb;

            for (col = 0; col < B.nt; col++) {
                int tile_cols = (col == B.nt-1) ? B.n - col * B.nb : B.nb;

                QUARK_CORE_claswp_ontile(
                    plasma->quark, &task_flags,
                    plasma_desc_submatrix(B, first_row, col*B.nb, rows_below, tile_cols),
                    B(row, col), 1, tile_rows, IPIV(row), inc, B(B.mt-1, col) );
            }
        }
    }
    else
    {
        /* Backward sweep over the tile rows. */
        for (row = B.mt-1; row > -1; row--) {
            int first_row  = row * B.mb;
            int rows_below = B.m - first_row;
            int tile_rows  = (row == B.mt-1) ? rows_below : B.mb;

            for (col = 0; col < B.nt; col++) {
                int tile_cols = (col == B.nt-1) ? B.n - col * B.nb : B.nb;

                QUARK_CORE_claswp_ontile(
                    plasma->quark, &task_flags,
                    plasma_desc_submatrix(B, first_row, col*B.nb, rows_below, tile_cols),
                    B(row, col), 1, tile_rows, IPIV(row), inc, B(0, col) );
            }
        }
    }
}
/***************************************************************************//**
 *  Zeroes a submatrix in tile layout - dynamic scheduling
 *
 *  Inserts one CORE_ztile_zero_quark task per tile of A.  Each task gets the
 *  column range [X1, X2) and row range [Y1, Y2) to process inside the tile:
 *  the range starts at the descriptor offset (A.j % A.nb, A.i % A.mb) in the
 *  first tile column/row and ends at the submatrix edge in the last tile
 *  column/row; interior tiles are covered in full.
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pztile_zero_quark(PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int row, col;

    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (row = 0; row < A.mt; row++) {
        int ld_tile = BLKLDD(A, row);

        for (col = 0; col < A.nt; col++) {
            PLASMA_Complex64_t *tile;
            int col_begin = 0;
            int row_begin = 0;
            int col_end   = A.nb;
            int row_end   = A.mb;

            /* Clip the ranges on the border tiles of the submatrix. */
            if (col == 0) {
                col_begin = A.j%A.nb;
            }
            if (row == 0) {
                row_begin = A.i%A.mb;
            }
            if (col == A.nt-1) {
                col_end = (A.j+A.n-1)%A.nb+1;
            }
            if (row == A.mt-1) {
                row_end = (A.i+A.m-1)%A.mb+1;
            }

            tile = ABDL(row, col);
            /* Argument order expected by the task: X1, X2, Y1, Y2, tile, ld. */
            QUARK_Insert_Task(plasma->quark, CORE_ztile_zero_quark, &task_flags,
                sizeof(int),                       &col_begin, VALUE,
                sizeof(int),                       &col_end,   VALUE,
                sizeof(int),                       &row_begin, VALUE,
                sizeof(int),                       &row_end,   VALUE,
                sizeof(PLASMA_Complex64_t)*A.bsiz, tile,       OUTPUT | LOCALITY,
                sizeof(int),                       &ld_tile,   VALUE,
                0);
        }
    }
}
/***************************************************************************//**
 *  Conversion from LAPACK F77 matrix layout to tile layout - dynamic scheduling
 *
 *  Inserts one QUARK_CORE_zlacpy task per tile of A, copying the
 *  (Y2-Y1)-by-(X2-X1) part of the tile that belongs to the submatrix from
 *  the column-major array Af77 (leading dimension lda) into the tile buffer
 *  (leading dimension ldt = BLKLDD(A, m)).
 *
 *  [X1, X2) / [Y1, Y2) are the column / row ranges inside the tile: they
 *  start at the descriptor offset in the first tile column/row and stop at
 *  the submatrix edge in the last tile column/row.
 *
 *  The offset into each buffer is computed with that buffer's own leading
 *  dimension: lda for the F77 array, ldt for the tile.
 *
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pzlapack_to_tile_quark(PLASMA_Complex64_t *Af77, int lda, PLASMA_desc A,
                                   PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_Complex64_t *f77;
    PLASMA_Complex64_t *bdl;
    plasma_context_t *plasma;
    int X1, Y1;
    int X2, Y2;
    int n, m, ldt;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++)
    {
        ldt = BLKLDD(A, m);
        for (n = 0; n < A.nt; n++)
        {
            X1 = n == 0 ? A.j%A.nb : 0;
            Y1 = m == 0 ? A.i%A.mb : 0;
            X2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
            Y2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;

            f77 = AF77(m, n);
            bdl = ABDL(m, n);
            QUARK_CORE_zlacpy(
                plasma->quark, &task_flags,
                PlasmaUpperLower,
                (Y2-Y1), (X2-X1), A.mb,
                &(f77[X1*lda+Y1]), lda,
                /* Tile columns are ldt apart, not lda. */
                &(bdl[X1*ldt+Y1]), ldt);
        }
    }
}
/***************************************************************************//**
 *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
 *
 *  For every panel k < min(A.mt, A.nt) two phases of tasks are inserted:
 *
 *   1. Subdomain phase: the tile rows from k downwards are visited in
 *      strides of BS.  The head tile M of each subdomain gets a
 *      QUARK_CORE_sgeqrt task plus one QUARK_CORE_sormqr task per trailing
 *      tile column; every other tile row m of the subdomain gets a
 *      QUARK_CORE_stsqrt task against the head plus one QUARK_CORE_stsmqr
 *      task per trailing tile column.  The loop conditions never start a
 *      subdomain on the last tile row (unless it is row k itself); instead
 *      the last tile row is absorbed by the subdomain above it.
 *
 *   2. Reduction phase: for RD = BS, 2*BS, 4*BS, ... (< A.mt-k) the head
 *      tiles M and M+RD are combined with a QUARK_CORE_sttqrt task plus one
 *      QUARK_CORE_sttmqr task per trailing tile column.  These tasks use the
 *      T2 accessor rather than T.
 *
 *  BS is the subdomain size in tile rows.
 *  NOTE(review): the stride loops advance by BS and RD *= 2, so BS is
 *  presumably required to be >= 1 - confirm at the call sites.
 *
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_psgeqrfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, M, RD;
    int ldaM, ldam, ldaMRD;
    int tempkn, tempMm, tempnn, tempmm, tempMRDm;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {
        /* Columns in panel k (the last tile column may be partial). */
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        /* Phase 1: factor each subdomain of (up to) BS tile rows. */
        for (M = k;
             M < A.mt-1 || M == k;  /* No bottom single-row subdomain */
             M += BS) {
            tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb;
            ldaM = BLKLDD(A, M);
            QUARK_CORE_sgeqrt(
                plasma->quark, &task_flags,
                tempMm, tempkn, ib, T.nb,
                A(M, k), ldaM,
                T(M, k), T.mb);

            /* Update the trailing tiles of the subdomain head row. */
            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_sormqr(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaTrans,
                    tempMm, tempnn, tempMm, ib, T.nb,
                    A(M, k), ldaM,
                    T(M, k), T.mb,
                    A(M, n), ldaM);
            }
            /* Remaining tile rows of the subdomain, each paired with the head. */
            for (m = M+1;
                 (m < M+BS && m < A.mt) || m == A.mt-1;  /* Suck in bottom single-row domain */
                 m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_stsqrt(
                    plasma->quark, &task_flags,
                    tempmm, tempkn, ib, T.nb,
                    A(M, k), ldaM,
                    A(m, k), ldam,
                    T(m, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                    QUARK_CORE_stsmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
                        A(M, n), ldaM,
                        A(m, n), ldam,
                        A(m, k), ldam,
                        T(m, k), T.mb);
                }
            }
        }
        /* Phase 2: pairwise reduction of the subdomain heads, doubling the
         * distance RD between partners at every level. */
        for (RD = BS; RD < A.mt-k; RD *= 2) {
            for (M = k;
                 M+RD < A.mt-1;  /* No reduction with bottom single-row subdomain */
                 M += 2*RD) {
                tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                ldaM   = BLKLDD(A, M   );
                ldaMRD = BLKLDD(A, M+RD);
                QUARK_CORE_sttqrt(
                    plasma->quark, &task_flags,
                    tempMRDm, tempkn, ib, T.nb,
                    A (M   , k), ldaM,
                    A (M+RD, k), ldaMRD,
                    T2(M+RD, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                    QUARK_CORE_sttmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempMRDm, tempnn, A.nb, ib, T.nb,
                        A (M,    n), ldaM,
                        A (M+RD, n), ldaMRD,
                        A (M+RD, k), ldaMRD,
                        T2(M+RD, k), T.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile triangular matrix inverse - dynamic scheduling
 *
 *  uplo selects the branch: PlasmaLower walks the diagonal tiles by tile
 *  column n, any other value walks them by tile row m.  For each diagonal
 *  tile the routine inserts, in this order:
 *    - QUARK_CORE_ztrsm tasks (scalar -1) on the tiles beyond the diagonal
 *      in the same tile column (lower) / tile row (upper),
 *    - QUARK_CORE_zgemm tasks (scalars 1, 1) that accumulate into tiles
 *      outside the current tile row and column,
 *    - QUARK_CORE_ztrsm tasks (scalar 1) on the tiles before the diagonal,
 *    - one QUARK_CORE_ztrtri task on the diagonal tile itself, which also
 *      receives sequence, request and the global offset of the tile.
 *
 *  diag is forwarded unchanged to the ztrsm and ztrtri tasks.
 *  Nothing is inserted when the sequence already carries an error status.
 **/
void plasma_pztrtri_quark(PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldam, ldan;
    int tempkn, tempmm, tempnn;

    PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (n = 0; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldan = BLKLDD(A, n);
            /* Tiles below the diagonal tile, solved against A(n, n) with -1. */
            for (m = n+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaNoTrans, diag,
                    tempmm, tempnn, A.mb,
                    mzone, A(n, n), ldan,
                           A(m, n), ldam);
            }
            /* A(m, k) += A(m, n) * A(n, k) for m > n, k < n. */
            for (m = n+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                for (k = 0; k < n; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    QUARK_CORE_zgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaNoTrans,
                        tempmm, tempkn, tempnn, A.mb,
                        zone, A(m, n), ldam,
                              A(n, k), ldan,
                        zone, A(m, k), ldam);
                }
            }
            /* Tiles left of the diagonal tile in tile row n. */
            for (m = 0; m < n; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaNoTrans, diag,
                    tempnn, tempmm, A.mb,
                    zone, A(n, n), ldan,
                          A(n, m), ldan);
            }
            /* Finally the diagonal tile itself. */
            QUARK_CORE_ztrtri(
                plasma->quark, &task_flags,
                uplo, diag,
                tempnn, A.mb,
                A(n, n), ldan,
                sequence, request, A.nb*n);
        }
    }
    /*
     *  PlasmaUpper
     */
    else {
        for (m = 0; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            /* Tiles right of the diagonal tile, solved against A(m, m) with -1. */
            for (n = m+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaNoTrans, diag,
                    tempmm, tempnn, A.mb,
                    mzone, A(m, m), ldam,
                           A(m, n), ldam);
            }
            for (n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                ldan = BLKLDD(A, n);
                /* A(n, k) += A(n, m) * A(m, k) for n < m, k > m. */
                for (k = m+1; k < A.nt; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    QUARK_CORE_zgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaNoTrans,
                        tempnn, tempkn, tempmm, A.mb,
                        zone, A(n, m), ldan,
                              A(m, k), ldam,
                        zone, A(n, k), ldan);
                }
                /* Tile above the diagonal tile in tile column m. */
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaNoTrans, diag,
                    tempnn, tempmm, A.mb,
                    zone, A(m, m), ldam,
                          A(n, m), ldan);
            }
            /* Finally the diagonal tile itself. */
            QUARK_CORE_ztrtri(
                plasma->quark, &task_flags,
                uplo, diag,
                tempmm, A.mb,
                A(m, m), ldam,
                sequence, request, A.mb*m);
        }
    }
}
/** **************************************************************************** * * @ingroup InPlaceTransformation * * plasma_pcgetmi2_quark - realises nprob independant transpositions. Each * subproblem is a tile of mb-by-nb elements. * This function use an extra space of PLASMA_SIZE*(mb*nb). This is a * maximum in case of dynamic scheduling. * * ******************************************************************************* * * @param[in] idep * PlasmaIPT_Nodep: No fake dependencies are added. * PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb. * PlasmaIPT_All: A gatherv is added on the whole matrix. * * @param[in] odep * PlasmaIPT_Nodep: No fake dependencies are added. * PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb. * PlasmaIPT_All: A gatherv is added on the whole matrix. * * @param[in] storev * PlasmaColumnWise: Data stored in column major. * PlasmaRowWise: Data stored in row major. * * @param[in] m * Number of row of A if tiles are sorted in column major format, * number of columns otherwise. * * @param[in] n * Number of columns of A if tiles are sorted in column major format, * number of rows otherwise. * * @param[in] mb * Number of rows in each individual subproblem if storev == PlasmaColumnWise, * number of columns otherwise. m%mb must be 0. * * @param[in] nb * Number of columns in each individual subproblem if storev == PlasmaColumnWise, * number of rows otherwise. n%nb must be 0. * * @param[in,out] A * Matrix of size m*n. * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). 
* ******************************************************************************* * * @sa plasma_pcgetmi2 * ******************************************************************************/ void plasma_pcgetmi2_quark(PLASMA_enum idep, PLASMA_enum odep, PLASMA_enum storev, int m, int n, int mb, int nb, PLASMA_Complex32_t *A, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; PLASMA_Complex32_t *Al, *Ap; int i, j, nprob, mt, nt; int bsiz, psiz, size; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; /* quick return */ if( (mb < 2) || (nb < 2) ) { return ; } QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); bsiz = mb*nb; if ( storev == PlasmaColumnwise ) { psiz = m*nb; mt = ( m / mb ); nt = ( n / nb ); } else { psiz = n*mb; mt = ( n / nb ); nt = ( m / mb ); } size = m*n; switch ( idep ) { /* * Dependencies on each panel as input */ case PlasmaIPT_Panel: switch ( odep ) { case PlasmaIPT_Panel: for (j=0; j<nt; j++) { Ap = A + (psiz*j); for (i=0; i<mt; i++) { Al = Ap + i*bsiz; QUARK_CORE_cgetrip_f1( plasma->quark, &task_flags, mb, nb, Al, bsiz, Ap, psiz, INOUT|GATHERV); } } break; case PlasmaIPT_All: for (j=0; j<nt; j++) { Ap = A + (psiz*j); for (i=0; i<mt; i++) { Al = Ap + i*bsiz; QUARK_CORE_cgetrip_f2(plasma->quark, &task_flags, mb, nb, Al, bsiz, Ap, size, INPUT, A, size, INOUT|GATHERV); } } break; case PlasmaIPT_NoDep: default: for (j=0; j<nt; j++) { Ap = A + (psiz*j); for (i=0; i<mt; i++) { Al = Ap + i*bsiz; QUARK_CORE_cgetrip_f1( plasma->quark, &task_flags, mb, nb, Al, bsiz, Ap, psiz, INPUT); } } } break; /* * Dependency on all the matrix as input */ case PlasmaIPT_All: switch ( odep ) { case PlasmaIPT_Panel: for (j=0; j<nt; j++) { Ap = A + (psiz*j); for (i=0; i<mt; i++) { Al = Ap + i*bsiz; QUARK_CORE_cgetrip_f2( plasma->quark, &task_flags, mb, nb, Al, bsiz, A, size, INPUT, Ap, psiz, INOUT|GATHERV); } } break; case 
PlasmaIPT_All: nprob = mt*nt; for (i=0; i<nprob; i++) { QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags, mb, nb, &(A[ i*bsiz ]), bsiz, A, size, INOUT|GATHERV); } break; case PlasmaIPT_NoDep: default: nprob = mt*nt; for (i=0; i<nprob; i++) { QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags, mb, nb, &(A[ i*bsiz ]), bsiz, A, size, INPUT); } } break; /* * No Dependencies as input */ case PlasmaIPT_NoDep: default: switch ( odep ) { case PlasmaIPT_Panel: for (j=0; j<nt; j++) { Ap = A + (psiz*j); for (i=0; i<mt; i++) { Al = Ap + i*bsiz; QUARK_CORE_cgetrip_f1( plasma->quark, &task_flags, mb, nb, Al, bsiz, Ap, psiz, INOUT|GATHERV); } } break; case PlasmaIPT_All: nprob = mt*nt; for (i=0; i<nprob; i++) { QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags, mb, nb, &(A[ i*bsiz ]), bsiz, A, size, INOUT|GATHERV); } break; case PlasmaIPT_NoDep: default: nprob = mt*nt; for (i=0; i<nprob; i++) { QUARK_CORE_cgetrip(plasma->quark, &task_flags, mb, nb, &(A[ i*bsiz ]), bsiz); } } } }
/***************************************************************************//** * Parallel application of Q using tile V - LQ factorization (reduction * Householder) - dynamic scheduling **/ void plasma_pzunmlqrh_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldaN, ldak; int ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); if (side == PlasmaLeft ) { if (trans == PlasmaNoTrans) { /* * PlasmaLeft / PlasmaNoTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } for (m = N+1; m < min(N+BS, A.nt); m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } else { /* * PlasmaLeft / PlasmaConjTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; ldbN = BLKLDD(B, N ); ldbNRD = BLKLDD(B, N+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, B.nb, tempnn, tempNRDn, tempnn, tempkm, ib, T.nb, B (N, n), ldbN, B (N+RD, n), ldbNRD, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (m = min(N+BS, A.nt)-1; m > N; m--) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, B.mb, tempnn, tempmm, tempnn, tempkm, ib, T.nb, B(N, n), ldbN, B(m, n), ldbm, A(k, m), ldak, T(k, m), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempNn, tempnn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(N, n), ldbN); } } } } } else { if (trans == PlasmaNoTrans) { /* * PlasmaRight / PlasmaNoTrans */ for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); for (n = min(N+BS, A.nt)-1; n > N; n--) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, B(m, N), ldbm); } } } } else { /* * PlasmaRight / PlasmaConjTrans */ for (k = 0; k < K; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); for (N = k; N < A.nt; N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_zunmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldaN, T(k, N), T.mb, B(m, N), ldbm); } for (n = N+1; n < min(N+BS, A.nt); n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, side, trans, tempmm, tempNn, tempmm, tempnn, tempkm, ib, T.nb, B(m, N), ldbm, B(m, n), ldbm, A(k, n), ldak, T(k, n), T.mb); } } } for (RD = BS; RD < A.nt-k; RD *= 2) { for (N = k; N+RD < A.nt; N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? 
B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_zttmlq( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempNRDn, tempkm, ib, T.nb, B (m, N ), ldbm, B (m, N+RD), ldbm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } } } } }
/***************************************************************************//** * Parallel initializztion a 2-D array A to * ALPHA on the offdiagonals. **/ void plasma_pzlaset2_quark(PLASMA_enum uplo, PLASMA_Complex64_t alpha, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int i, j; int ldai, ldaj; int tempim; int tempjm, tempjn; int minmn = min(A.mt, A.nt); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); if (uplo == PlasmaLower) { for (j = 0; j < minmn; j++){ tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb; tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaLower, tempjm, tempjn, alpha, A(j, j), ldaj); for (i = j+1; i < A.mt; i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } } else if (uplo == PlasmaUpper) { for (j = 1; j < A.nt; j++){ tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; for (i = 0; i < min(j, A.mt); i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } for (j = 0; j < minmn; j++){ tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb; tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; ldaj = BLKLDD(A, j); QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpper, tempjm, tempjn, alpha, A(j, j), ldaj); } } else { for (i = 0; i < A.mt; i++){ tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb; ldai = BLKLDD(A, i); for (j = 0; j < A.nt; j++){ tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb; QUARK_CORE_zlaset2( plasma->quark, &task_flags, PlasmaUpperLower, tempim, tempjn, alpha, A(i, j), ldai); } } } }
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
 *
 *  Four cases are handled, selected by side (PlasmaLeft or anything else)
 *  and trans (PlasmaNoTrans or anything else).  For each panel k < minMT the
 *  routine inserts QUARK_CORE_zunmlq tasks on the panel's own tile row
 *  (left) or tile column (right) of B, and QUARK_CORE_ztsmlq tasks on the
 *  tiles beyond it.
 *
 *  Left/NoTrans and Right/ConjTrans walk k upwards and insert zunmlq before
 *  ztsmlq; Left/ConjTrans and Right/NoTrans walk k downwards, visit the
 *  remaining tiles from the far end, and insert ztsmlq before zunmlq.
 *
 *  minM / minMT are the smaller of A's dimensions (A.n, A.nt when
 *  A.m > A.n, otherwise A.m, A.mt).
 *
 *  NOTE(review): tempkmin is computed from A.nb in three cases and from A.mb
 *  in Right/ConjTrans; these agree only for square tiles - confirm intent.
 *  NOTE(review): ldan is assigned in Right/NoTrans but never read.
 *
 *  Nothing is inserted when the sequence already carries an error status.
 *  The `request` argument is not used by this routine.
 **/
void plasma_pzunmlq_quark(PLASMA_enum side, PLASMA_enum trans,
                          PLASMA_desc A, PLASMA_desc B, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldan, ldbk, ldbm;
    int tempmm, tempnn, tempkn, tempkm, tempkmin;
    int ib, minMT, minM;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;

    /* Number of panels is bounded by the smaller dimension of A. */
    if (A.m > A.n) {
        minM  = A.n;
        minMT = A.nt;
    } else {
        minM  = A.m;
        minMT = A.mt;
    }

    if (side == PlasmaLeft ) {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = 0; k < minMT; k++) {
                tempkm   = k == B.mt -1 ? B.m -k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    QUARK_CORE_zunmlq(
                        plasma->quark, &task_flags,
                        side, trans,
                        tempkm, tempnn, tempkmin, ib, T.nb,
                        A(k, k), ldak,
                        T(k, k), T.mb,
                        B(k, n), ldbk);
                }
                for (m = k+1; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_ztsmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
                            B(k, n), ldbk,
                            B(m, n), ldbm,
                            A(k, m), ldak,
                            T(k, m), T.mb);
                    }
                }
            }
        }
        else {
            /*
             *  PlasmaLeft / PlasmaConjTrans
             */
            for (k = minMT-1; k >= 0; k--) {
                tempkm   = k == B.mt -1 ? B.m -k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (m = B.mt-1; m > k; m--) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_ztsmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
                            B(k, n), ldbk,
                            B(m, n), ldbm,
                            A(k, m), ldak,
                            T(k, m), T.mb);
                    }
                }
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    QUARK_CORE_zunmlq(
                        plasma->quark, &task_flags,
                        side, trans,
                        tempkm, tempnn, tempkmin, ib, T.nb,
                        A(k, k), ldak,
                        T(k, k), T.mb,
                        B(k, n), ldbk);
                }
            }
        }
    }
    else {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
            for (k = minMT-1; k >= 0; k--) {
                tempkn   = k == B.nt -1 ? B.n -k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                for (n = B.nt-1; n > k; n--) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        QUARK_CORE_ztsmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
                            B(m, k), ldbm,
                            B(m, n), ldbm,
                            A(k, n), ldak,
                            T(k, n), T.mb);
                    }
                }
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    QUARK_CORE_zunmlq(
                        plasma->quark, &task_flags,
                        side, trans,
                        tempmm, tempkn, tempkmin, ib, T.nb,
                        A(k, k), ldak,
                        T(k, k), T.mb,
                        B(m, k), ldbm);
                }
            }
        }
        else {
            /*
             *  PlasmaRight / PlasmaConjTrans
             */
            for (k = 0; k < minMT; k++) {
                tempkn   = k == B.nt -1 ? B.n -k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    QUARK_CORE_zunmlq(
                        plasma->quark, &task_flags,
                        side, trans,
                        tempmm, tempkn, tempkmin, ib, T.nb,
                        A(k, k), ldak,
                        T(k, k), T.mb,
                        B(m, k), ldbm);
                }
                for (n = k+1; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        QUARK_CORE_ztsmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
                            B(m, k), ldbm,
                            B(m, n), ldbm,
                            A(k, n), ldak,
                            T(k, n), T.mb);
                    }
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
 *
 *  Inserts, in program order, the QUARK tasks that process the tile matrix
 *  described by A one panel at a time (k = 0 .. A.nt-2).  Nothing is computed
 *  in this function itself; it only submits QUARK_CORE_* tasks.
 *
 *  @param[in] uplo
 *          PlasmaLower selects the QR-based branch (zgeqrt / ztsqrt family).
 *          Any other value takes the LQ-based branch (zgelqt / ztslqt family),
 *          which passes PlasmaUpper to zherfb.
 *  @param[in] A
 *          Tile descriptor of the matrix.  Fields mt, nt, m, n, mb and nb are
 *          read here; tiles are addressed through the A(i, j) macro.
 *  @param[in] T
 *          Tile descriptor of the workspace handed to every kernel as
 *          T(i, j) together with T.mb / T.nb.
 *          NOTE(review): presumably the block-reflector factors - confirm.
 *  @param[in] sequence
 *          If sequence->status is not PLASMA_SUCCESS the function returns
 *          without inserting any task.  Otherwise sequence->quark_sequence is
 *          attached to every task through the TASK_SEQUENCE flag.
 *  @param[in] request
 *          Not referenced in this function.
 **/
void plasma_pzherbt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n, i, j;
    int ldak, ldam, ldan, ldaj, ldai;
    int tempkn, tempmm, tempnn, tempjj;
    int ib;

    plasma = plasma_context_self();
    /* A sequence that already failed must not receive new tasks. */
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;

    if (uplo == PlasmaLower) {
        for (k = 0; k < A.nt-1; k++){
            /* Size of tile index k+1: the last tile may be narrower than A.nb. */
            tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
            ldak = BLKLDD(A, k+1);

            /* Factor the sub-diagonal tile A(k+1, k). */
            QUARK_CORE_zgeqrt(
                plasma->quark, &task_flags,
                tempkn, A.nb, ib, T.nb,
                A(k+1, k), ldak,
                T(k+1, k), T.mb);

            /* LEFT and RIGHT on the symmetric diagonal block */
            QUARK_CORE_zherfb(
                plasma->quark, &task_flags,
                PlasmaLower,
                tempkn, tempkn, ib, T.nb,
                A(k+1, k), ldak,
                T(k+1, k), T.mb,
                A(k+1, k+1), ldak);

            /* RIGHT on the remaining tiles until the bottom */
            for (m = k+2; m < A.mt ; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_zunmqr(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaNoTrans,
                    tempmm, A.nb, tempkn, ib, T.nb,
                    A(k+1, k), ldak,
                    T(k+1, k), T.mb,
                    A(m , k+1), ldam);
            }

            /* Couple A(k+1, k) with each tile A(m, k) below it, then update
             * the tiles of the trailing matrix touched by that pair. */
            for (m = k+2; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ztsqrt(
                    plasma->quark, &task_flags,
                    tempmm, A.nb, ib, T.nb,
                    A(k+1, k), ldak,
                    A(m , k), ldam,
                    T(m , k), T.mb);

                /* LEFT */
                for (i = k+2; i < m; i++) {
                    ldai = BLKLDD(A, i);
                    QUARK_CORE_ztsmqr_hetra1(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaConjTrans,
                        A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb,
                        A(i, k+1), ldai,
                        A(m, i), ldam,
                        A(m, k), ldam,
                        T(m, k), T.mb);
                }

                /* RIGHT */
                for (j = m+1; j < A.mt ; j++) {
                    tempjj = j == A.mt-1 ? A.m-j*A.mb : A.mb;
                    ldaj = BLKLDD(A, j);
                    QUARK_CORE_ztsmqr(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaNoTrans,
                        tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb,
                        A(j, k+1), ldaj,
                        A(j, m), ldaj,
                        A(m, k), ldam,
                        T(m, k), T.mb);
                }

                /* LEFT->RIGHT */
                QUARK_CORE_ztsmqr_corner(
                    plasma->quark, &task_flags,
                    A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb,
                    A(k+1, k+1), ldak,
                    A(m , k+1), ldam,
                    A(m , m), ldam,
                    A(m , k), ldam,
                    T(m , k), T.mb);
            }
        }
    }
    else {
        for (k = 0; k < A.nt-1; k++){
            /* Size of tile index k+1: the last tile may be narrower than A.nb. */
            tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
            ldak = BLKLDD(A, k+1);

            /* Factor the super-diagonal tile A(k, k+1); tiles in tile row k
             * are passed with leading dimension A.nb in this branch. */
            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                A.nb, tempkn, ib, T.nb,
                A(k, k+1), A.nb,
                T(k, k+1), T.mb);

            /* RIGHT and LEFT on the symmetric diagonal block */
            QUARK_CORE_zherfb(
                plasma->quark, &task_flags,
                PlasmaUpper,
                tempkn, tempkn, ib, T.nb,
                A(k, k+1), A.nb,
                T(k, k+1), T.mb,
                A(k+1, k+1), ldak);

            /* LEFT on the remaining tiles of tile row k+1 (n = k+2 .. A.nt-1) */
            for (n = k+2; n < A.nt ; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaNoTrans,
                    A.nb, tempnn, tempkn, ib, T.nb,
                    A(k, k+1), A.nb,
                    T(k, k+1), T.mb,
                    A(k+1, n), ldak);
            }

            /* Couple A(k, k+1) with each tile A(k, n) to its right, then
             * update the tiles of the trailing matrix touched by that pair. */
            for (n = k+2; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                ldan = BLKLDD(A, n);
                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    A.nb, tempnn, ib, T.nb,
                    A(k, k+1), A.nb,
                    A(k, n), A.nb,
                    T(k, n), T.mb);

                /* RIGHT */
                for (i = k+2; i < n; i++) {
                    ldai = BLKLDD(A, i);
                    QUARK_CORE_ztsmlq_hetra1(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb,
                        A(k+1, i), ldak,
                        A(i, n), ldai,
                        A(k, n), A.nb,
                        T(k, n), T.mb);
                }

                /* LEFT */
                for (j = n+1; j < A.nt ; j++) {
                    tempjj = j == A.nt-1 ? A.n-j*A.nb : A.nb;
                    /* ldaj is computed here but not passed to the kernel below. */
                    ldaj = BLKLDD(A, j);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaNoTrans,
                        A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb,
                        A(k+1, j), ldak,
                        A(n, j), ldan,
                        A(k, n), A.nb,
                        T(k, n), T.mb);
                }

                /* RIGHT->LEFT */
                QUARK_CORE_ztsmlq_corner(
                    plasma->quark, &task_flags,
                    A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb,
                    A(k+1, k+1), ldak,
                    A(k+1, n), ldak,
                    A(n , n), ldan,
                    A(k , n), A.nb,
                    T(k , n), T.mb);
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 *
 *  For every tile row k < min(A.mt, A.nt) two groups of QUARK tasks are
 *  inserted:
 *    1. a factorization inside each column domain of BS tiles starting at
 *       tile column k (zgelqt / zunmlq on the domain head, ztslqt / ztsmlq
 *       on the other tiles of the domain);
 *    2. a pairwise reduction of the domain heads at distances
 *       RD = BS, 2*BS, 4*BS, ... (zttlqt / zttmlq).
 *  A trailing domain consisting of only the last tile column is never
 *  opened on its own; that column is absorbed by the preceding domain.
 *
 *  @param[in] A         Tile descriptor of the matrix (mt, nt, m, n, mb, nb read).
 *  @param[in] T         Tile descriptor of the workspace, addressed as T(i, j)
 *                       in stage 1 and as T2(i, j) in stage 2.
 *  @param[in] BS        Number of tile columns per domain.
 *                       NOTE(review): the loops step by BS, so BS is assumed
 *                       to be >= 1 - confirm against callers.
 *  @param[in] sequence  Tasks are inserted only while sequence->status is
 *                       PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param[in] request   Not referenced in this function.
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;
    int last_mt, last_nt;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    ib      = PLASMA_IB;
    K       = min(A.mt, A.nt);
    last_mt = A.mt - 1;   /* index of the last tile row    */
    last_nt = A.nt - 1;   /* index of the last tile column */

    for (k = 0; k < K; k++) {
        if (k == last_mt) {
            tempkm = A.m - k*A.mb;
        } else {
            tempkm = A.mb;
        }
        ldak = BLKLDD(A, k);

        /*
         * Stage 1: factorization inside each domain.
         * The first domain (N == k) is always opened; later ones only if
         * they do not start on the last tile column.
         */
        for (N = k; N == k || N < last_nt; N += BS) {
            if (N == last_nt) {
                tempNn = A.n - N*A.nb;
            } else {
                tempNn = A.nb;
            }

            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                tempkm, tempNn, ib, T.nb,
                A(k, N), ldak,
                T(k, N), T.mb);

            for (m = k+1; m < A.mt; m++) {
                if (m == last_mt) {
                    tempmm = A.m - m*A.mb;
                } else {
                    tempmm = A.mb;
                }
                ldam = BLKLDD(A, m);

                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam);
            }

            /* Remaining tiles of the domain, plus the last tile column when
             * it directly follows this domain. */
            for (n = N+1; n == last_nt || (n < N+BS && n < A.nt); n++) {
                if (n == last_nt) {
                    tempnn = A.n - n*A.nb;
                } else {
                    tempnn = A.nb;
                }

                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    tempkm, tempnn, ib, T.nb,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    if (m == last_mt) {
                        tempmm = A.m - m*A.mb;
                    } else {
                        tempmm = A.mb;
                    }
                    ldam = BLKLDD(A, m);

                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
            }
        }

        /*
         * Stage 2: pairwise reduction of the domain heads.
         * A head sitting on the last tile column never takes part.
         */
        for (RD = BS; RD < A.nt-k; RD *= 2) {
            for (N = k; N+RD < last_nt; N += 2*RD) {
                if (N+RD == last_nt) {
                    tempNRDn = A.n - (N+RD)*A.nb;
                } else {
                    tempNRDn = A.nb;
                }

                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    tempkm, tempNRDn, ib, T.nb,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    if (m == last_mt) {
                        tempmm = A.m - m*A.mb;
                    } else {
                        tempmm = A.mb;
                    }
                    ldam = BLKLDD(A, m);

                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, A.mb, ib, T.nb,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }
    }
}
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 *
 *  Handles three situations, chosen from the band width NB = A.mb:
 *    - NB == 0 : D is filled from the diagonal of A, E is zeroed, return.
 *    - NB == 1 : no bulge chasing; D and E are read (and, when COMPLEX is
 *                defined, made real in place) directly from A, return.
 *    - otherwise : QUARK_CORE_ztrdalg_v2 tasks are inserted for every sweep,
 *                the function waits on a barrier, then copies the diagonal
 *                and off-diagonal of A into D and E.
 *
 *  @param[in] uplo
 *          PlasmaLower reads the sub-diagonal A(i+1, i); any other value
 *          reads the super-diagonal A(i, i+1).
 *  @param[in,out] A
 *          Tile descriptor of the band matrix.  Elements are accessed through
 *          the A(i, j) macro; in the NB == 1 COMPLEX path off-diagonal
 *          elements are overwritten.
 *  @param[out] D
 *          Receives N = A.m diagonal values.
 *  @param[out] E
 *          Receives N-1 off-diagonal values.
 *  @param[in] T
 *          Not referenced in this function.
 *  @param[in] sequence
 *          If sequence->status is not PLASMA_SUCCESS the function returns at
 *          once.  Its quark_sequence tags every inserted task.
 *  @param[in] request
 *          Only forwarded to plasma_sequence_flush() in the LAPACK fallback.
 *
 *  NOTE(review): `tblg` and Wtimming() are not declared in this function;
 *  presumably a file-scope timer and a wall-clock helper - confirm.
 **/
void plasma_pzhbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0;
    static double dzero = (double) 0.0;
    PLASMA_Complex64_t ztmp;
    double absztmp;
#endif

    PLASMA_Complex64_t *C, *S;
    int blksweep, lcsweep, blkid, lcNB;
    int N, NB, NT, grsiz, lcgrsiz;
    int i;
    /* NOTE(review): eltsize is not used by name below; presumably consumed
     * by the A(i, j) accessor macro defined elsewhere - confirm. */
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    NT = A.nt;
    N = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }
    /* Zero band width: only the diagonal is copied, E stays zero. */
    if (NB == 0) {
        memset(D, 0, N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i] = cabs(*A(i,i));
#else
        for (i=0; i<N; i++)
            D[i] = *A(i,i);
#endif
        return;
    }

    /*
     * A barrier is used because the bulge chasing has to wait until the
     * reduction to band form has finished.  It could be removed once the
     * task dependencies are linked with those of the reduction to band.
     * Keep in mind the case NB=1, where there is no bulge chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> the matrix already has a single off-diagonal, so no
     * bulge chasing is needed.  The diagonal and off-diagonal elements are
     * stored in D and E.  When COMPLEX is defined, each off-diagonal element
     * is replaced in place by its modulus and its phase is multiplied into
     * the next off-diagonal element.
     * For Q: ZSCAL should be done in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i, i) ); /* diag value */
                if( i < (N-1)) { /* lower off-diag value */
                    ztmp = *A((i+1),i);
                    absztmp = cabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i] = absztmp;
                    /* ztmp becomes the unit phase of the element (1 if it is zero). */
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i,i) ); /* diag value */
                if(i<(N-1)) { /* upper off-diag value */
                    ztmp = *A(i, (i+1));
                    absztmp = cabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i] = absztmp;
                    /* ztmp becomes the unit phase of the element (1 if it is zero). */
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper */
#else
        if( uplo == PlasmaLower ){
            for (i=0; i < N-1; i++) {
                D[i] = *A(i, i);
                E[i] = *A(i+1, i);
            }
            /* i == N-1 here: last diagonal element. */
            D[i] = *A(i, i);
        } else {
            for (i=0; i < N-1; i++) {
                D[i] = *A(i, i );
                E[i] = *A(i, i+1);
            }
            /* i == N-1 here: last diagonal element. */
            D[i] = *A(i, i);
        }
#endif
        return;
    }

    /*
     * LAPACK fallback intended for very small matrices.  The test is
     * N <= 0 and N == 0 has already returned above, so this branch is only
     * entered for a negative N.  The original note says it is to be removed.
     */
    if( N <= 0 ) /* this will be removed, we don't need it. */
    {
        PLASMA_Complex64_t *work, *TTau;
        int info, ldwork = N*N;
        work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
        TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble);

        info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    /* C and S are N-element workspaces handed to every bulge-chasing task. */
    C = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble);
    S = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble);

    /***************************************************************************
     * START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameters.  These should eventually be
     * input or tuned parameters.
     * grsiz = number of consecutive tile blocks handled by one task:
     * 1 when NB > 100, 2 otherwise.
     */
    grsiz = 1;
    if( NB > 160 ) {
        grsiz = 1;
    }
    else if( NB > 100 ) {
        grsiz = 1;
    }
    else {
        grsiz = 2;
    }
    grsiz = max(1, grsiz);

    /* One task per (block sweep, local sweep, group of grsiz blocks). */
    for (blksweep = 0; blksweep<NT; blksweep++){
        /* Number of sweeps in this tile column: the last tile may be narrower. */
        lcNB = blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb;
        for (lcsweep = 0; lcsweep<lcNB; lcsweep++){
            for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){
                /* Shrink the group to the blocks left when blkid is the last block. */
                lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid;
                QUARK_CORE_ztrdalg_v2(
                    plasma->quark, &task_flags,
                    uplo, &A, C, S,
                    lcgrsiz, lcsweep, blkid, blksweep);
            }
        }
    }
    /*
     * Barrier used only for now, to be sure that everything is done
     * before copying D and E and freeing the workspace.  It will be
     * removed later when D and E are filled directly during the bulge
     * process.
     */
    QUARK_Barrier(plasma->quark);
    tblg += Wtimming();
    /* Prints the elapsed bulge-chasing time to stdout on every call. */
    printf(" done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0, N *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /*
     * In the complex case the off-diagonal elements are not necessarily
     * real.  With Householder elimination ZLARFG produces a real value, so
     * every off-diagonal element except the last one is already real and
     * only the last one needs its modulus taken.
     */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
        for (i=0; i < N-1 ; i++)
        {
            D[i] = creal( *A(i,i) );
            /* Real part for all off-diagonals but the last; modulus for the last. */
            if(i<(N-2))
                E[i] = creal(*A(i+1, i));
            else
                E[i] = cabs( *A(i+1, i));
        }
        /* i == N-1 here: last diagonal element. */
        D[i] = creal( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i] = creal( *A(i,i) );
            /* Real part for all off-diagonals but the last; modulus for the last. */
            if( i < (N-2) )
                E[i] = creal(*A(i, (i+1)));
            else
                E[i] = cabs(*A(i, (i+1)));
        }
        /* i == N-1 here: last diagonal element. */
        D[i] = creal( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
/***************************************************************************//** * Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling **/ void plasma_pdsymm_quark(PLASMA_enum side, PLASMA_enum uplo, double alpha, PLASMA_desc A, PLASMA_desc B, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int lda, ldak, ldb, ldc; int tempmm, tempnn, tempkn, tempkm; double zbeta; double zone = (double)1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (m = 0; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldc = BLKLDD(C, m); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; /* * PlasmaLeft / PlasmaLower */ if (side == PlasmaLeft) { lda = BLKLDD(A, m); if (uplo == PlasmaLower) { for (k = 0; k < C.mt; k++) { tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb; ldak = BLKLDD(A, k); ldb = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; if (k < m) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(m, k), lda, /* lda * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == m) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } /* * PlasmaLeft / PlasmaUpper */ else { for (k = 0; k < C.mt; k++) { tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb; ldak = BLKLDD(A, k); ldb = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* ldak * X */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == m) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(m, k), lda, /* lda * K */ B(k, n), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } } /* * PlasmaRight / PlasmaLower */ else { lda = BLKLDD(A, n); ldb = BLKLDD(B, m); if (uplo == PlasmaLower) { for (k = 0; k < C.nt; k++) { tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb; ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; if (k < n) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(n, k), lda, /* lda * K */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == n) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * Y */ B(m, k), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(k, n), ldak, /* ldak * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } /* * PlasmaRight / PlasmaUpper */ else { for (k = 0; k < C.nt; k++) { tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb; ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaNoTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(k, n), ldak, /* ldak * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { if (k == n) { QUARK_CORE_dsymm( plasma->quark, &task_flags, side, uplo, tempmm, tempnn, A.mb, alpha, A(k, k), ldak, /* ldak * Y */ B(m, k), ldb, /* ldb * Y */ zbeta, C(m, n), ldc); /* ldc * Y */ } else { QUARK_CORE_dgemm( plasma->quark, &task_flags, PlasmaNoTrans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldb, /* ldb * K */ A(n, k), lda, /* lda * K */ zbeta, C(m, n), ldc); /* ldc * Y */ } } } } } } } }
/***************************************************************************//**
 *  Parallel UU' or L'L operation - dynamic scheduling
 *
 *  Walks the diagonal tiles A(m, m) in increasing m.  For each m the tasks
 *  are inserted in this order:
 *    1. for every n < m: a cherk update of A(n, n) followed by the cgemm
 *       updates of the tiles between n and m;
 *    2. for every n < m: a ctrmm that multiplies the off-diagonal tile by
 *       the diagonal tile A(m, m);
 *    3. clauum on A(m, m) itself.
 *
 *  @param[in] uplo   PlasmaLower works on tiles A(m, n), n <= m; any other
 *                    value works on the mirrored tiles A(n, m).
 *  @param[in,out] A  Tile descriptor of the triangular matrix.
 *  @param[in] sequence  Tasks are inserted only while sequence->status is
 *                    PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param[in] request   Not referenced in this function.
 **/
void plasma_pclauum_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldam;
    int tempkm, tempmm, tempnn;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        if (m == A.mt-1) {
            tempmm = A.m - m*A.mb;
        } else {
            tempmm = A.mb;
        }
        ldam = BLKLDD(A, m);

        if (uplo == PlasmaLower) {
            /*
             *  PlasmaLower
             */
            for (n = 0; n < m; n++) {
                if (n == A.nt-1) {
                    tempnn = A.n - n*A.nb;
                } else {
                    tempnn = A.nb;
                }

                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaConjTrans,
                    tempnn, tempmm, A.mb,
                    1.0, A(m, n), ldam,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    if (k == A.mt-1) {
                        tempkm = A.m - k*A.mb;
                    } else {
                        tempkm = A.mb;
                    }

                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaConjTrans, PlasmaNoTrans,
                        tempkm, tempnn, tempmm, A.mb,
                        zone, A(m, k), ldam,
                              A(m, n), ldam,
                        zone, A(k, n), A.mb);
                }
            }

            for (n = 0; n < m; n++) {
                if (n == A.nt-1) {
                    tempnn = A.n - n*A.nb;
                } else {
                    tempnn = A.nb;
                }

                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    tempmm, tempnn, A.mb,
                    zone, A(m, m), ldam,
                          A(m, n), ldam);
            }
        }
        else {
            /*
             *  PlasmaUpper
             */
            for (n = 0; n < m; n++) {
                if (n == A.nt-1) {
                    tempnn = A.n - n*A.nb;
                } else {
                    tempnn = A.nb;
                }

                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaNoTrans,
                    tempnn, tempmm, A.mb,
                    1.0, A(n, m), A.mb,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    if (k == A.mt-1) {
                        tempkm = A.m - k*A.mb;
                    } else {
                        tempkm = A.mb;
                    }

                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaConjTrans,
                        tempnn, tempkm, tempmm, A.mb,
                        zone, A(n, m), A.mb,
                              A(k, m), A.mb,
                        zone, A(n, k), A.mb);
                }
            }

            for (n = 0; n < m; n++) {
                if (n == A.nt-1) {
                    tempnn = A.n - n*A.nb;
                } else {
                    tempnn = A.nb;
                }

                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    tempnn, tempmm, A.mb,
                    zone, A(m, m), ldam,
                          A(n, m), A.mb);
            }
        }

        /* The diagonal tile is processed last, identically for both triangles. */
        QUARK_CORE_clauum(
            plasma->quark, &task_flags,
            uplo, tempmm, A.mb,
            A(m, m), ldam);
    }
}
/***************************************************************************//**
 *  Parallel tile LQ factorization - dynamic scheduling
 *
 *  For each step k < min(A.mt, A.nt) the following tasks are inserted:
 *    - cgelqt on the diagonal tile A(k, k);
 *    - cunmlq applying it to every tile A(m, k) below;
 *    - for every tile A(k, n) to the right: ctslqt coupling it with A(k, k),
 *      then ctsmlq updating the tile pairs (A(m, k), A(m, n)) below.
 *
 *  @param[in,out] A     Tile descriptor of the matrix (mt, nt, m, n, mb, nb read).
 *  @param[in] T         Tile descriptor of the workspace passed as T(i, j)
 *                       with T.mb / T.nb.
 *  @param[in] sequence  Tasks are inserted only while sequence->status is
 *                       PLASMA_SUCCESS; its quark_sequence tags every task.
 *  @param[in] request   Not referenced in this function.
 **/
void plasma_pcgelqf_quark(PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K  = min(A.mt, A.nt);   /* number of factorization steps */

    for (k = 0; k < K; k++) {
        if (k == A.mt-1) {
            tempkm = A.m - k*A.mb;
        } else {
            tempkm = A.mb;
        }
        if (k == A.nt-1) {
            tempkn = A.n - k*A.nb;
        } else {
            tempkn = A.nb;
        }
        ldak = BLKLDD(A, k);

        /* Factor the diagonal tile. */
        QUARK_CORE_cgelqt(
            plasma->quark, &task_flags,
            tempkm, tempkn, ib, T.nb,
            A(k, k), ldak,
            T(k, k), T.mb);

        /* Apply it to the tiles below the diagonal tile. */
        for (m = k+1; m < A.mt; m++) {
            if (m == A.mt-1) {
                tempmm = A.m - m*A.mb;
            } else {
                tempmm = A.mb;
            }
            ldam = BLKLDD(A, m);

            QUARK_CORE_cunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaConjTrans,
                tempmm, tempkn, tempkn, ib, T.nb,
                A(k, k), ldak,
                T(k, k), T.mb,
                A(m, k), ldam);
        }

        /* Sweep the tiles to the right of the diagonal tile. */
        for (n = k+1; n < A.nt; n++) {
            if (n == A.nt-1) {
                tempnn = A.n - n*A.nb;
            } else {
                tempnn = A.nb;
            }

            QUARK_CORE_ctslqt(
                plasma->quark, &task_flags,
                tempkm, tempnn, ib, T.nb,
                A(k, k), ldak,
                A(k, n), ldak,
                T(k, n), T.mb);

            for (m = k+1; m < A.mt; m++) {
                if (m == A.mt-1) {
                    tempmm = A.m - m*A.mb;
                } else {
                    tempmm = A.mb;
                }
                ldam = BLKLDD(A, m);

                QUARK_CORE_ctsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                    A(m, k), ldam,
                    A(m, n), ldam,
                    A(k, n), ldak,
                    T(k, n), T.mb);
            }
        }
    }
}
/***************************************************************************//** * Parallel construction of Q using tile V (application to identity; * reduction Householder) - dynamic scheduling **/ void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, N, RD, lastRD; int ldak; int ldqm; int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); for (k = K-1; k >= 0; k--) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); lastRD = 0; for (RD = BS; RD < A.nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (N = k; N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */ N += 2*RD) { tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m ); QUARK_CORE_zttmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempNRDn, tempkm, ib, T.nb, Q (m, N ), ldqm, Q (m, N+RD), ldqm, A (k, N+RD), ldak, T2(k, N+RD), T.mb); } } } for (N = k; N < A.nt-1 || N == k; /* No rightmost single-column */ N += BS) { tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb; tempkmin = min(tempkm, tempNn); for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */ n >= N+1; n--) { tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb; for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? 
Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_ztsmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, Q.nb, tempmm, tempnn, tempkm, ib, T.nb, Q(m, N), ldqm, Q(m, n), ldqm, A(k, n), ldak, T(k, n), T.mb); } } for (m = 0; m < Q.mt; m++) { tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb; ldqm = BLKLDD(Q, m); QUARK_CORE_zunmlq( plasma->quark, &task_flags, PlasmaRight, PlasmaNoTrans, tempmm, tempNn, tempkmin, ib, T.nb, A(k, N), ldak, T(k, N), T.mb, Q(m, N), ldqm); } } } }
/***************************************************************************//** * Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler **/ void plasma_pdsbrdt_quark(PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; #ifdef COMPLEX static double zone = (double) 1.0; static double dzero = (double) 0.0; double ztmp; double absztmp; #endif double *C, *S; int N, NB, INgrsiz, INthgrsiz, BAND; int myid, grsiz, shift=3, stt, st, ed, stind, edind; int blklastind, colpt, PCOL, ACOL, MCOL; int stepercol, mylastid, grnb, grid; int *DEP,*MAXID; int i, j, m; int thgrsiz, thgrnb, thgrid, thed; size_t eltsize = plasma_element_size(A.dtyp); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); N = A.m; NB = A.mb; /* Quick return */ if (N == 0){ return; } if (NB == 0) { memset(D, 0, N*sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX for (i=0; i<N; i++) D[i] = fabs(*A(i,i)); #else for (i=0; i<N; i++) D[i] = *A(i,i); #endif return; } /* * Barrier is used because the bulge have to wait until * the reduction to band has been finish. * otherwise, I can remove this BARRIER when I integrate * the function dependencies link inside the reduction to * band. Keep in min the case when NB=1, where no bulge-chasing. */ /***************************************************************/ QUARK_Barrier(plasma->quark); tblg = -Wtimming(); /***************************************************************/ /* * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge. * Make diagonal and superdiagonal elements real, storing them in * D and E. 
if PlasmaLower, first transform lower bidiagonal form * to upper bidiagonal by applying plane rotations/ Householder * from the left, overwriting superdiagonal elements then make * elements real of the resulting upper Bidiagonal. if PlasmaUpper * then make its elements real. For Q, PT: ZSCAL should be done * in case of WANTQ. */ if (NB == 1){ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i<N; i++) { D[i] = ( *A(i, i) ); /* diag value */ if( i < (N-1)) { /* lower off-diag value */ ztmp = *A((i+1),i); absztmp = fabs(ztmp); *A((i+1),i) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ */ } } } else { /* PlasmaUpper */ for (i=0; i<N; i++) { D[i] = ( *A(i,i) ); /* diag value*/ if(i<(N-1)) { /* lower off-diag value */ ztmp = *A(i, (i+1)); absztmp = fabs(ztmp); *A(i,(i+1)) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */ } } } /* end PlasmaUpper*/ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif return; } /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */ if( N <= 0 ) /* this will be removed we don t need it. 
*/ { double *work, *TTau; int info, ldwork = N*N; work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble); TTau = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, A(0,0), A.lm, D, E, TTau, work, ldwork); plasma_shared_free(plasma, (void*) work); plasma_shared_free(plasma, (void*) TTau); if( info == 0 ) sequence->status = PLASMA_SUCCESS; else plasma_sequence_flush(plasma->quark, sequence, request, info); return; } /* General case NB > 1 && N > NB */ DEP = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); MAXID = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); C = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); S = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); memset(MAXID,0,(N+1)*sizeof(int)); /*************************************************************************** * START BULGE CHASING CODE **************************************************************************/ /* * Initialisation of local parameter. those parameter should be * input or tuned parameter. */ INgrsiz = 1; if( NB > 160 ) { INgrsiz = 2; } else if( NB > 100 ) { if( N < 5000 ) INgrsiz = 2; else INgrsiz = 4; } else { INgrsiz = 6; } INthgrsiz = N; BAND = 0; grsiz = INgrsiz; thgrsiz = INthgrsiz; if( grsiz == 0 ) grsiz = 6; if( thgrsiz == 0 ) thgrsiz = N; i = shift/grsiz; stepercol = i*grsiz == shift ? i:i+1; i = (N-2)/thgrsiz; thgrnb = i*thgrsiz == (N-2) ? i:i+1; for (thgrid = 1; thgrid<=thgrnb; thgrid++){ stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (N-2)); for (i = stt; i <= N-2; i++){ ed=min(i,thed); if(stt>ed)break; for (m = 1; m <=stepercol; m++){ st=stt; for (j = st; j <=ed; j++){ /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */ /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). 
(it is 0(NULL) for myid=1) */ /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */ myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1; mylastid = myid+grsiz-1; PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group */ MAXID[j] = myid; PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/ grnb = PCOL/grsiz; grid = grnb*grsiz == PCOL ? grnb:grnb+1; PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */ ACOL = myid-grsiz; if(myid==1)ACOL=0; MCOL = myid; QUARK_CORE_dtrdalg( plasma->quark, &task_flags, uplo, N, NB, &A, C, S, i, j, m, grsiz, BAND, DEP(PCOL), DEP(ACOL), DEP(MCOL) ); if(mylastid%2 ==0){ blklastind = (mylastid/2)*NB+1+j-1; }else{ colpt = ((mylastid+1)/2)*NB + 1 +j -1 ; stind = colpt-NB+1; edind = min(colpt,N); if( (stind>=edind-1) && (edind==N) ) blklastind=N; else blklastind=0; } if(blklastind >= (N-1)) stt=stt+1; } /* END for j=st:ed */ } /* END for m=1:stepercol */ } /* END for i=1:MINMN-2 */ } /* END for thgrid=1:thgrnb */ /* * Barrier used only for now, to be sure that everything * is done before copying the D and E and free workspace. * this will be removed later when D and E are directly filled * during the bulge process. */ QUARK_Barrier(plasma->quark); tblg += Wtimming(); //printf(" done with bulge %lf \n\n\n",tblg); plasma_shared_free(plasma, (void*) DEP); plasma_shared_free(plasma, (void*) MAXID); plasma_shared_free(plasma, (void*) C); plasma_shared_free(plasma, (void*) S); /* * STORE THE RESULTING diagonal/off-diagonal in D AND E */ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In complex case, the off diagonal element are * not necessary real. 
we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i < N-1 ; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if(i<(N-2)) E[i] = (*A(i+1, i)); else E[i] = fabs( *A(i+1, i)); } D[i] = ( *A(i, i) ); } else { /* PlasmaUpper */ for (i=0; i<N-1; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if( i < (N-2) ) E[i] = (*A(i, (i+1))); else E[i] = fabs(*A(i, (i+1))); } D[i] = ( *A(i, i) ); } /* end PlasmaUpper */ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif } /* END FUNCTION */
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - dynamic scheduling
 *
 *  For each step k the following tasks are inserted:
 *    - spotrf on the diagonal tile A(k, k);
 *    - strsm on every tile of the panel (column k below the diagonal for
 *      PlasmaLower, row k right of the diagonal otherwise);
 *    - for every remaining diagonal tile A(m, m): ssyrk with the panel tile,
 *      followed by sgemm updates of the off-diagonal tiles between k and m.
 *
 *  @param[in] uplo   PlasmaLower works on the lower triangle and uses the
 *                    row tiling (A.mt, A.m, A.mb); any other value works on
 *                    the upper triangle and uses the column tiling
 *                    (A.nt, A.n, A.nb).
 *  @param[in,out] A  Tile descriptor of the matrix.
 *  @param[in] sequence  Tasks are inserted only while sequence->status is
 *                    PLASMA_SUCCESS; its quark_sequence tags every task.
 *                    Also forwarded to QUARK_CORE_spotrf.
 *  @param[in] request   Forwarded to QUARK_CORE_spotrf together with the
 *                    offset A.nb*k.
 *                    NOTE(review): presumably used to report a failing
 *                    leading minor - confirm.
 **/
void plasma_pspotrf_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempmm;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS) {
        return;
    }
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE,
                        (intptr_t)sequence->quark_sequence);

    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (k = 0; k < A.mt; k++) {
            if (k == A.mt-1) {
                tempkm = A.m - k*A.mb;
            } else {
                tempkm = A.mb;
            }
            ldak = BLKLDD(A, k);

            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaLower, tempkm, A.mb,
                A(k, k), ldak,
                sequence, request, A.nb*k);

            /* Panel: solve against the freshly factored diagonal tile. */
            for (m = k+1; m < A.mt; m++) {
                if (m == A.mt-1) {
                    tempmm = A.m - m*A.mb;
                } else {
                    tempmm = A.mb;
                }
                ldam = BLKLDD(A, m);

                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                    tempmm, A.mb, A.mb,
                    zone, A(k, k), ldak,
                          A(m, k), ldam);
            }

            /* Trailing matrix update. */
            for (m = k+1; m < A.mt; m++) {
                if (m == A.mt-1) {
                    tempmm = A.m - m*A.mb;
                } else {
                    tempmm = A.mb;
                }
                ldam = BLKLDD(A, m);

                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaLower, PlasmaNoTrans,
                    tempmm, A.mb, A.mb,
                    -1.0, A(m, k), ldam,
                     1.0, A(m, m), ldam);

                for (n = k+1; n < m; n++) {
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaTrans,
                        tempmm, A.mb, A.mb, A.mb,
                        mzone, A(m, k), ldam,
                               A(n, k), A.mb,
                         zone, A(m, n), ldam);
                }
            }
        }
        return;
    }

    /*
     *  PlasmaUpper
     */
    for (k = 0; k < A.nt; k++) {
        if (k == A.nt-1) {
            tempkm = A.n - k*A.nb;
        } else {
            tempkm = A.nb;
        }
        ldak = BLKLDD(A, k);

        QUARK_CORE_spotrf(
            plasma->quark, &task_flags,
            PlasmaUpper, tempkm, A.mb,
            A(k, k), ldak,
            sequence, request, A.nb*k);

        /* Panel: solve against the freshly factored diagonal tile. */
        for (m = k+1; m < A.nt; m++) {
            if (m == A.nt-1) {
                tempmm = A.n - m*A.nb;
            } else {
                tempmm = A.nb;
            }

            QUARK_CORE_strsm(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                A.nb, tempmm, A.mb,
                zone, A(k, k), ldak,
                      A(k, m), ldak);
        }

        /* Trailing matrix update. */
        for (m = k+1; m < A.nt; m++) {
            if (m == A.nt-1) {
                tempmm = A.n - m*A.nb;
            } else {
                tempmm = A.nb;
            }
            ldam = BLKLDD(A, m);

            QUARK_CORE_ssyrk(
                plasma->quark, &task_flags,
                PlasmaUpper, PlasmaTrans,
                tempmm, A.mb, A.mb,
                -1.0, A(k, m), ldak,
                 1.0, A(m, m), ldam);

            for (n = k+1; n < m; n++) {
                QUARK_CORE_sgemm(
                    plasma->quark, &task_flags,
                    PlasmaTrans, PlasmaNoTrans,
                    A.mb, tempmm, A.mb, A.mb,
                    mzone, A(k, n), ldak,
                           A(k, m), ldak,
                     zone, A(n, m), A.mb);
            }
        }
    }
}
/***************************************************************************//** * Parallel application of Q using tile V - QR factorization (reduction Householder) * - dynamic scheduling **/ void plasma_pcunmqrrh_quark(PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int k, m, n; int K, M, RD, lastRD; int ldaM, ldam, ldan, ldaMRD; int ldbM, ldbm, ldbMRD; int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin; int ib; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); ib = PLASMA_IB; K = min(A.mt, A.nt); if (side == PlasmaLeft ) { if (trans == PlasmaConjTrans) { /* * PlasmaLeft / PlasmaConjTrans */ for (k = 0; k < K; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; for (M = k; M < A.mt-1 || M == k; /* No bottom single-row subdomain */ M += BS) { tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb; tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); ldbM = BLKLDD(B, M); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempMm, tempnn, tempkmin, ib, T.nb, A(M, k), ldaM, T(M, k), T.mb, B(M, n), ldbM); } for (m = M+1; (m < M+BS && m < A.mt) || m == A.mt-1; /* Suck in bottom single-row domain */ m++) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldbm = BLKLDD(B, m); ldam = BLKLDD(A, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, A.nb, tempnn, tempmm, tempnn, tempkn, ib, T.nb, B(M, n), ldbM, B(m, n), ldbm, A(m, k), ldam, T(m, k), T.mb); } } } for (RD = BS; RD < A.mt-k; RD *= 2) { for (M = k; M+RD < A.mt-1; /* No reduction with bottom single-row subdomain */ M += 2*RD) { tempMRDm = M+RD == A.mt-1 ? 
A.m-(M+RD)*A.mb : A.mb; ldbM = BLKLDD(B, M ); ldbMRD = BLKLDD(B, M+RD); ldaMRD = BLKLDD(A, M+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_cttmqr( plasma->quark, &task_flags, side, trans, A.nb, tempnn, tempMRDm, tempnn, tempkn, ib, T.nb, B (M, n), ldbM, B (M+RD, n), ldbMRD, A (M+RD, k), ldaMRD, T2(M+RD, k), T.mb); } } } } } else { /* * PlasmaLeft / PlasmaNoTrans */ for (k = K-1; k >= 0; k--) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; lastRD = 0; for (RD = BS; RD < A.mt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (M = k; M+RD < A.mt-1; /* No reduction with bottom single-row subdomain */ M += 2*RD) { tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb; ldbM = BLKLDD(B, M ); ldbMRD = BLKLDD(B, M+RD); ldaMRD = BLKLDD(A, M+RD); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_cttmqr( plasma->quark, &task_flags, side, trans, A.nb, tempnn, tempMRDm, tempnn, tempkn, ib, T.nb, B (M, n), ldbM, B (M+RD, n), ldbMRD, A (M+RD, k), ldaMRD, T2(M+RD, k), T.mb); } } } for (M = k; M < A.mt-1 || M == k; /* No bottom single-row subdomain */ M += BS) { tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb; tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); ldbM = BLKLDD(B, M); for (m = M+BS-1 == A.mt-2 ? A.mt-1 : min(M+BS-1, A.mt-1); /* Suck in bottom single-row domain */ m >= M+1; m--) { tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldbm = BLKLDD(B, m); ldam = BLKLDD(A, m); for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, A.nb, tempnn, tempmm, tempnn, tempkn, ib, T.nb, B(M, n), ldbM, B(m, n), ldbm, A(m, k), ldam, T(m, k), T.mb); } } for (n = 0; n < B.nt; n++) { tempnn = n == B.nt-1 ? 
B.n-n*B.nb : B.nb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempMm, tempnn, tempkmin, ib, T.nb, A(M, k), ldaM, T(M, k), T.mb, B(M, n), ldbM); } } } } } else { if (trans == PlasmaConjTrans) { /* * PlasmaRight / PlasmaConjTrans */ for (k = K-1; k >= 0; k--) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; lastRD = 0; for (RD = BS; RD < A.mt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { for (M = k; M+RD < A.mt-1; /* No reduction with bottom single-row subdomain */ M += 2*RD) { tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb; ldaMRD = BLKLDD(A, M+RD); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_cttmqr( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempMRDm, tempkn, ib, T.nb, B (m, M), ldbm, B (m, M+RD), ldbm, A (M+RD, k), ldaMRD, T2(M+RD, k), T.mb); } } } for (M = k; M < A.mt-1 || M == k; /* No bottom single-row subdomain */ M += BS) { tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb; tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); ldbM = BLKLDD(B, M); for (n = M+BS-1 == A.mt-2 ? A.mt-1 : min(M+BS-1, A.mt-1); /* Suck in bottom single-row domain */ n >= M+1; n--) { ldan = BLKLDD(A, n); tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, tempmm, tempMm, tempmm, tempnn, tempkn, ib, T.nb, B(m, M), ldbm, B(m, n), ldbm, A(n, k), ldan, T(n, k), T.mb); } } for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempmm, tempMm, tempkmin, ib, T.nb, A(M, k), ldaM, T(M, k), T.mb, B(m, M), ldbm); } } } } else { /* * PlasmaRight / PlasmaNoTrans */ for (k = 0; k < K; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; for (M = k; M < A.mt-1 || M == k; /* No bottom single-row subdomain */ M += BS) { tempMm = M == A.mt-1 ? 
A.m-M*A.mb : A.mb; tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); for (m = 0; m < B.mt; m++) { ldbm = BLKLDD(B, m); tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; QUARK_CORE_cunmqr( plasma->quark, &task_flags, side, trans, tempmm, tempMm, tempkmin, ib, T.nb, A(M, k), ldaM, T(M, k), T.mb, B(m, M), ldbm); } for (n = M+1; (n < M+BS && n < A.mt) || n == A.mt-1; /* Suck in bottom single-row domain */ n++) { tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb; ldan = BLKLDD(A, n); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_ctsmqr( plasma->quark, &task_flags, side, trans, tempmm, tempMm, tempmm, tempnn, tempkn, ib, T.nb, B(m, M), ldbm, B(m, n), ldbm, A(n, k), ldan, T(n, k), T.mb); } } } for (RD = BS; RD < A.mt-k; RD *= 2) { for (M = k; M+RD < A.mt-1; /* No reduction with bottom single-row subdomain */ M += 2*RD) { tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb; ldaMRD = BLKLDD(A, M+RD); for (m = 0; m < B.mt; m++) { tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb; ldbm = BLKLDD(B, m); QUARK_CORE_cttmqr( plasma->quark, &task_flags, side, trans, tempmm, B.nb, tempmm, tempMRDm, tempkn, ib, T.nb, B (m, M ), ldbm, B (m, M+RD), ldbm, A (M+RD, k), ldaMRD, T2(M+RD, k), T.mb); } } } } } } }
/***************************************************************************//** * Parallel tile Hermitian rank-k update - dynamic scheduling **/ void plasma_pcsyr2k_quark(PLASMA_enum uplo, PLASMA_enum trans, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int m, n, k; int ldak, ldam, ldan, ldcm, ldcn; int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0; PLASMA_Complex32_t zbeta; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; ldan = BLKLDD(A, n); ldbn = BLKLDD(B, n); ldcn = BLKLDD(C, n); /* * PlasmaNoTrans */ if (trans == PlasmaNoTrans) { for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_csyr2k( plasma->quark, &task_flags, uplo, trans, tempnn, tempkn, A.mb, alpha, A(n, k), ldan, /* ldan * K */ B(n, k), ldbn, zbeta, C(n, n), ldcn); /* ldc * N */ } /* * PlasmaNoTrans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); ldcm = BLKLDD(C, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? 
beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* ldam * K */ B(n, k), ldbn, /* ldan * K */ zbeta, C(m, n), ldcm); /* ldc * N */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempmm, tempnn, tempkn, A.mb, alpha, B(m, k), ldbm, /* ldam * K */ A(n, k), ldan, /* ldan * K */ zone, C(m, n), ldcm); /* ldc * N */ } } } /* * PlasmaNoTrans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempnn, tempmm, tempkn, A.mb, alpha, A(n, k), ldan, /* ldan * K */ B(m, k), ldbm, /* ldam * M */ zbeta, C(n, m), ldcn); /* ldc * M */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaTrans, tempnn, tempmm, tempkn, A.mb, alpha, B(n, k), ldan, /* ldan * K */ A(m, k), ldam, /* ldam * M */ zone, C(n, m), ldcn); /* ldc * M */ } } } } /* * Plasma[Conj]Trans */ else { for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_csyr2k( plasma->quark, &task_flags, uplo, trans, tempnn, tempkm, A.mb, alpha, A(k, n), ldak, /* lda * N */ B(k, n), ldbk, zbeta, C(n, n), ldcn); /* ldc * N */ } /* * Plasma[Conj]Trans / PlasmaLower */ if (uplo == PlasmaLower) { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldcm = BLKLDD(C, m); for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * M */ B(k, n), ldbk, /* lda * N */ zbeta, C(m, n), ldcm); /* ldc * N */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempmm, tempnn, tempkm, A.mb, alpha, B(k, m), ldbk, /* lda * M */ A(k, n), ldak, /* lda * N */ zone, C(m, n), ldcm); /* ldc * N */ } } } /* * Plasma[Conj]Trans / PlasmaUpper */ else { for (m = n+1; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempnn, tempmm, tempkm, A.mb, alpha, A(k, n), ldak, /* lda * K */ B(k, m), ldbk, /* lda * M */ zbeta, C(n, m), ldcn); /* ldc * M */ QUARK_CORE_cgemm( plasma->quark, &task_flags, trans, PlasmaNoTrans, tempnn, tempmm, tempkm, A.mb, alpha, B(k, n), ldbk, /* lda * K */ A(k, m), ldak, /* lda * M */ zone, C(n, m), ldcn); /* ldc * M */ } } } } } }
/*
 * Time one QUARK_CORE_zgetrf_reclap factorization of a random m-by-n complex
 * matrix (m = iparam[TIMING_N], n = iparam[TIMING_NRHS]) and, when
 * iparam[TIMING_CHECK] is set, compare the result with LAPACKE_zgetrf_work.
 *
 * iparam  in:  timing parameters (sizes, thread count, tile sizes, check flag).
 * dparam  out: when checking, receives the max norms of the PLASMA result
 *              (TIMING_ANORM), of the LAPACK result (TIMING_XNORM) and of
 *              their difference (TIMING_RES); TIMING_BNORM is set to 0.
 * t_      out: elapsed time measured with cWtime() around the factorization.
 *
 * Returns 0 on success, -1 if any buffer could not be allocated.  PLASMA is
 * finalized and every buffer is released on both paths.
 */
static int RunTest(int *iparam, double *dparam, real_Double_t *t_)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    PLASMA_Complex64_t *A = NULL, *A2 = NULL;
    real_Double_t       t;
    int                *ipiv = NULL, *ipiv2 = NULL;
    double             *work = NULL;
    int i;
    int m     = iparam[TIMING_N];
    int n     = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda   = m;
    /* An m-by-n LU factorization defines only min(m, n) pivots. */
    int npiv  = m < n ? m : n;
    /* Element count in size_t so that lda*n cannot overflow an int. */
    size_t nelem = (size_t)lda * (size_t)n;
    int status = 0;
    PLASMA_sequence *sequence = NULL;
    PLASMA_request request = PLASMA_REQUEST_INITIALIZER;

    /* Initialize Plasma */
    PLASMA_Init( iparam[TIMING_THRDNBR] );

    PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );

    PLASMA_Disable(PLASMA_AUTOTUNING);
    PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
    PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );

    /* Allocate every buffer up front so one check covers them all. */
    A    = malloc(nelem * sizeof *A);
    ipiv = malloc((size_t)n * sizeof *ipiv);
    if ( check ) {
        A2    = malloc(nelem * sizeof *A2);
        ipiv2 = malloc((size_t)n * sizeof *ipiv2);
        work  = malloc((size_t)max(m,n) * sizeof *work);
    }

    /* Check if unable to allocate memory */
    if ( !A || !ipiv || (check && (!A2 || !ipiv2 || !work)) ) {
        printf("Out of Memory \n ");
        status = -1;
        goto cleanup;
    }

    /* Initialize Data */
    LAPACKE_zlarnv_work(1, ISEED, lda*n, A);

    /* Save A in lapack layout for check, and factor the copy with LAPACK */
    if ( check ) {
        LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', m, n, A, lda, A2, lda);
        LAPACKE_zgetrf_work(LAPACK_COL_MAJOR, m, n, A2, lda, ipiv2 );
    }

    plasma = plasma_context_self();
    PLASMA_Sequence_Create(&sequence);
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    QUARK_Task_Flag_Set(&task_flags, TASK_THREAD_COUNT, iparam[TIMING_THRDNBR] );

    plasma_dynamic_spawn();
    CORE_zgetrf_reclap_init();

    /* Timed region: task submission plus wait for completion. */
    t = -cWtime();
    QUARK_CORE_zgetrf_reclap(plasma->quark, &task_flags,
                             m, n, n,
                             A, lda, ipiv,
                             sequence, &request,
                             0, 0,
                             iparam[TIMING_THRDNBR]);
    PLASMA_Sequence_Wait(sequence);
    t += cWtime();
    *t_ = t;

    PLASMA_Sequence_Destroy(sequence);

    /* Check the solution */
    if ( check ) {
        /* Check ipiv: report the first mismatch only.  The loop stops at
         * min(m, n) so that neither undefined pivot entries nor elements
         * past the end of A are read when n > m. */
        for (i = 0; i < npiv; i++) {
            if ( ipiv[i] != ipiv2[i] ) {
                fprintf(stderr, "\nPLASMA (ipiv[%d] = %d, A[%d] = %e) / LAPACK (ipiv[%d] = %d, A[%d] = [%e])\n",
                        i, ipiv[i],  i, creal(A[  i * lda + i ]),
                        i, ipiv2[i], i, creal(A2[ i * lda + i ]));
                break;
            }
        }

        dparam[TIMING_ANORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm),
                                                   m, n, A, lda, work);
        dparam[TIMING_XNORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm),
                                                   m, n, A2, lda, work);
        dparam[TIMING_BNORM] = 0.0;

        /* A2 <- A2 - A, then take the norm of the difference. */
        CORE_zaxpy( m, n, -1.0, A, lda, A2, lda);

        dparam[TIMING_RES] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm),
                                                 m, n, A2, lda, work);
    }

cleanup:
    /* free(NULL) is a no-op, so the same path serves success and failure. */
    free( work  );
    free( ipiv2 );
    free( A2    );
    free( ipiv  );
    free( A     );
    PLASMA_Finalize();

    return status;
}
/***************************************************************************//** * **/ void plasma_pdlacpy_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int X, Y; int m, n; int ldam, ldbm; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); switch (uplo) { /* * PlasmaUpper */ case PlasmaUpper: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); if (m < A.nt) { Y = m == A.nt-1 ? A.n-m*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpper, X, Y, A.mb, A(m, m), ldam, B(m, m), ldbm); } for (n = m+1; n < A.nt; n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } break; /* * PlasmaLower */ case PlasmaLower: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); if (m < A.nt) { Y = m == A.nt-1 ? A.n-m*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaLower, X, Y, A.mb, A(m, m), ldam, B(m, m), ldbm); } for (n = 0; n < min(m, A.nt); n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } break; /* * PlasmaUpperLower */ case PlasmaUpperLower: default: for (m = 0; m < A.mt; m++) { X = m == A.mt-1 ? A.m-m*A.mb : A.mb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); for (n = 0; n < A.nt; n++) { Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; QUARK_CORE_dlacpy( plasma->quark, &task_flags, PlasmaUpperLower, X, Y, A.mb, A(m, n), ldam, B(m, n), ldbm); } } } }
/***************************************************************************//** * Parallel tile matrix-matrix multiplication - dynamic scheduling **/ void plasma_pzgemm_quark(PLASMA_enum transA, PLASMA_enum transB, PLASMA_Complex64_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex64_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; int m, n, k; int ldam, ldak, ldbn, ldbk, ldcm; int tempmm, tempnn, tempkn, tempkm; PLASMA_Complex64_t zbeta; PLASMA_Complex64_t zone = (PLASMA_Complex64_t)1.0; plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); for (m = 0; m < C.mt; m++) { tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb; ldcm = BLKLDD(C, m); for (n = 0; n < C.nt; n++) { tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb; /* * A: PlasmaNoTrans / B: PlasmaNoTrans */ if (transA == PlasmaNoTrans) { ldam = BLKLDD(A, m); if (transB == PlasmaNoTrans) { for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* lda * Z */ B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } /* * A: PlasmaNoTrans / B: Plasma[Conj]Trans */ else { ldbn = BLKLDD(B, n); for (k = 0; k < A.nt; k++) { tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkn, A.mb, alpha, A(m, k), ldam, /* lda * Z */ B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } } /* * A: Plasma[Conj]Trans / B: PlasmaNoTrans */ else { if (transB == PlasmaNoTrans) { for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * X */ B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } /* * A: Plasma[Conj]Trans / B: Plasma[Conj]Trans */ else { ldbn = BLKLDD(B, n); for (k = 0; k < A.mt; k++) { tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; QUARK_CORE_zgemm( plasma->quark, &task_flags, transA, transB, tempmm, tempnn, tempkm, A.mb, alpha, A(k, m), ldak, /* lda * X */ B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } } } } }
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling
 *
 *  Submits, for each diagonal step k, the four kinds of tasks of the
 *  incremental-pivoting tile LU to QUARK:
 *    sgetrf_incpiv on the diagonal tile A(k, k),
 *    sgessm        on every tile A(k, n) to its right,
 *    ststrf        on every tile A(m, k) below it (with workspace L(m, k)),
 *    sssssm        on every trailing tile pair A(k, n) / A(m, n).
 *  Pivot blocks are addressed through the IPIV(m, k) macro.
 *  Nothing is submitted when the sequence is already flagged with an error.
 **/
void plasma_psgetrf_incpiv_quark(PLASMA_desc A, PLASMA_desc L, int *IPIV,
                                 PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int nsteps;               /* number of diagonal tiles                  */
    int ldak, ldam;
    int diag_m, diag_n;       /* extent of the diagonal tile               */
    int row_m, col_n;         /* rows of tile row m / cols of tile col n   */
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    nsteps = min(A.mt, A.nt);

    for (k = 0; k < nsteps; k++) {
        /* Border tiles may be smaller than a full tile. */
        if (k == A.mt-1)
            diag_m = A.m - k*A.mb;
        else
            diag_m = A.mb;
        if (k == A.nt-1)
            diag_n = A.n - k*A.nb;
        else
            diag_n = A.nb;
        ldak = BLKLDD(A, k);

        /* Factor the diagonal tile. */
        QUARK_CORE_sgetrf_incpiv(
            plasma->quark, &task_flags,
            diag_m, diag_n, ib, L.nb,
            A(k, k), ldak, IPIV(k, k),
            sequence, request,
            k == A.mt-1, A.nb*k);

        /* Apply it to the rest of tile row k. */
        for (n = k+1; n < A.nt; n++) {
            if (n == A.nt-1)
                col_n = A.n - n*A.nb;
            else
                col_n = A.nb;
            QUARK_CORE_sgessm(
                plasma->quark, &task_flags,
                diag_m, col_n, diag_m, ib, L.nb,
                IPIV(k, k),
                A(k, k), ldak,
                A(k, n), ldak);
        }

        /* Eliminate each tile below the diagonal, then update its row. */
        for (m = k+1; m < A.mt; m++) {
            if (m == A.mt-1)
                row_m = A.m - m*A.mb;
            else
                row_m = A.mb;
            ldam = BLKLDD(A, m);

            QUARK_CORE_ststrf(
                plasma->quark, &task_flags,
                row_m, diag_n, ib, L.nb,
                A(k, k), ldak,
                A(m, k), ldam,
                L(m, k), L.mb,
                IPIV(m, k),
                sequence, request,
                m == A.mt-1, A.nb*k);

            for (n = k+1; n < A.nt; n++) {
                if (n == A.nt-1)
                    col_n = A.n - n*A.nb;
                else
                    col_n = A.nb;
                QUARK_CORE_sssssm(
                    plasma->quark, &task_flags,
                    A.nb, col_n, row_m, col_n, A.nb, ib, L.nb,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
            }
        }
    }
}