/***************************************************************************//**
 *  Parallel tile row interchanges - dynamic scheduling
 *
 *  Queues one QUARK_CORE_claswp_ontile task for every tile (m, n) of B.
 *  Tile rows are walked from 0 up to B.mt-1 when inc > 0 and from B.mt-1
 *  down to 0 otherwise; tile columns are always walked from 0 to B.nt-1.
 *
 *  @param[in] B         Tile descriptor; its m, n, mb, nb, mt and nt fields
 *                       drive the traversal and the tile sizes.
 *  @param[in] IPIV      Pivot array; the part belonging to tile row m is
 *                       obtained through the IPIV(m) macro.
 *  @param[in] inc       Forwarded unchanged to every task; its sign selects
 *                       the traversal direction described above.
 *  @param[in] sequence  Nothing is queued unless sequence->status is
 *                       PLASMA_SUCCESS; sequence->quark_sequence is attached
 *                       to the task flags.
 *  @param[in] request   Not referenced by this routine.
 **/
void plasma_pclaswp_quark(PLASMA_desc B, int *IPIV, int inc,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int step, row, col;
    int row_offset, rows_from_here, tile_rows, tile_cols;
    int last_row, anchor_row;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    last_row = B.mt - 1;

    /* Tile row whose tile is handed to every task as the final argument:
     * the last tile row for a forward sweep, the first one for a backward
     * sweep. */
    anchor_row = (inc > 0) ? last_row : 0;

    for (step = 0; step < B.mt; step++) {
        /* Forward sweep: 0, 1, ..., mt-1.  Backward sweep: mt-1, ..., 1, 0. */
        row = (inc > 0) ? step : last_row - step;

        row_offset     = row * B.mb;
        rows_from_here = B.m - row_offset;

        /* The last tile row may hold fewer than B.mb rows. */
        if (row == last_row)
            tile_rows = rows_from_here;
        else
            tile_rows = B.mb;

        for (col = 0; col < B.nt; col++) {
            /* The last tile column may hold fewer than B.nb columns. */
            if (col == B.nt - 1)
                tile_cols = B.n - col * B.nb;
            else
                tile_cols = B.nb;

            QUARK_CORE_claswp_ontile(
                plasma->quark, &task_flags,
                plasma_desc_submatrix(B, row_offset, col * B.nb,
                                      rows_from_here, tile_cols),
                B(row, col), 1, tile_rows, IPIV(row), inc,
                B(anchor_row, col) );
        }
    }
}
/***************************************************************************//** * * @ingroup float_Tile_Async * * PLASMA_ssytrd_Tile_Async - Computes all eigenvalues and, * optionally, eigenvectors of a complex Hermitian matrix A using a * two-stage approach: * First stage: reduction to band tridiagonal form; * Second stage: reduction from band to tridiagonal form. * * May return before the computation is finished. * Allows for pipelining of operations ar runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_ssytrd * @sa PLASMA_ssytrd_Tile * @sa PLASMA_chetrd_Tile_Async * @sa PLASMA_dsytrd_Tile_Async * @sa PLASMA_ssytrd_Tile_Async * ******************************************************************************/ int PLASMA_ssytrd_Tile_Async(PLASMA_enum jobz, PLASMA_enum uplo, PLASMA_desc *A, float *D, float *E, PLASMA_desc *T, PLASMA_desc *Q, PLASMA_sequence *sequence, PLASMA_request *request) { int NB, IB, IBNB, NT; PLASMA_desc descA = *A; PLASMA_desc descT = *T; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Set NT & NTRHS */ NB = 
PLASMA_NB; IB = PLASMA_IB; IBNB = IB*NB; NT = (descA.ln%NB==0) ? (descA.ln/NB) : (descA.ln/NB+1); /* Check descriptors for correctness */ if (plasma_desc_check(&descA) != PLASMA_SUCCESS) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (plasma_desc_check(&descT) != PLASMA_SUCCESS) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (jobz == PlasmaVec) && (plasma_desc_check(Q) != PLASMA_SUCCESS) ) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (jobz != PlasmaNoVec && jobz != PlasmaVec) { plasma_error("PLASMA_ssytrd_Tile_Async", "illegal value of jobz"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (descA.m != descA.n) { plasma_error("PLASMA_ssytrd_Tile_Async", "matrix need to be square"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (descA.nb != descA.mb) { plasma_error("PLASMA_ssytrd_Tile_Async", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (jobz == PlasmaVec) { plasma_error("PLASMA_ssytrd_Tile_Async", "computing the eigenvectors is not supported in this version"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (jobz == PlasmaVec) && (Q->nb != Q->mb) ) { plasma_error("PLASMA_ssytrd_Tile_Async", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Reduction to tridiagonal form * with a two-stage approach. 
*/ /* Reduction to BAND tridiagonal form */ plasma_dynamic_call_5(plasma_pssyrbt, PLASMA_enum, uplo, PLASMA_desc, descA, PLASMA_desc, descT, PLASMA_sequence*, sequence, PLASMA_request*, request); /* * Build the Q of the first stage */ /* if (jobz == PlasmaVec){ */ /* /\* Initialize Q to Identity *\/ */ /* plasma_dynamic_call_6(plasma_pslaset, */ /* PLASMA_enum, PlasmaUpperLower, */ /* float, 0.0, */ /* float, 1.0, */ /* PLASMA_desc, descQ, */ /* PLASMA_sequence*, sequence, */ /* PLASMA_request*, request); */ /* /\* Accumulate the transformations from the first stage*\/ */ /* plasma_dynamic_call_6(plasma_psorgtr, */ /* PLASMA_enum, uplo, */ /* PLASMA_desc, descA, */ /* PLASMA_desc, descQ, */ /* PLASMA_desc, descT, */ /* PLASMA_sequence*, sequence, */ /* PLASMA_request*, request); */ /* } */ /* Set the V's to zero before the 2nd stage (bulge chasing) */ /* */ plasma_dynamic_call_5(plasma_pslaset2, PLASMA_enum, uplo, float, 0.0, PLASMA_desc, uplo == PlasmaLower ? plasma_desc_submatrix(descA, descA.mb, 0, descA.m-descA.mb, descA.n-descA.nb) : plasma_desc_submatrix(descA, 0, descA.nb, descA.m-descA.mb, descA.n-descA.nb), PLASMA_sequence*, sequence, PLASMA_request*, request); /* Reduction from BAND tridiagonal to the final condensed form */ plasma_dynamic_call_7(plasma_pssbrdt, PLASMA_enum, uplo, PLASMA_desc, descA, float*, D, float*, E, PLASMA_desc, descT, PLASMA_sequence*, sequence, PLASMA_request*, request); return PLASMA_SUCCESS; }
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling - Right looking
 *
 *  The routine runs two sweeps over k = 0 .. min(A.mt, A.nt)-1:
 *    1. For each k, CORE_dgetrf_rectil is called on the panel starting at
 *       tile (k, k), then for every tile column n > k CORE_dswptr_ontile is
 *       called on column n and cblas_dgemm updates the tiles (m, n), m > k.
 *    2. For each k, CORE_dlaswp_ontile is called on the tile columns n < k.
 *
 *  Every kernel call is preceded by hclib_pragma_marker("omp", "task ...",
 *  label), whose second argument is the text of an OpenMP task directive with
 *  depend clauses. Those strings refer to local variables by name (dA, dB,
 *  dC, dipiv, fake1, fake2, Aij, prevSwap, mintmp, pDesc, ...).
 *  NOTE(review): presumably a source-to-source tool turns each marker into a
 *  pragma on the statement that follows, so the locals must keep their names
 *  and each marker must stay directly in front of its statement - confirm.
 *
 *  @param[in,out] A     Tile descriptor of the matrix; fields m, n, mb, nb,
 *                       mt and nt are read here (lm appears in a marker
 *                       string).
 *  @param[in,out] IPIV  Pivot array, accessed per tile row through the
 *                       IPIV(k) macro. It is passed to CORE_dgetrf_rectil and
 *                       then to the swap kernels.
 **/
void plasma_pdgetrf_rectil_quark(PLASMA_desc A, int *IPIV)
{
    int k, m, n;
    int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
    int ldak, ldam;

    /* Scalars for the trailing update: C = zone*C + mzone*A*B. */
    double zone = (double)1.0;
    double mzone = (double)-1.0;
    /* Integer value stored in a pointer; only used inside depend lists. */
    void * fakedep;
    /* How many threads per panel? Probably needs to be adjusted during factorization. */
    CORE_dgetrf_rectil_init();

    /* Sweep 1: panel factorization and trailing-submatrix update. */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        /* Row offset of tile row k and number of rows from there to the end. */
        tempk = k * A.mb;
        tempm = A.m - tempk;
        /* Rows/columns of tile (k, k); the last tile may be smaller. */
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);

        /* Locals named in the depend clauses of the panel task. Note that
         * dB is the pivot slice here, not a matrix tile. */
        double *dA = A(k, k);
        int *dB = IPIV(k);
        /* Panel: all rows from tile row k downwards, tile column k. */
        PLASMA_desc pDesc = plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn);
        hclib_pragma_marker("omp", "task depend(inout:dA[0:A.mb*A.nb]) depend(out:dB[0:pDesc.n])", "pragma59_omp_task");
        {
            /* NOTE(review): info[0] is left unset; presumably it is an output
             * of CORE_dgetrf_rectil and info[1]/info[2] are inputs - confirm. */
            int info[3];
            info[1] = 0;
            info[2] = 1;

            CORE_dgetrf_rectil(
                    pDesc,
                    dB,
                    info
                    );
        }

        /*
         * Update the trailing submatrix
         */
        /* Value k+1 smuggled into a pointer; read below as fake2. */
        fakedep = (void *)(intptr_t)(k+1);
        for (n = k+1; n < A.nt; n++)
        {
            /*
             * Apply row interchange after the panel (work on the panel)
             */
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            /* These shadow the dA/dB of the enclosing k-loop body. */
            double *dA = A(k, n);
            double *dB = A(k, k);
            int *dipiv = IPIV(k);
            hclib_pragma_marker("omp", "task depend(inout:dA[0:1]) depend(in:dB[0:ldak], dipiv[0:tempkm])", "pragma82_omp_task");
            CORE_dswptr_ontile(descA, 1, tempkm, dipiv, 1, dB, ldak);

            /* First tile row below the panel is handled separately from the
             * remaining ones: its marker carries no fake1/fake2 entries. */
            m = k+1;
            if ( m < A.mt ) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);

                double *dA = A(m , k);
                double *dB = A(k , n);
                double *dC = A(m , n);
                hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb]) depend(inout:dC[0:A.mb*A.mb])", "pragma93_omp_task");
                cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        tempmm, tempnn, A.nb,
                        mzone, dA, ldam,
                        dB, ldak,
                        zone, dC, ldam);

                /* Remaining tile rows: same update, with tile (k+1, n) and
                 * the fakedep value added to the depend lists. */
                for (m = k+2; m < A.mt; m++)
                {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);

                    double *dA = A(m , k);
                    double *dB = A(k , n);
                    double *dC = A(m , n);
                    double *fake1 = A(k+1, n);
                    double *fake2 = (double *)fakedep;
                    hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb], fake2[0:1]) depend(inout:dC[0:A.mb*A.mb], fake1[0:A.mb*A.nb])", "pragma110_omp_task");
                    cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                            tempmm, tempnn, A.nb,
                            mzone, dA, ldam,
                            dB, ldak,
                            zone, dC, ldam);
                }
            }
        }
    }

    /* Sweep 2: apply the pivots of tile row k to the tile columns left of it. */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        int mintmp;
        tempk = k * A.mb;
        tempm = A.m - tempk;
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
        /* Number of row interchanges passed to the swap kernel. */
        mintmp = min(tempkm, tempkn);
        /* ldak is assigned here but not read by any call in this sweep. */
        ldak = BLKLDD(A, k);

        /*
         * Apply row interchange behind the panel (work on the panel)
         */
        fakedep = (void*)(intptr_t)k;
        /* No iterations for k == 0, so A(k-1, n) is only formed for k >= 1. */
        for (n = 0; n < k; n++)
        {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            double *Aij = A(k, n);
            double *prevSwap = A(k-1, n);
            int *dipiv = IPIV(k);
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            hclib_pragma_marker("omp", "task depend(inout:Aij[0:1],fakedep) depend(in:dipiv[0:mintmp], prevSwap[0:A.lm*A.nb])", "pragma142_omp_task");
            CORE_dlaswp_ontile(descA, 1, mintmp, dipiv, 1);
        }
    }
}