/***************************************************************************//**
 *
 * @ingroup PLASMA_Complex64_t_Tile_Async
 *
 *  PLASMA_zlaswp_Tile_Async - performs a series of row interchanges
 *  on the matrix A. One row interchange is initiated for each of
 *  rows K1 through K2 of A.
 *  Non-blocking equivalent of PLASMA_zlaswp_Tile().
 *  May return before the computation is finished.
 *  Allows for pipelining of operations at runtime.
 *
 *******************************************************************************
 *
 * @param[in,out] A
 *          Descriptor of the tile matrix whose rows are interchanged.
 *          A NULL or invalid descriptor is reported as an illegal value.
 *
 * @param[in] K1
 *          First row of the interchange interval. Only K1 == 1 is accepted.
 *
 * @param[in] K2
 *          Last row of the interchange interval. Only K2 == M (the row
 *          count stored in the descriptor) is accepted.
 *
 * @param[in] IPIV
 *          Pivot indices; forwarded unchanged to plasma_pzlaswp.
 *
 * @param[in] INCX
 *          Pivot increment; forwarded unchanged to plasma_pzlaswp.
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS once the work has been submitted,
 *         PLASMA_ERR_NOT_INITIALIZED if there is no PLASMA context,
 *         PLASMA_ERR_UNALLOCATED if sequence or request is NULL,
 *         otherwise the value returned by plasma_request_fail().
 *
 *******************************************************************************
 *
 * @sa PLASMA_zlaswp
 * @sa PLASMA_zlaswp_Tile
 * @sa PLASMA_claswp_Tile_Async
 * @sa PLASMA_dlaswp_Tile_Async
 * @sa PLASMA_slaswp_Tile_Async
 * @sa PLASMA_zgetrf_Tile_Async
 *
 ******************************************************************************/
int PLASMA_zlaswp_Tile_Async(PLASMA_desc *A, int K1, int K2, int *IPIV, int INCX,
                             PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_desc descA;
    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zlaswp_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_zlaswp_Tile", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_zlaswp_Tile", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    /* Check descriptors for correctness.
     * The descriptor is validated BEFORE it is copied so that a NULL pointer
     * is reported through the request instead of being dereferenced. */
    if (A == NULL || plasma_desc_check(A) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_zlaswp_Tile", "invalid first descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    descA = *A;

    if ( (K1 != 1) || (K2 != descA.m) ) {
        plasma_error("PLASMA_zlaswp_Tile", "invalid K1 or K2 (1..M is the only interval supported right now)");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Layout barrier before the swap (tile -> panel, per the callee name) */
    plasma_dynamic_call_3(
        plasma_pzbarrier_tl2pnl,
        PLASMA_desc, descA,
        PLASMA_sequence*, sequence,
        PLASMA_request*,  request);

    /* swap */
    plasma_dynamic_call_5(
        plasma_pzlaswp,
        PLASMA_desc, descA,
        int *,       IPIV,
        int,         INCX,
        PLASMA_sequence*, sequence,
        PLASMA_request*,  request);

    /* Layout barrier after the swap (panel -> tile, per the callee name) */
    plasma_dynamic_call_3(
        plasma_pzbarrier_pnl2tl,
        PLASMA_desc, descA,
        PLASMA_sequence*, sequence,
        PLASMA_request*,  request);

    return PLASMA_SUCCESS;
}
/***************************************************************************//** * * @ingroup double_Tile_Async * * PLASMA_dsygst_Tile_Async - reduces a complex Hermitian-definite * generalized eigenproblem to standard form. * If PlasmaItype == 1, the problem is A*x = lambda*B*x, and A is * overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T) * If PlasmaItype == 2 or 3, the problem is A*B*x = lambda*x or B*A*x * = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L. B must * have been previously factorized as U**T*U or L*L**T by * PLASMA_DPOTRF. * ONLY PlasmaItype == 1 and PlasmaLower supported! * Non-blocking equivalent of PLASMA_dsygst_Tile(). * May return before the computation is finished. * Allows for pipelining of operations ar runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). 
* ******************************************************************************* * * @sa PLASMA_dsygst * @sa PLASMA_dsygst_Tile * @sa PLASMA_chegst_Tile_Async * @sa PLASMA_dsygst_Tile_Async * @sa PLASMA_ssygst_Tile_Async * @sa PLASMA_dsygv_Tile_Async * ******************************************************************************/ int PLASMA_dsygst_Tile_Async(PLASMA_enum itype, PLASMA_enum uplo, PLASMA_desc *A, PLASMA_desc *B, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA = *A; PLASMA_desc descB = *B; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_dsygst_Tile", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_dsygst_Tile", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_dsygst_Tile", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if (plasma_desc_check(&descA) != PLASMA_SUCCESS) { plasma_error("PLASMA_dsygst_Tile", "invalid first descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (plasma_desc_check(&descB) != PLASMA_SUCCESS) { plasma_error("PLASMA_dsygst_Tile", "invalid second descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (descA.nb != descA.mb) { plasma_error("PLASMA_dsygst_Tile", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* * Transform Hermitian-definite generalized eigenproblem * to standard form */ plasma_dynamic_call_6(plasma_pdsygst, PLASMA_enum, itype, PLASMA_enum, uplo, PLASMA_desc, descA, PLASMA_desc, descB, PLASMA_sequence*, sequence, PLASMA_request*, 
request); return PLASMA_SUCCESS; }
/***************************************************************************//** * * @ingroup PLASMA_Complex32_t_Tile_Async * * PLASMA_cpotrf_Tile_Async - Computes the Cholesky factorization of a symmetric * positive definite or Hermitian positive definite matrix. * Non-blocking equivalent of PLASMA_cpotrf_Tile(). * May return before the computation is finished. * Allows for pipelining of operations ar runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_cpotrf * @sa PLASMA_cpotrf_Tile * @sa PLASMA_cpotrf_Tile_Async * @sa PLASMA_dpotrf_Tile_Async * @sa PLASMA_spotrf_Tile_Async * @sa PLASMA_cpotrs_Tile_Async * ******************************************************************************/ int PLASMA_cpotrf_Tile_Async(PLASMA_enum uplo, PLASMA_desc *A, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA = *A; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_cpotrf_Tile", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_cpotrf_Tile", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_cpotrf_Tile", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if (plasma_desc_check(&descA) != PLASMA_SUCCESS) { plasma_error("PLASMA_cpotrf_Tile", "invalid descriptor"); return 
plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (descA.nb != descA.mb) { plasma_error("PLASMA_cpotrf_Tile", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (uplo != PlasmaUpper && uplo != PlasmaLower) { plasma_error("PLASMA_cpotrf_Tile", "illegal value of uplo"); return plasma_request_fail(sequence, request, -1); } /* Quick return */ /* if (max(N, 0) == 0) return PLASMA_SUCCESS; */ plasma_parallel_call_4(plasma_pcpotrf, PLASMA_enum, uplo, PLASMA_desc, descA, PLASMA_sequence*, sequence, PLASMA_request*, request); return PLASMA_SUCCESS; }
/***************************************************************************//**
 *
 * @ingroup float_Tile_Async
 *
 *  PLASMA_splgsy_Tile_Async - Generate a random symmetric matrix by tiles.
 *  Non-blocking equivalent of PLASMA_splgsy_Tile().
 *  May return before the computation is finished.
 *  Allows for pipelining of operations at runtime.
 *
 *******************************************************************************
 *
 * @param[in] bump
 *          Forwarded unchanged to plasma_psplgsy.
 *
 * @param[out] A
 *          Descriptor of the tile matrix to fill. Must use square tiles.
 *          A NULL or invalid descriptor is reported as an illegal value.
 *
 * @param[in] seed
 *          Forwarded unchanged to plasma_psplgsy.
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS once the work has been submitted (or immediately
 *         when either matrix dimension is zero),
 *         PLASMA_ERR_NOT_INITIALIZED if there is no PLASMA context,
 *         PLASMA_ERR_UNALLOCATED if sequence or request is NULL,
 *         otherwise the value returned by plasma_request_fail().
 *
 *******************************************************************************
 *
 * @sa PLASMA_splgsy
 * @sa PLASMA_splgsy_Tile
 * @sa PLASMA_cplgsy_Tile_Async
 * @sa PLASMA_dplgsy_Tile_Async
 *
 ******************************************************************************/
int PLASMA_splgsy_Tile_Async( float bump, PLASMA_desc *A,
                              unsigned long long int seed,
                              PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_desc descA;
    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_splgsy_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_splgsy_Tile", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_splgsy_Tile", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    /* Check descriptors for correctness.
     * The descriptor is validated BEFORE it is copied so that a NULL pointer
     * is reported through the request instead of being dereferenced. */
    if (A == NULL || plasma_desc_check(A) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_splgsy_Tile", "invalid descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    descA = *A;

    /* Check input arguments */
    if (descA.nb != descA.mb) {
        plasma_error("PLASMA_splgsy_Tile", "only square tiles supported");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Quick return: nothing to generate for an empty matrix */
    if (min( descA.m, descA.n ) == 0)
        return PLASMA_SUCCESS;

    plasma_parallel_call_5(plasma_psplgsy,
        float, bump,
        PLASMA_desc, descA,
        unsigned long long int, seed,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    return PLASMA_SUCCESS;
}
/***************************************************************************//**
 *
 * @ingroup PLASMA_Complex64_t_Tile_Async
 *
 *  PLASMA_zgetrf_nopiv_Tile_Async - Submits the tile LU factorization
 *  (no pivoting) of a matrix. Non-blocking equivalent of
 *  PLASMA_zgetrf_nopiv_Tile(): the call may return before the computation
 *  is finished, which allows operations to be pipelined at runtime.
 *
 *******************************************************************************
 *
 * @param[in,out] A
 *          On entry, the M-by-N matrix to be factored.
 *          On exit, the tile factors L and U from the factorization.
 *          Must use square tiles.
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS once the work has been submitted,
 *         PLASMA_ERR_NOT_INITIALIZED if there is no PLASMA context,
 *         PLASMA_ERR_UNALLOCATED if sequence or request is NULL,
 *         otherwise the value returned by plasma_request_fail().
 *
 *******************************************************************************
 *
 * @sa PLASMA_zgetrf_nopiv
 * @sa PLASMA_zgetrf_nopiv_Tile
 * @sa PLASMA_cgetrf_nopiv_Tile_Async
 * @sa PLASMA_dgetrf_nopiv_Tile_Async
 * @sa PLASMA_sgetrf_nopiv_Tile_Async
 * @sa PLASMA_zgetrs_Tile_Async
 *
 ******************************************************************************/
int PLASMA_zgetrf_nopiv_Tile_Async(PLASMA_desc *A,
                                   PLASMA_sequence *sequence,
                                   PLASMA_request *request)
{
    PLASMA_desc descA;
    plasma_context_t *plasma = plasma_context_self();

    /* Mandatory handles: context, sequence, request */
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zgetrf_nopiv_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_zgetrf_nopiv_Tile", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_zgetrf_nopiv_Tile", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }

    /* A sequence that already failed rejects any further submission */
    if (sequence->status != PLASMA_SUCCESS)
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
    request->status = PLASMA_SUCCESS;

    /* Validate the descriptor first, then take a private copy of it */
    if (plasma_desc_check(A) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_zgetrf_nopiv_Tile", "invalid first descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    descA = *A;

    /* Argument checks */
    if (descA.nb != descA.mb) {
        plasma_error("PLASMA_zgetrf_nopiv_Tile", "only square tiles supported");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    plasma_dynamic_call_3(plasma_pzgetrf_nopiv,
        PLASMA_desc, descA,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    return PLASMA_SUCCESS;
}
/***************************************************************************//**
 *  Parallel tile precision conversion - static scheduling.
 *
 *  Unpacks (A, SB, sequence, request) from the context and calls
 *  CORE_dlag2s on every tile (m, n) owned by the calling thread, reading
 *  from A and writing to SB. Tiles are enumerated column by column (m runs
 *  fastest); a thread starts at linear index PLASMA_RANK and advances by
 *  PLASMA_SIZE.
 *
 *  A non-zero info from CORE_dlag2s fails the request with that value;
 *  the loop still continues over the remaining tiles.
 *
 *  Returns immediately if the sequence status is not PLASMA_SUCCESS.
 **/
void plasma_pdlag2s(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc SB;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int X, Y;
    int m, n;
    int next_m;
    int next_n;
    int ldam, ldbm;
    int info = PLASMA_SUCCESS;

    plasma_unpack_args_4(A, SB, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Map the thread rank to its first tile (m, n) */
    n = 0;
    m = PLASMA_RANK;
    while (m >= A.mt && n < A.nt) {
        n++;
        m = m-A.mt;
    }

    while (n < A.nt) {
        /* Pre-compute the next tile this thread will process */
        next_m = m;
        next_n = n;

        next_m += PLASMA_SIZE;
        while (next_m >= A.mt && next_n < A.nt) {
            next_n++;
            next_m = next_m-A.mt;
        }

        /* Tile dimensions: the last tile row/column may be partial.
         * A full tile has A.mb rows and A.nb columns; the row count must
         * therefore use mb (it previously used nb, which is only correct
         * when tiles are square). */
        X = m == A.mt-1 ? A.m-A.mb*m : A.mb;
        Y = n == A.nt-1 ? A.n-A.nb*n : A.nb;

        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(SB, m);
        CORE_dlag2s(X, Y, A(m, n), ldam, SB(m, n), ldbm, &info);

        if (info != 0)
            plasma_request_fail(sequence, request, info);

        m = next_m;
        n = next_n;
    }
}
/***************************************************************************//** * * @ingroup float_Tile_Async * * PLASMA_ssytrd_Tile_Async - Computes all eigenvalues and, * optionally, eigenvectors of a complex Hermitian matrix A using a * two-stage approach: * First stage: reduction to band tridiagonal form; * Second stage: reduction from band to tridiagonal form. * * May return before the computation is finished. * Allows for pipelining of operations ar runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_ssytrd * @sa PLASMA_ssytrd_Tile * @sa PLASMA_chetrd_Tile_Async * @sa PLASMA_dsytrd_Tile_Async * @sa PLASMA_ssytrd_Tile_Async * ******************************************************************************/ int PLASMA_ssytrd_Tile_Async(PLASMA_enum jobz, PLASMA_enum uplo, PLASMA_desc *A, float *D, float *E, PLASMA_desc *T, PLASMA_desc *Q, PLASMA_sequence *sequence, PLASMA_request *request) { int NB, IB, IBNB, NT; PLASMA_desc descA = *A; PLASMA_desc descT = *T; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Set NT & NTRHS */ NB = 
PLASMA_NB; IB = PLASMA_IB; IBNB = IB*NB; NT = (descA.ln%NB==0) ? (descA.ln/NB) : (descA.ln/NB+1); /* Check descriptors for correctness */ if (plasma_desc_check(&descA) != PLASMA_SUCCESS) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (plasma_desc_check(&descT) != PLASMA_SUCCESS) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (jobz == PlasmaVec) && (plasma_desc_check(Q) != PLASMA_SUCCESS) ) { plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (jobz != PlasmaNoVec && jobz != PlasmaVec) { plasma_error("PLASMA_ssytrd_Tile_Async", "illegal value of jobz"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (descA.m != descA.n) { plasma_error("PLASMA_ssytrd_Tile_Async", "matrix need to be square"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (descA.nb != descA.mb) { plasma_error("PLASMA_ssytrd_Tile_Async", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if (jobz == PlasmaVec) { plasma_error("PLASMA_ssytrd_Tile_Async", "computing the eigenvectors is not supported in this version"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (jobz == PlasmaVec) && (Q->nb != Q->mb) ) { plasma_error("PLASMA_ssytrd_Tile_Async", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Reduction to tridiagonal form * with a two-stage approach. 
*/ /* Reduction to BAND tridiagonal form */ plasma_dynamic_call_5(plasma_pssyrbt, PLASMA_enum, uplo, PLASMA_desc, descA, PLASMA_desc, descT, PLASMA_sequence*, sequence, PLASMA_request*, request); /* * Build the Q of the first stage */ /* if (jobz == PlasmaVec){ */ /* /\* Initialize Q to Identity *\/ */ /* plasma_dynamic_call_6(plasma_pslaset, */ /* PLASMA_enum, PlasmaUpperLower, */ /* float, 0.0, */ /* float, 1.0, */ /* PLASMA_desc, descQ, */ /* PLASMA_sequence*, sequence, */ /* PLASMA_request*, request); */ /* /\* Accumulate the transformations from the first stage*\/ */ /* plasma_dynamic_call_6(plasma_psorgtr, */ /* PLASMA_enum, uplo, */ /* PLASMA_desc, descA, */ /* PLASMA_desc, descQ, */ /* PLASMA_desc, descT, */ /* PLASMA_sequence*, sequence, */ /* PLASMA_request*, request); */ /* } */ /* Set the V's to zero before the 2nd stage (bulge chasing) */ /* */ plasma_dynamic_call_5(plasma_pslaset2, PLASMA_enum, uplo, float, 0.0, PLASMA_desc, uplo == PlasmaLower ? plasma_desc_submatrix(descA, descA.mb, 0, descA.m-descA.mb, descA.n-descA.nb) : plasma_desc_submatrix(descA, 0, descA.nb, descA.m-descA.mb, descA.n-descA.nb), PLASMA_sequence*, sequence, PLASMA_request*, request); /* Reduction from BAND tridiagonal to the final condensed form */ plasma_dynamic_call_7(plasma_pssbrdt, PLASMA_enum, uplo, PLASMA_desc, descA, float*, D, float*, E, PLASMA_desc, descT, PLASMA_sequence*, sequence, PLASMA_request*, request); return PLASMA_SUCCESS; }
/***************************************************************************//**
 *  Parallel tile LU factorization - static scheduling
 *
 *  Each thread unpacks (A, L, IPIV, sequence, request) from the context and
 *  walks a sequence of tasks identified by (k, n, m):
 *    - k : factorization step (diagonal tile index),
 *    - n : tile column being processed at step k (n >= k),
 *    - m : tile row, running from k up to A.mt-1 for each (k, n).
 *  Columns are dealt out to threads with a stride of PLASMA_SIZE, starting
 *  at PLASMA_RANK. Ordering between threads is enforced through the
 *  ss_cond_wait / ss_cond_set progress table.
 *
 *  Kernels used:
 *    - n == k, m == k : CORE_zgetrf_incpiv on the diagonal tile,
 *    - n == k, m != k : CORE_ztstrf on tile (m, k) against the diagonal tile,
 *    - n != k, m == k : CORE_zgessm on tile (k, n),
 *    - n != k, m != k : CORE_zssssm on tiles (k, n) and (m, n).
 *
 *  Returns immediately if the sequence status is not PLASMA_SUCCESS.
 **/
void plasma_pzgetrf_incpiv(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int info;
    int tempkn, tempkm, tempmm, tempnn;
    int ib = PLASMA_IB;
    PLASMA_Complex64_t *work;

    plasma_unpack_args_5(A, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Per-thread workspace of ib*L.nb elements, passed to CORE_ztstrf.
     * NOTE(review): the allocation result is not checked here. */
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
    /* Progress table over A.mt x A.nt tiles; -1 presumably means
     * "no step completed yet" - confirm against ss_init. */
    ss_init(A.mt, A.nt, -1);

    /* Map the thread rank to its first (k, n): step k owns columns
     * k..A.nt-1, hence the "+k" when wrapping to the next step. */
    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt) {
        k++;
        n = n-A.nt+k;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
        /* Pre-compute the next task: advance m down the column; once the
         * column is exhausted, jump PLASMA_SIZE columns ahead (wrapping into
         * later steps) and restart at the diagonal row. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        /* Tile dimensions: the last tile row/column may be partial */
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            /* Panel column of step k */
            if (m == k) {
                /* Diagonal tile: wait until step k-1 has updated it */
                ss_cond_wait(k, k, k-1);
                CORE_zgetrf_incpiv(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    IPIV(k, k), &info);
                /* A failure is only reported when this is the last tile row */
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(k, k, k);
            }
            else {
                /* Sub-diagonal tile of the panel */
                ss_cond_wait(m, k, k-1);
                CORE_ztstrf(
                    tempmm, tempkn, ib, A.nb,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    L(m, k), L.mb,
                    IPIV(m, k),
                    work, L.nb, &info);
                /* A failure is only reported when this is the last tile row */
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(m, k, k);
            }
        }
        else {
            /* Trailing column n > k */
            if (m == k) {
                /* Needs the factored diagonal tile and tile (k, n) from
                 * step k-1. No ss_cond_set follows: progress of (k, n) at
                 * step k is not published by this branch. */
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_zgessm(
                    tempkm, tempnn, tempkm, ib,
                    IPIV(k, k),
                    A(k, k), ldak,
                    A(k, n), ldak);
            }
            else {
                /* Needs panel tile (m, k) of step k and tile (m, n) from
                 * step k-1 */
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_zssssm(
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - static scheduling
 *
 *  Each thread unpacks (side, trans, A, B, T, sequence, request) from the
 *  context. Only side == PlasmaLeft with trans == PlasmaConjTrans is handled;
 *  any other combination fails the request with PLASMA_ERR_NOT_SUPPORTED.
 *
 *  Tasks are identified by (k, n, m):
 *    - k : reflector block index, running DOWN from minMT-1 to 0,
 *    - n : tile column of B, dealt out to threads with stride PLASMA_SIZE,
 *    - m : tile row of B, running DOWN from B.mt-1 to k for each (k, n).
 *  Ordering between steps is enforced through the ss_cond_wait /
 *  ss_cond_set progress table.
 *
 *  Returns immediately if the sequence status is not PLASMA_SUCCESS.
 **/
void plasma_pzunmlq(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum trans;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldbk, ldbm;
    int tempmm, tempnn, tempkm, tempkmin;
    int minMT, minM;
    int ib = PLASMA_IB;
    PLASMA_Complex64_t *work;

    plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Only the left / conjugate-transpose case is implemented */
    if (side != PlasmaLeft) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }
    if (trans != PlasmaConjTrans) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }

    /* Per-thread workspace of ib*T.nb elements, passed to both kernels.
     * NOTE(review): the allocation result is not checked here. */
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    /* Progress table over the tiles of B, initialised to min(A.mt, A.nt) */
    ss_init(B.mt, B.nt, min(A.mt, A.nt));

    /* Smaller dimension of A, in elements (minM) and in tiles (minMT) */
    if (A.m > A.n) {
        minM  = A.n;
        minMT = A.nt;
    } else {
        minM  = A.m;
        minMT = A.mt;
    }

    /* Map the thread rank to its first (k, n), starting from the last step */
    k = minMT-1;
    n = PLASMA_RANK;
    while (n >= B.nt) {
        k--;
        n = n-B.nt;
    }
    m = B.mt-1;

    while (k >= 0 && n < B.nt) {
        /* Pre-compute the next task: move m up the column; once row k has
         * been processed, jump PLASMA_SIZE columns ahead (wrapping into
         * earlier steps) and restart from the bottom tile row. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m--;
        if (next_m == k-1) {
            next_n += PLASMA_SIZE;
            while (next_n >= B.nt && next_k >= 0) {
                next_k--;
                next_n = next_n-B.nt;
            }
            next_m = B.mt-1;
        }

        /* Tile dimensions: the last tile row/column may be partial */
        tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
        tempkm   = k == B.mt-1 ? B.m-k*B.mb : B.mb;
        tempnn   = n == B.nt-1 ? B.n-n*B.nb : B.nb;
        tempmm   = m == B.mt-1 ? B.m-m*B.mb : B.mb;

        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldbm = BLKLDD(B, m);

        if (m == k) {
            /* Diagonal step: last task of (k, n); no wait is issued here */
            CORE_zunmlq(
                side, trans,
                tempkm, tempnn, tempkmin, ib,
                A(k, k), ldak,
                T(k, k), T.mb,
                B(k, n), ldbk,
                work, T.nb);
            ss_cond_set(k, n, k);
        }
        else {
            /* Tile (m, n) must have completed step k+1 first */
            ss_cond_wait(m, n, k+1);
            CORE_ztsmlq(
                side, trans,
                A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                B(k, n), ldbk,
                B(m, n), ldbm,
                A(k, m), ldak,
                T(k, m), T.mb,
                work, ib);
            ss_cond_set(m, n, k);
        }
        m = next_m;
        n = next_n;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}
/***************************************************************************//** * * @ingroup PLASMA_Complex64_t_Tile_Async * * PLASMA_zgetri_Tile_Async - Computes the inverse of a matrix using the LU * factorization computed by PLASMA_zgetrf. * This method inverts U and then computes inv(A) by solving the system * inv(A)*L = inv(U) for inv(A). * Non-blocking equivalent of PLASMA_zgetri_Tile(). * May return before the computation is finished. * Allows for pipelining of operations at runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_zgetri * @sa PLASMA_zgetri_Tile * @sa PLASMA_cgetri_Tile_Async * @sa PLASMA_dgetri_Tile_Async * @sa PLASMA_sgetri_Tile_Async * @sa PLASMA_zgetrf_Tile_Async * ******************************************************************************/ int PLASMA_zgetri_Tile_Async(PLASMA_desc *A, int *IPIV, PLASMA_desc *W, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA; PLASMA_desc descW; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_zgetri_Tile_Async", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_zgetri_Tile_Async", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_zgetri_Tile_Async", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if 
(plasma_desc_check(A) != PLASMA_SUCCESS) { plasma_error("PLASMA_zgetri_Tile_Async", "invalid A descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } else { descA = *A; } /* Check descriptors for correctness */ if (plasma_desc_check(W) != PLASMA_SUCCESS) { plasma_error("PLASMA_zgetri_Tile_Async", "invalid W descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } else { descW = *W; } /* Check input arguments */ if (descA.nb != descA.mb) { plasma_error("PLASMA_zgetri_Tile_Async", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Quick return */ if (max(descA.m, 0) == 0) return PLASMA_SUCCESS; plasma_dynamic_call_5(plasma_pztrtri, PLASMA_enum, PlasmaUpper, PLASMA_enum, PlasmaNonUnit, PLASMA_desc, descA, PLASMA_sequence*, sequence, PLASMA_request*, request); plasma_dynamic_call_9(plasma_pztrsmrv, PLASMA_enum, PlasmaRight, PLASMA_enum, PlasmaLower, PLASMA_enum, PlasmaNoTrans, PLASMA_enum, PlasmaUnit, PLASMA_Complex64_t, (PLASMA_Complex64_t) 1.0, PLASMA_desc, descA, PLASMA_desc, descW, PLASMA_sequence*, sequence, PLASMA_request*, request); /* No need for barrier tile2row because of previous dependencies */ /* swap */ plasma_dynamic_call_5( plasma_pzlaswpc, PLASMA_desc, descA, int *, IPIV, int, -1, PLASMA_sequence*, sequence, PLASMA_request*, request); plasma_dynamic_call_3( plasma_pzbarrier_row2tl, PLASMA_desc, descA, PLASMA_sequence*, sequence, PLASMA_request*, request); return PLASMA_SUCCESS; }
/***************************************************************************//** * * @ingroup PLASMA_Complex64_t_Tile_Async * * PLASMA_zlansy_Tile_Async - Non-blocking equivalent of PLASMA_zlansy_Tile(). * May return before the computation is finished. * Allows for pipelining of operations at runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_zlansy * @sa PLASMA_zlansy_Tile * @sa PLASMA_clansy_Tile_Async * @sa PLASMA_dlansy_Tile_Async * @sa PLASMA_slansy_Tile_Async * ******************************************************************************/ int PLASMA_zlansy_Tile_Async(PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc *A, double *value, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA; double *work = NULL; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if (plasma_desc_check(A) != PLASMA_SUCCESS) { plasma_error("PLASMA_zlansy_Tile", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } else { descA = *A; } /* Check input arguments */ if 
(descA.nb != descA.mb) { plasma_error("PLASMA_zlansy_Tile", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (norm != PlasmaMaxNorm) && (norm != PlasmaOneNorm) && (norm != PlasmaInfNorm) && (norm != PlasmaFrobeniusNorm) ) { plasma_error("PLASMA_zlansy_Tile", "illegal value of norm"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (uplo != PlasmaUpper) && (uplo != PlasmaLower) ) { plasma_error("PLASMA_zlansy_Tile", "illegal value of uplo"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Quick return */ if ( descA.m == 0) { *value = 0.0; return PLASMA_SUCCESS; } if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) { if (norm == PlasmaFrobeniusNorm) { work = plasma_shared_alloc(plasma, 2*PLASMA_SIZE, PlasmaRealDouble ); } else { work = plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble ); } } plasma_parallel_call_7(plasma_pzlansy, PLASMA_enum, norm, PLASMA_enum, uplo, PLASMA_desc, descA, double*, work, double*, value, PLASMA_sequence*, sequence, PLASMA_request*, request); if (work != NULL) plasma_shared_free( plasma, work ); return PLASMA_SUCCESS; }
/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 *  plasma_dshift - Cycle-shifting step of the in-place transposition, based
 *  on the GKK algorithm by Gustavson, Karlsson, Kagstrom. The routine
 *  validates the problem decomposition, obtains the cycle leaders from
 *  GKK_getLeaderNbr(), assigns every cycle to a thread and submits the
 *  shifts through plasma_pdshift.
 *
 *******************************************************************************
 *
 * @param[in] plasma
 *         PLASMA context; used here to allocate the shared per-thread load
 *         array.
 *
 * @param[in] m
 *         Number of rows of matrix A
 *
 * @param[in] n
 *         Number of columns of matrix A
 *
 * @param[in,out] A
 *         Matrix to shift in place. The input check requires
 *         nprob*me*ne*L == m*n.
 *
 * @param[in] nprob
 *         Number of parallel and independent problems
 *
 * @param[in] me
 *         Number of rows of the problem
 *
 * @param[in] ne
 *         Number of columns in the problem
 *
 * @param[in] L
 *         Size of chunk to use for transformation
 *
 * @param[in] sequence
 *         Sequence this call belongs to; handed to plasma_request_fail() on
 *         invalid input and forwarded to the parallel shift.
 *
 * @param[out] request
 *         Request identifying this call; handed to plasma_request_fail() on
 *         invalid input and forwarded to the parallel shift.
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS on the quick-return path (me < 2, ne < 2 or
 *         nprob < 1); the value of plasma_request_fail() with
 *         PLASMA_ERR_ILLEGAL_VALUE when an input check fails.
 *
 ******************************************************************************/
int plasma_dshift(plasma_context_t *plasma, int m, int n, double *A,
                  int nprob, int me, int ne, int L,
                  PLASMA_sequence *sequence, PLASMA_request *request)
{
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;

    /* Read the thread layout: total threads, threads per problem, and the
     * resulting number of thread groups.
     * NOTE(review): the division happens before thrdbypb is validated; a
     * PLASMA_GRPSIZE of 0 would divide by zero here - confirm it cannot be 0. */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;
    ngrp = thrdtot/thrdbypb;

    /* check input */
    /* NOTE(review): both products are plain int arithmetic and may overflow
     * for very large matrices. */
    if( (nprob * me * ne * L) != (m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* quick return: nothing is shifted for these degenerate sizes */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    /* Fetch the cycle leaders. The array is walked below with a stride of 3
     * ints per leader, hence nleaders is converted from a leader count to an
     * array length. */
    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    nleaders *= 3;

    if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) {
        int *Tp = NULL;   /* accumulated load per thread */
        int i, ipb;
        int q, owner;

        q = me*ne - 1;

        /* NOTE(review): the allocation result is used without a NULL check. */
        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        ipb = 0;

        /* First part with coarse parallelism */
        if (nprob > ngrp) {
            /* Number of problems covered by complete rounds of ngrp groups */
            ipb = (nprob / ngrp)*ngrp;

            /* loop over leader */
            if (thrdbypb > 1) {
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread
                     * (presumably minloc returns the index of the least
                     * loaded of the first thrdbypb threads - confirm) */
                    owner = minloc(thrdbypb, Tp);

                    /* assign it to owner: entry i+1 scaled by L is added to
                     * the owner's load and entry i+2 records the owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
            }
            else {
                /* One thread per problem: thread 0 owns every cycle */
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel: call i starts at offset i*ngrp*me*ne*L in A */
            for (i=0; i< (nprob/ngrp); i++) {
                plasma_static_call_9(plasma_pdshift,
                                     int,              me,
                                     int,              ne,
                                     int,              L,
                                     double*,          &(A[i*ngrp*me*ne*L]),
                                     int *,            leaders,
                                     int,              nleaders,
                                     int,              thrdbypb,
                                     PLASMA_sequence*, sequence,
                                     PLASMA_request*,  request);
            }
        }
        /* NOTE(review): the rest of this function is not present in this
         * part of the file. */
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - static scheduling
 *
 *  Every thread walks its own list of tile tasks, identified by the triple
 *  (k, m, n) with 0 <= n <= k <= m < A.nt:
 *    - m == k, n == k : factorize the diagonal tile       (CORE_spotrf)
 *    - m == k, n <  k : update the diagonal tile          (CORE_ssyrk)
 *    - m != k, n == k : triangular solve on a panel tile  (CORE_strsm)
 *    - m != k, n <  k : update a panel tile               (CORE_sgemm)
 *  The (k, m) pairs are distributed over the threads by rank with a stride
 *  of PLASMA_SIZE; for each pair the thread runs n = 0..k in order.
 *  Dependencies between threads are enforced with the ss_cond_set() /
 *  ss_cond_wait() progress table.
 *
 *  Arguments (uplo, A, sequence, request) are unpacked from the context.
 *  A failing CORE_spotrf reports info + A.nb*k through plasma_request_fail()
 *  and aborts the remaining work on all threads.
 **/
void plasma_pspotrf(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam, ldan;
    int info;
    int tempkn, tempmn;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma_unpack_args_4(uplo, A, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(A.nt, A.nt, 0);

    /*
     * Map the thread rank onto its first (k, m) pair.
     * The k < A.nt bound mirrors the one used when advancing to the next
     * task below: a rank beyond the number of available (k, m) pairs (or
     * A.nt == 0) would otherwise keep growing m until the signed integer
     * overflows. With the bound the loop stops at k == A.nt and the main
     * loop is simply skipped.
     */
    k = 0;
    m = PLASMA_RANK;
    while (m >= A.nt && k < A.nt) {
        k++;
        m = m-A.nt+k;
    }
    n = 0;

    while (k < A.nt && m < A.nt && !ss_aborted()) {
        /* Compute the task that follows (k, m, n) for this thread */
        next_n = n;
        next_m = m;
        next_k = k;

        next_n++;
        if (next_n > next_k) {
            /* All n for this (k, m) are done: jump PLASMA_SIZE pairs ahead */
            next_m += PLASMA_SIZE;
            while (next_m >= A.nt && next_k < A.nt) {
                next_k++;
                next_m = next_m-A.nt+next_k;
            }
            next_n = 0;
        }

        /* The last tile row/column may be smaller than A.nb */
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempmn = m == A.nt-1 ? A.n-m*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldan = BLKLDD(A, n);
        ldam = BLKLDD(A, m);

        if (m == k) {
            if (n == k) {
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_spotrf(
                        PlasmaLower,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_spotrf(
                        PlasmaUpper,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                if (info != 0) {
                    /* Shift the tile-local index by the tile offset A.nb*k */
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                /* Mark the diagonal tile (k, k) as done */
                ss_cond_set(k, k, 1);
            }
            else {
                /* Tile (k, n) must be ready before it updates (k, k) */
                ss_cond_wait(k, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_ssyrk(
                         PlasmaLower, PlasmaNoTrans,
                         tempkn, A.nb,
                         -1.0, A(k, n), ldak,
                          1.0, A(k, k), ldak);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_ssyrk(
                         PlasmaUpper, PlasmaTrans,
                         tempkn, A.nb,
                         -1.0, A(n, k), ldan,
                          1.0, A(k, k), ldak);
                }
            }
        }
        else {
            if (n == k) {
                /* The diagonal tile must be factorized first */
                ss_cond_wait(k, k, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_strsm(
                        PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                        tempmn, A.nb,
                        zone, A(k, k), ldak,
                              A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_strsm(
                        PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                        A.nb, tempmn,
                        zone, A(k, k), ldak,
                              A(k, m), ldak);
                }
                /* Mark the panel tile (m, k) as done */
                ss_cond_set(m, k, 1);
            }
            else {
                /* Both tiles of column n must be ready */
                ss_cond_wait(k, n, 1);
                ss_cond_wait(m, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_sgemm(
                        PlasmaNoTrans, PlasmaTrans,
                        tempmn, A.nb, A.nb,
                        mzone, A(m, n), ldam,
                               A(k, n), ldak,
                         zone, A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_sgemm(
                        PlasmaTrans, PlasmaNoTrans,
                        A.nb, tempmn, A.nb,
                        mzone, A(n, k), ldan,
                               A(n, m), ldan,
                         zone, A(k, m), ldak);
                }
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    ss_finalize();
}
/***************************************************************************//** * * @ingroup float_Tile_Async * * PLASMA_sgelqf_Tile_Async - Computes the tile LQ factorization of a matrix. * Non-blocking equivalent of PLASMA_sgelqf_Tile(). * May return before the computation is finished. * Allows for pipelining of operations ar runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_sgelqf * @sa PLASMA_sgelqf_Tile * @sa PLASMA_cgelqf_Tile_Async * @sa PLASMA_dgelqf_Tile_Async * @sa PLASMA_sgelqf_Tile_Async * @sa PLASMA_sgelqs_Tile_Async * ******************************************************************************/ int PLASMA_sgelqf_Tile_Async(PLASMA_desc *A, PLASMA_desc *T, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA = *A; PLASMA_desc descT = *T; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_sgelqf_Tile", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_sgelqf_Tile", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_sgelqf_Tile", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if (plasma_desc_check(&descA) != PLASMA_SUCCESS) { plasma_error("PLASMA_sgelqf_Tile", "invalid first descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if 
(plasma_desc_check(&descT) != PLASMA_SUCCESS) { plasma_error("PLASMA_sgelqf_Tile", "invalid second descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (descA.nb != descA.mb) { plasma_error("PLASMA_sgelqf_Tile", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Quick return */ /* if (min(M, N) == 0) return PLASMA_SUCCESS; */ if (plasma->householder == PLASMA_FLAT_HOUSEHOLDER) { plasma_parallel_call_4(plasma_psgelqf, PLASMA_desc, descA, PLASMA_desc, descT, PLASMA_sequence*, sequence, PLASMA_request*, request); } else { plasma_dynamic_call_5(plasma_psgelqfrh, PLASMA_desc, descA, PLASMA_desc, descT, PLASMA_enum, PLASMA_RHBLK, PLASMA_sequence*, sequence, PLASMA_request*, request); } return PLASMA_SUCCESS; }