예제 #1
0
/***************************************************************************//**
 *  Parallel tile row interchanges - dynamic scheduling
 *
 *  Submits one QUARK_CORE_claswp_ontile task for every tile of B. Tile rows
 *  are visited from first to last when inc > 0 and from last to first
 *  otherwise; inside a tile row the tile columns always go left to right.
 *
 *  Nothing is submitted when the sequence is already flagged as failed.
 *  The request argument is accepted but not used by this routine.
 **/
void plasma_pclaswp_quark(PLASMA_desc B, int *IPIV, int inc,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    plasma_context_t *plasma = plasma_context_self();

    int row, col, visited;
    int step, anchor_row;
    int last_row, last_col;
    int row_offset, rows_below, tile_rows, tile_cols;

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    last_row = B.mt - 1;
    last_col = B.nt - 1;

    /* Direction of the sweep over tile rows, and the tile row whose tile is
     * handed to every task as its trailing argument: the last tile row for a
     * forward sweep, the first one for a backward sweep. */
    if (inc > 0) {
        row        = 0;
        step       = 1;
        anchor_row = last_row;
    } else {
        row        = last_row;
        step       = -1;
        anchor_row = 0;
    }

    for (visited = 0; visited < B.mt; visited++, row += step) {
        row_offset = row * B.mb;
        rows_below = B.m - row_offset;   /* rows from this tile row down to the bottom of B */
        tile_rows  = (row == last_row) ? rows_below : B.mb;

        for (col = 0; col < B.nt; col++) {
            /* The last tile column may be narrower than B.nb. */
            tile_cols = (col == last_col) ? B.n - col * B.nb : B.nb;

            QUARK_CORE_claswp_ontile(
                plasma->quark, &task_flags,
                plasma_desc_submatrix(B, row_offset, col * B.nb, rows_below, tile_cols),
                B(row, col), 1, tile_rows, IPIV(row), inc, B(anchor_row, col) );
        }
    }
}
예제 #2
0
/***************************************************************************//**
 *
 * @ingroup float_Tile_Async
 *
 *  PLASMA_ssytrd_Tile_Async - Reduces a real symmetric matrix A (single
 *  precision) to tridiagonal form using a two-stage approach:
 *  First stage: reduction to band tridiagonal form;
 *  Second stage: reduction from band to tridiagonal form.
 *
 *  May return before the computation is finished.
 *  Allows for pipelining of operations at runtime.
 *
 *******************************************************************************
 *
 * @param[in] jobz
 *          Must be PlasmaNoVec or PlasmaVec. PlasmaVec is currently rejected
 *          with PLASMA_ERR_ILLEGAL_VALUE (eigenvectors are not supported in
 *          this version).
 *
 * @param[in] uplo
 *          Forwarded to both reduction stages. PlasmaLower selects the part
 *          of A below the first tile row for zeroing between the stages; any
 *          other value selects the part to the right of the first tile column.
 *
 * @param[in,out] A
 *          Descriptor of the matrix to reduce. Must not be NULL, must pass
 *          plasma_desc_check, must be square (m == n) and use square tiles
 *          (mb == nb).
 *
 * @param[out] D
 *          Array handed to the second stage (plasma_pssbrdt).
 *          NOTE(review): presumably receives the diagonal of the tridiagonal
 *          matrix - confirm against plasma_pssbrdt.
 *
 * @param[out] E
 *          Array handed to the second stage (plasma_pssbrdt).
 *          NOTE(review): presumably receives the off-diagonal of the
 *          tridiagonal matrix - confirm against plasma_pssbrdt.
 *
 * @param[in,out] T
 *          Descriptor of the auxiliary matrix passed to both stages. Must not
 *          be NULL and must pass plasma_desc_check.
 *
 * @param[in] Q
 *          Only examined (via plasma_desc_check) when jobz == PlasmaVec;
 *          otherwise unused.
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS once all tasks have been submitted;
 *         PLASMA_ERR_NOT_INITIALIZED if there is no PLASMA context;
 *         PLASMA_ERR_UNALLOCATED if sequence or request is NULL;
 *         otherwise the value returned by plasma_request_fail() for a flushed
 *         sequence, a NULL or invalid descriptor, or an illegal argument.
 *
 *******************************************************************************
 *
 * @sa PLASMA_ssytrd
 * @sa PLASMA_ssytrd_Tile
 * @sa PLASMA_chetrd_Tile_Async
 * @sa PLASMA_dsytrd_Tile_Async
 *
 ******************************************************************************/
int PLASMA_ssytrd_Tile_Async(PLASMA_enum jobz, PLASMA_enum uplo,
                            PLASMA_desc *A,
                            float *D,
                            float *E,
                            PLASMA_desc *T,
                            PLASMA_desc *Q,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_desc descA;
    PLASMA_desc descT;

    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_ssytrd_Tile_Async", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    /* The descriptors are copied by value below, so reject NULL pointers
     * before dereferencing them. */
    if (A == NULL || T == NULL) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "NULL descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_UNALLOCATED);
    }
    descA = *A;
    descT = *T;

    /* Check descriptors for correctness */
    if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if ( (jobz == PlasmaVec) && (plasma_desc_check(Q) != PLASMA_SUCCESS) ) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "invalid descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Check input arguments */
    if (jobz != PlasmaNoVec && jobz != PlasmaVec) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "illegal value of jobz");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if (descA.m != descA.n) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "matrix need to be square");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if (descA.nb != descA.mb) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "only square tiles supported");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    /* Eigenvector computation is rejected outright, so no further checks on
     * Q (e.g. square tiles) are needed until that mode is implemented. */
    if (jobz == PlasmaVec) {
        plasma_error("PLASMA_ssytrd_Tile_Async", "computing the eigenvectors is not supported in this version");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Reduction to tridiagonal form
     * with a two-stage approach.
     */

    /* Reduction to BAND tridiagonal form
     */
    plasma_dynamic_call_5(plasma_pssyrbt,
        PLASMA_enum, uplo,
        PLASMA_desc, descA,
        PLASMA_desc, descT,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    /*
     * Building the Q of the first stage (initialising Q to the identity and
     * accumulating the first-stage transformations) is not implemented;
     * jobz == PlasmaVec never reaches this point.
     */

    /* Set the V's to zero before the 2nd stage (bulge chasing).
     * The region is A without its first tile row (lower case) or without its
     * first tile column (otherwise). */
    plasma_dynamic_call_5(plasma_pslaset2,
        PLASMA_enum, uplo,
        float, 0.0,
        PLASMA_desc, uplo == PlasmaLower ? plasma_desc_submatrix(descA, descA.mb, 0, descA.m-descA.mb, descA.n-descA.nb)
                                         : plasma_desc_submatrix(descA, 0, descA.nb, descA.m-descA.mb, descA.n-descA.nb),
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    /* Reduction from BAND tridiagonal to the final condensed form
     */
    plasma_dynamic_call_7(plasma_pssbrdt,
        PLASMA_enum, uplo,
        PLASMA_desc, descA,
        float*, D,
        float*, E,
        PLASMA_desc, descT,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    return PLASMA_SUCCESS;
}
예제 #3
0
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling - Right looking
 *
 *  First sweep, for each diagonal step k:
 *    - run the panel kernel CORE_dgetrf_rectil on the block column that
 *      starts at tile (k, k) and extends to the bottom of A;
 *    - for every tile column n > k, call CORE_dswptr_ontile on that column
 *      and then update tiles (m, n), m > k, with cblas_dgemm.
 *  Second sweep, for each k: call CORE_dlaswp_ontile on every tile column
 *  n < k (the columns to the left of panel k).
 *
 *  Each hclib_pragma_marker("omp", ...) call carries the text of an OpenMP
 *  task pragma. NOTE(review): presumably a source-to-source tool turns each
 *  marker into a pragma on the statement that follows it - confirm. The
 *  clause strings refer to local variables by name, so those locals (dA, dB,
 *  dC, dipiv, fake1, fake2, Aij, prevSwap, pDesc, ...) must keep their names
 *  and each marker must stay directly in front of its statement.
 *
 *  A     - descriptor of the tiled matrix; tiles are read and overwritten.
 *  IPIV  - integer array reached through the IPIV(k) macro; handed to the
 *          panel kernel and afterwards to the swap kernels.
 *          NOTE(review): presumably the pivot indices - confirm.
 **/
void plasma_pdgetrf_rectil_quark(PLASMA_desc A, int *IPIV)
{
    int k, m, n;
    int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
    int ldak, ldam;

    /* Scalars for the trailing update C = zone*C + mzone*A*B. */
    double zone  = (double)1.0;
    double mzone = (double)-1.0;

    /* Integer value cast to a pointer; in this function it is only named in
     * task dependence clauses and never dereferenced. */
    void * fakedep;
    /* How many threads per panel? Probably needs to be adjusted during factorization. */

    CORE_dgetrf_rectil_init();

    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        tempk  = k * A.mb;                            /* first row of tile row k */
        tempm  = A.m - tempk;                         /* rows from tile row k to the bottom */
        tempkm = k == A.mt-1 ? tempm      : A.mb;     /* rows in tile row k (last one may be short) */
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;     /* columns in tile column k (last one may be short) */
        ldak = BLKLDD(A, k);

        /* Panel: block column k from the diagonal tile downwards. */
        double *dA = A(k, k);
        int *dB = IPIV(k);
        PLASMA_desc pDesc = plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn);
hclib_pragma_marker("omp", "task depend(inout:dA[0:A.mb*A.nb]) depend(out:dB[0:pDesc.n])", "pragma59_omp_task");
        {
            /* info[0] is not set here and info is not inspected after the
             * call. NOTE(review): presumably info[0] is an output of the
             * kernel - confirm. */
            int info[3];
            info[1] = 0;
            info[2] = 1;

            CORE_dgetrf_rectil( pDesc, dB, info );
        }

        /*
         * Update the trailing submatrix
         */
        fakedep = (void *)(intptr_t)(k+1);
        for (n = k+1; n < A.nt; n++)
        {
            /*
             * Apply row interchange after the panel (work on the panel)
             */
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            /* These declarations shadow the panel's dA/dB (dB changes type
             * from int* to double* here). */
            double *dA = A(k, n);
            double *dB = A(k, k);
            int *dipiv = IPIV(k);
hclib_pragma_marker("omp", "task depend(inout:dA[0:1]) depend(in:dB[0:ldak], dipiv[0:tempkm])", "pragma82_omp_task");
            CORE_dswptr_ontile(descA, 1, tempkm, dipiv, 1, dB, ldak);

            /* The tile directly below the diagonal row is updated on its own,
             * with a dependence list that has no fake1/fake2 entries. */
            m = k+1;
            if ( m < A.mt ) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);

                double *dA = A(m , k);
                double *dB = A(k , n);
                double *dC = A(m , n);
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb]) depend(inout:dC[0:A.mb*A.mb])", "pragma93_omp_task");
                /* A(m,n) <- A(m,n) - A(m,k) * A(k,n) */
                cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        tempmm, tempnn, A.nb,
                        mzone, dA, ldam,
                        dB, ldak,
                        zone, dC, ldam);

                /* Remaining tiles of column n: same update, but the
                 * dependence list additionally names tile (k+1, n) as inout
                 * (fake1) and the fakedep token as in (fake2). */
                for (m = k+2; m < A.mt; m++)
                {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);

                    double *dA = A(m , k);
                    double *dB = A(k , n);
                    double *dC = A(m , n);
                    double *fake1 = A(k+1, n);
                    double *fake2 = (double *)fakedep;
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb], fake2[0:1]) depend(inout:dC[0:A.mb*A.mb], fake1[0:A.mb*A.nb])", "pragma110_omp_task");
                        cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                                tempmm, tempnn, A.nb,
                                mzone, dA, ldam,
                                dB, ldak,
                                zone, dC, ldam);
                }
            }
        }
    }

    /* Second sweep: row interchanges on the tile columns left of each panel. */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        int mintmp;
        tempk  = k * A.mb;
        tempm  = A.m - tempk;
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
        mintmp = min(tempkm, tempkn);   /* smaller dimension of diagonal tile k; passed as the swap range end */
        ldak = BLKLDD(A, k);

        /*
         * Apply row interchange behind the panel (work on the panel)
         */
        fakedep = (void*)(intptr_t)k;
        /* The loop body only runs for k >= 1, so A(k-1, n) below is a valid
         * tile reference whenever it is evaluated. */
        for (n = 0; n < k; n++)
        {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            double *Aij = A(k, n);
            double *prevSwap = A(k-1, n);   /* tile one row up in the same column; named only in the depend(in) clause */
            int *dipiv = IPIV(k);
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
hclib_pragma_marker("omp", "task depend(inout:Aij[0:1],fakedep) depend(in:dipiv[0:mintmp], prevSwap[0:A.lm*A.nb])", "pragma142_omp_task");
            CORE_dlaswp_ontile(descA, 1, mintmp, dipiv, 1);
        }
    }
}