コード例 #1
0
ファイル: pcgelqf.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile LQ factorization - static scheduling
 *
 *  Unpacks the descriptors A and T and the sequence/request pair from the
 *  PLASMA context, and returns at once when the sequence status is not
 *  PLASMA_SUCCESS.  Every thread then walks its own list of (k, m, n) tile
 *  triples: the first tile row is derived from PLASMA_RANK and, once the
 *  last tile column of a row has been handled, the row index advances by
 *  PLASMA_SIZE.  One CORE kernel is called per triple, bracketed by
 *  ss_cond_wait / ss_cond_set.
 *
 *  NOTE(review): ss_cond_wait is a macro defined outside this block; in
 *  upstream PLASMA it may `break` out of the enclosing loop on abort, so
 *  the calls must stay directly inside the main while loop - confirm.
 *
 *  @param[in] plasma  PLASMA context holding the packed arguments.
 **/
void plasma_pcgelqf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    PLASMA_Complex32_t *work, *tau;
    int ib = PLASMA_IB;
    int k, m, n;
    int k_succ, m_succ, n_succ;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    /* Locate the first (k, m) pair assigned to this thread. */
    for (k = 0, m = PLASMA_RANK; m >= A.mt; m = m-A.mt+k)
        k++;
    n = k;

    while (!(k >= min(A.mt, A.nt) || m >= A.mt)) {
        /* Work out the triple that follows (k, m, n) before touching it. */
        k_succ = k;
        m_succ = m;
        n_succ = n + 1;
        if (n_succ == A.nt) {
            /* Row finished: jump PLASMA_SIZE rows ahead, wrapping into the
               next step k whenever the row index runs past the last row. */
            for (m_succ += PLASMA_SIZE;
                 m_succ >= A.mt && k_succ < min(A.nt, A.mt);
                 m_succ = m_succ-A.mt+k_succ) {
                k_succ++;
            }
            n_succ = k_succ;
        }

        /* The last tile in a row/column may be smaller than a full tile. */
        tempkm = (k != A.mt-1) ? A.mb : A.m-k*A.mb;
        tempkn = (k != A.nt-1) ? A.nb : A.n-k*A.nb;
        tempmm = (m != A.mt-1) ? A.mb : A.m-m*A.mb;
        tempnn = (n != A.nt-1) ? A.nb : A.n-n*A.nb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (m == k && n == k) {
            /* Diagonal tile of step k. */
            ss_cond_wait(k, k, k-1);
            CORE_cgelqt(
                tempkm, tempkn, ib,
                A(k, k), ldak,
                T(k, k), T.mb,
                tau, work);
            ss_cond_set(k, k, k);
        }
        else if (m == k) {
            /* Tile (k, n) to the right of the diagonal, coupled with (k, k). */
            ss_cond_wait(k, n, k-1);
            CORE_ctslqt(
                tempkm, tempnn, ib,
                A(k, k), ldak,
                A(k, n), ldak,
                T(k, n), T.mb,
                tau, work);
            ss_cond_set(k, n, k);
        }
        else if (n == k) {
            /* Tile (m, k) below the diagonal; no progress mark is set here. */
            ss_cond_wait(k, k, k);
            ss_cond_wait(m, k, k-1);
            CORE_cunmlq(
                PlasmaRight, PlasmaConjTrans,
                tempmm, tempkn, tempkn, ib,
                A(k, k), ldak,
                T(k, k), T.mb,
                A(m, k), ldam,
                work, T.nb);
        }
        else {
            /* Trailing tile (m, n), updated together with tile (m, k). */
            ss_cond_wait(k, n, k);
            ss_cond_wait(m, n, k-1);
            CORE_ctsmlq(
                PlasmaRight, PlasmaConjTrans,
                tempmm, A.nb, tempmm, tempnn, A.nb, ib,
                A(m, k), ldam,
                A(m, n), ldam,
                A(k, n), ldak,
                T(k, n), T.mb,
                work, T.nb);
            ss_cond_set(m, n, k);
        }

        k = k_succ;
        m = m_succ;
        n = n_succ;
    }
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
コード例 #2
0
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling
 *
 *  Submits, for every step k < min(A.mt, A.nt), one sgetrf_incpiv task on
 *  the diagonal tile, one sgessm task per tile to its right, one ststrf
 *  task per tile below it, and one sssssm task per trailing tile.  Nothing
 *  is submitted when the sequence status is not PLASMA_SUCCESS.
 *
 *  @param[in] A         Descriptor of the matrix being factorized.
 *  @param[in] L         Descriptor of the auxiliary L tiles.
 *  @param[in] IPIV      Pivot storage, addressed through the IPIV(m, n) macro.
 *  @param[in] sequence  Sequence the tasks are attached to.
 *  @param[in] request   Request forwarded to the factorization kernels.
 **/
void plasma_psgetrf_incpiv_quark(PLASMA_desc A, PLASMA_desc L, int *IPIV,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    plasma_context_t *plasma = plasma_context_self();

    int k, m, n;
    int nsteps;
    int ldak, ldam;
    int rows_k, cols_k, rows_m, cols_n;
    int ib;

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    nsteps = min(A.mt, A.nt);

    for (k = 0; k < nsteps; k++) {
        /* Edge tiles may be smaller than a full tile. */
        rows_k = A.mb;
        if (k == A.mt-1)
            rows_k = A.m-k*A.mb;
        cols_k = A.nb;
        if (k == A.nt-1)
            cols_k = A.n-k*A.nb;
        ldak = BLKLDD(A, k);

        /* Diagonal tile. */
        QUARK_CORE_sgetrf_incpiv(
            plasma->quark, &task_flags,
            rows_k, cols_k, ib, L.nb,
            A(k, k), ldak, IPIV(k, k),
            sequence, request,
            k == A.mt-1, A.nb*k);

        /* Tiles to the right of the diagonal tile. */
        for (n = k+1; n < A.nt; n++) {
            cols_n = A.nb;
            if (n == A.nt-1)
                cols_n = A.n-n*A.nb;
            QUARK_CORE_sgessm(
                plasma->quark, &task_flags,
                rows_k, cols_n, rows_k, ib, L.nb,
                IPIV(k, k),
                A(k, k), ldak,
                A(k, n), ldak);
        }

        /* Tiles below the diagonal tile, each followed by its row update. */
        for (m = k+1; m < A.mt; m++) {
            rows_m = A.mb;
            if (m == A.mt-1)
                rows_m = A.m-m*A.mb;
            ldam = BLKLDD(A, m);

            QUARK_CORE_ststrf(
                plasma->quark, &task_flags,
                rows_m, cols_k, ib, L.nb,
                A(k, k), ldak,
                A(m, k), ldam,
                L(m, k), L.mb,
                IPIV(m, k),
                sequence, request,
                m == A.mt-1, A.nb*k);

            for (n = k+1; n < A.nt; n++) {
                cols_n = A.nb;
                if (n == A.nt-1)
                    cols_n = A.n-n*A.nb;
                QUARK_CORE_sssssm(
                    plasma->quark, &task_flags,
                    A.nb, cols_n, rows_m, cols_n, A.nb, ib, L.nb,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
            }
        }
    }
}
コード例 #3
0
ファイル: pzgelqfrh.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - static / sequential
 *
 *  Only the thread whose PLASMA_RANK is 0 does any work; all other threads
 *  return immediately.  For every step k the tile row is first factorized
 *  inside column subdomains of BS tiles (zgelqt on the head tile of the
 *  subdomain, ztslqt on the remaining tiles, each followed by the matching
 *  update of the rows below), then the subdomain heads are merged pairwise
 *  with zttlqt / zttmlq at distances RD = BS, 2*BS, 4*BS, ...
 *
 *  The sequence status is checked before the scratch buffers are allocated
 *  and both buffers are released on exit, so no path leaks them.
 *
 *  NOTE(review): BS is used as a loop stride and as the start of a doubling
 *  loop; it is presumably required to be >= 1 - confirm at the caller.
 *
 *  @param[in] plasma  PLASMA context holding the packed arguments
 *                     (A, T, BS, sequence, request).
 **/
void plasma_pzgelqfrh(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    int BS;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;
    PLASMA_Complex64_t *work, *tau;

    if (PLASMA_RANK != 0) return;

    plasma_unpack_args_5(A, T, BS, sequence, request);

    /* Bail out before allocating anything so the early return cannot leak. */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    ib = PLASMA_IB;
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, A.nb, A.dtyp);

    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);

        /* Factorization inside each column subdomain starting at tile N. */
        for (N = k;
             N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            CORE_zgelqt(
                tempkm, tempNn, ib,
                A(k, N), ldak,
                T(k, N), T.mb,
                tau, work);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                CORE_zunmlq(
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam,
                    work , A.nb);
            }
            for (n = N+1;
                 (n < N+BS && n < A.nt) || n == A.nt-1; /* Suck in rightmost single-column domain */
                 n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                CORE_ztslqt(
                    tempkm, tempnn, ib,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb,
                    tau, work);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    CORE_ztsmlq(
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.mb, tempmm, tempnn, A.mb, ib,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb,
                        work , A.nb);
                }
            }
        }

        /* Pairwise merge of subdomain heads at doubling distance RD. */
        for (RD = BS; RD < A.nt-k; RD *= 2) {
            for (N = k;
                 N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                CORE_zttlqt(
                    tempkm, tempNRDn, ib,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb,
                    tau, work);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    CORE_zttmlq(
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, tempkm, ib,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb,
                        work , A.nb);
                }
            }
        }
    }

    /* Release the per-thread scratch buffers allocated above. */
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
}
コード例 #4
0
ファイル: pzunglqrh.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity;
 *  reduction Householder) - dynamic scheduling
 *
 *  Walks the steps k from min(A.mt, A.nt)-1 down to 0.  For each step it
 *  first submits zttmlq tasks for the merge levels, from the largest
 *  distance RD reached by doubling BS down to BS itself, and then, for each
 *  column subdomain of BS tiles, ztsmlq tasks from the last tile of the
 *  subdomain back to the one after its head, followed by zunmlq tasks on
 *  the head tile.  Every task is applied to all tile rows of Q.
 *
 *  @param[in] A         Descriptor holding the reflectors.
 *  @param[in] Q         Descriptor of the matrix the tasks update.
 *  @param[in] T         Descriptor of the auxiliary T factors.
 *  @param[in] BS        Number of tiles per column subdomain.
 *  @param[in] sequence  Sequence the tasks are attached to.
 *  @param[in] request   Request identifier (not used in this body).
 **/
void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    plasma_context_t *plasma = plasma_context_self();

    int k, m, n;
    int K, N, RD, lastRD;
    int ldak;
    int ldqm;
    int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = K-1; k >= 0; k--) {
        tempkm = A.mb;
        if (k == A.mt-1)
            tempkm = A.m-k*A.mb;
        ldak = BLKLDD(A, k);

        /* Largest merge distance obtained by doubling BS below A.nt-k. */
        lastRD = 0;
        RD = BS;
        while (RD < A.nt-k) {
            lastRD = RD;
            RD *= 2;
        }

        /* Merge levels, from the widest distance down to BS. */
        for (RD = lastRD; RD >= BS; RD /= 2) {
            /* No reduction with rightmost single-column subdomain */
            for (N = k; N+RD < A.nt-1; N += 2*RD) {
                if (N+RD == A.nt-1)
                    tempNRDn = A.n-(N+RD)*A.nb;
                else
                    tempNRDn = A.nb;

                for (m = 0; m < Q.mt; m++) {
                    tempmm = (m != Q.mt-1) ? Q.mb : Q.m-m*Q.mb;
                    ldqm   = BLKLDD(Q, m);
                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaNoTrans,
                        tempmm, Q.nb, tempmm, tempNRDn,
                        tempkm, ib, T.nb,
                        Q (m, N   ), ldqm,
                        Q (m, N+RD), ldqm,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }

        /* Column subdomains; no rightmost single-column subdomain. */
        for (N = k; N < A.nt-1 || N == k; N += BS) {
            tempNn = (N != A.nt-1) ? A.nb : A.n-N*A.nb;
            tempkmin = min(tempkm, tempNn);

            /* Suck in rightmost single-column domain */
            if (N+BS-1 == A.nt-2)
                n = A.nt-1;
            else
                n = min(N+BS-1, A.nt-1);

            while (n >= N+1) {
                tempnn = (n != Q.nt-1) ? Q.nb : Q.n-n*Q.nb;

                for (m = 0; m < Q.mt; m++) {
                    tempmm = (m != Q.mt-1) ? Q.mb : Q.m-m*Q.mb;
                    ldqm = BLKLDD(Q, m);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaNoTrans,
                        tempmm, Q.nb, tempmm, tempnn,
                        tempkm, ib, T.nb,
                        Q(m, N), ldqm,
                        Q(m, n), ldqm,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
                n--;
            }

            /* Head tile of the subdomain. */
            for (m = 0; m < Q.mt; m++) {
                tempmm = (m != Q.mt-1) ? Q.mb : Q.m-m*Q.mb;
                ldqm = BLKLDD(Q, m);
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaNoTrans,
                    tempmm, tempNn,
                    tempkmin, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    Q(m, N), ldqm);
            }
        }
    }
}
コード例 #5
0
ファイル: pdgeqrf.c プロジェクト: adcastel/ULT_work
/***************************************************************************//**
 *  Parallel tile QR factorization - dynamic scheduling
 *
 *  Creates one OpenMP task per tile kernel.  For every step
 *  k < min(A.mt, A.nt): a dgeqrt task on the diagonal tile, a dormqr task
 *  per tile to its right, a dtsqrt task per tile below it, and a dtsmqr
 *  task per trailing tile.  Ordering between tasks is expressed only through
 *  the depend clauses on the tile pointers; scratch arrays are
 *  variable-length arrays local to each task body.
 *
 *  @param[in] A   Descriptor of the matrix being factorized.
 *  @param[in] T   Descriptor of the auxiliary T factors.
 *  @param[in] ib  Inner block size passed to every kernel.
 **/
void plasma_pdgeqrf_quark(PLASMA_desc A, PLASMA_desc T, int ib)
{

    int k, m, n;
    int ldak, ldam;
    int rows_k, cols_k, cols_n, rows_m;
    int nsteps = min(A.mt, A.nt);

    for (k = 0; k < nsteps; k++) {
        /* Edge tiles may be smaller than a full tile. */
        rows_k = A.mb;
        if (k == A.mt-1)
            rows_k = A.m-k*A.mb;
        cols_k = A.nb;
        if (k == A.nt-1)
            cols_k = A.n-k*A.nb;
        ldak = BLKLDD(A, k);

        /* Diagonal tile. */
        double *dA = A(k, k);
        double *dT = T(k, k);
#if defined(USE_OMPEXT)
omp_set_task_priority(1);
#endif
#pragma omp task depend(inout: dA[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb])
        {
            double tau[T.nb];
            double work[ib * T.nb];
            CORE_dgeqrt(rows_k, cols_k, ib, dA, ldak, dT, T.mb, tau, work);
        }

        /* Tiles to the right of the diagonal tile. */
        for (n = k+1; n < A.nt; n++) {
            cols_n = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
            double *dA = A(k, k);
            double *dT = T(k, k);
            double *dC = A(k, n);
#pragma omp task depend(in: dA[0:T.nb*T.nb], dT[0:ib*T.nb]) depend(inout:dC[0:T.nb*T.nb])
            {
                double work[T.nb * ib];
                CORE_dormqr(PlasmaLeft, PlasmaTrans,
                            rows_k, cols_n, rows_k, ib,
                            dA, ldak,
                            dT, T.mb,
                            dC, ldak,
                            work, T.nb);
            }
        }

        /* Tiles below the diagonal tile, each followed by its row update. */
        for (m = k+1; m < A.mt; m++) {
            rows_m = (m != A.mt-1) ? A.mb : A.m-m*A.mb;
            ldam = BLKLDD(A, m);
            double *dA = A(k, k);
            double *dB = A(m, k);
            double *dT = T(m, k);
#pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb])
            {
                double tau[T.nb];
                double work[ib * T.nb];
                CORE_dtsqrt(rows_m, cols_k, ib,
                            dA, ldak,
                            dB, ldam,
                            dT, T.mb, tau, work);
            }

            for (n = k+1; n < A.nt; n++) {
                cols_n = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
                double *dA = A(k, n);
                double *dB = A(m, n);
                double *dV = A(m, k);
                double *dT = T(m, k);
#pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(in:dV[0:T.nb*T.nb], dT[0:ib*T.nb])
                {
                    double work[ib * T.nb];
                    CORE_dtsmqr(PlasmaLeft, PlasmaTrans,
                                A.mb, cols_n, rows_m, cols_n, A.nb, ib,
                                dA, ldak,
                                dB, ldam,
                                dV, ldam,
                                dT, T.mb, work, ib);
                }
            }
        }
    }
}
コード例 #6
0
ファイル: pcunmqr.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization - static scheduling
 *
 *  Unpacks side, trans, the descriptors A, B, T and the sequence/request
 *  pair from the PLASMA context.  Only side == PlasmaLeft together with
 *  trans == PlasmaConjTrans is handled; anything else fails the request with
 *  PLASMA_ERR_NOT_SUPPORTED.  Each thread takes tile columns of B starting
 *  at PLASMA_RANK and stepping by PLASMA_SIZE, and for every (k, m, n)
 *  triple applies cunmqr (m == k) or ctsmqr (m != k) to the B tiles,
 *  bracketed by ss_cond_wait / ss_cond_set.
 *
 *  NOTE(review): ss_cond_wait is a macro defined outside this block; in
 *  upstream PLASMA it may `break` out of the enclosing loop on abort, so
 *  the calls must stay directly inside the main while loop - confirm.
 *
 *  @param[in] plasma  PLASMA context holding the packed arguments.
 **/
void plasma_pcunmqr(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum trans;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    PLASMA_Complex32_t *work;
    int ib = PLASMA_IB;
    int k, m, n;
    int k_succ, m_succ, n_succ;
    int ldak, ldbk, ldam, ldbm;
    int tempkm, tempnn, tempkmin, tempmm;
    int minMT, minM;

    plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Only the left / conjugate-transpose case is implemented. */
    if (side != PlasmaLeft || trans != PlasmaConjTrans) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }

    work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    ss_init(B.mt, B.nt, -1);

    /* Smaller dimension of A, in elements and in tiles. */
    minM  = (A.m > A.n) ? A.n  : A.m;
    minMT = (A.m > A.n) ? A.nt : A.mt;

    /* Locate the first (k, n) pair assigned to this thread. */
    for (k = 0, n = PLASMA_RANK; n >= B.nt; n = n-B.nt)
        k++;
    m = k;

    while (!(k >= minMT || n >= B.nt)) {
        /* Work out the triple that follows (k, m, n) before touching it. */
        k_succ = k;
        n_succ = n;
        m_succ = m + 1;
        if (m_succ == A.mt) {
            /* Column finished: jump PLASMA_SIZE columns ahead, wrapping into
               the next step k whenever the index runs past the last column. */
            for (n_succ += PLASMA_SIZE;
                 n_succ >= B.nt && k_succ < minMT;
                 n_succ = n_succ-B.nt) {
                k_succ++;
            }
            m_succ = k_succ;
        }

        /* Edge tiles may be smaller than a full tile. */
        tempkmin = (k != minMT-1) ? A.nb : minM-k*A.nb;
        tempkm   = (k != B.mt-1) ? B.mb : B.m-k*B.mb;
        tempnn   = (n != B.nt-1) ? B.nb : B.n-n*B.nb;
        tempmm   = (m != B.mt-1) ? B.mb : B.m-m*B.mb;

        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        if (m != k) {
            /* Tile row m below the diagonal, coupled with tile row k. */
            ss_cond_wait(m, n, k-1);
            CORE_ctsmqr(
                side, trans,
                A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                B(k, n), ldbk,
                B(m, n), ldbm,
                A(m, k), ldam,
                T(m, k), T.mb,
                work, ib);
            ss_cond_set(m, n, k);
        }
        else {
            /* Diagonal tile row of step k. */
            ss_cond_wait(k, n, k-1);
            CORE_cunmqr(
                side, trans,
                tempkm, tempnn, tempkmin, ib,
                A(k, k), ldak,
                T(k, k), T.mb,
                B(k, n), ldbk,
                work, T.nb);
            ss_cond_set(k, n, k);
        }

        k = k_succ;
        n = n_succ;
        m = m_succ;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}
コード例 #7
0
ファイル: pdgetrf_rectil.c プロジェクト: agrippa/omp-to-x
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling - Right looking
 *
 *  First pass, for every step k < min(A.mt, A.nt): factorize the panel that
 *  starts at tile (k, k) with CORE_dgetrf_rectil, then for every tile column
 *  n > k apply the row swaps with CORE_dswptr_ontile and update the tiles
 *  below with cblas_dgemm.  Second pass: for every step k, apply the row
 *  swaps of step k to the tile columns n < k with CORE_dlaswp_ontile.
 *
 *  Each hclib_pragma_marker("omp", "task ...", ...) call carries the text of
 *  an OpenMP task directive and is placed immediately before the statement
 *  or block it refers to.  The clause strings name local variables (dA, dB,
 *  dC, fake1, fake2, ...), so those names and the statement order must not
 *  change.  NOTE(review): presumably consumed by a source-to-source
 *  translator - confirm.
 *
 *  @param[in] A     Descriptor of the matrix being factorized.
 *  @param[in] IPIV  Pivot storage, addressed through the IPIV(k) macro.
 **/
void plasma_pdgetrf_rectil_quark(PLASMA_desc A, int *IPIV)
{
    int k, m, n;
    int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
    int ldak, ldam;

    double zone  = (double)1.0;
    double mzone = (double)-1.0;

    /* Integer step index cast to a pointer; in this function it is only
       named inside depend clauses and is never dereferenced. */
    void * fakedep;
    /* How many threads per panel? Probably needs to be adjusted during factorization. */

    CORE_dgetrf_rectil_init();

    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        tempk  = k * A.mb;          /* first row of the panel */
        tempm  = A.m - tempk;       /* rows from the panel top to the bottom of A */
        tempkm = k == A.mt-1 ? tempm      : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);

        /* Panel: sub-matrix of tempm rows by tempkn columns starting at
           (tempk, k*A.nb). */
        double *dA = A(k, k);
        int *dB = IPIV(k);
        PLASMA_desc pDesc = plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn);
hclib_pragma_marker("omp", "task depend(inout:dA[0:A.mb*A.nb]) depend(out:dB[0:pDesc.n])", "pragma59_omp_task");
        {
            /* NOTE(review): meaning of info[1] / info[2] is defined by
               CORE_dgetrf_rectil; info[0] is left unset here - confirm. */
            int info[3];
            info[1] = 0;
            info[2] = 1;

            CORE_dgetrf_rectil( pDesc, dB, info );
        }

        /*
         * Update the trailing submatrix
         */
        fakedep = (void *)(intptr_t)(k+1);
        for (n = k+1; n < A.nt; n++)
        {
            /*
             * Apply row interchange after the panel (work on the panel)
             */
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            double *dA = A(k, n);
            double *dB = A(k, k);
            int *dipiv = IPIV(k);
hclib_pragma_marker("omp", "task depend(inout:dA[0:1]) depend(in:dB[0:ldak], dipiv[0:tempkm])", "pragma82_omp_task");
            CORE_dswptr_ontile(descA, 1, tempkm, dipiv, 1, dB, ldak);

            /* First tile row below the panel: dC := dC - dA * dB. */
            m = k+1;
            if ( m < A.mt ) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);

                double *dA = A(m , k);
                double *dB = A(k , n);
                double *dC = A(m , n);
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb]) depend(inout:dC[0:A.mb*A.mb])", "pragma93_omp_task");
                cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        tempmm, tempnn, A.nb,
                        mzone, dA, ldam,
                        dB, ldak,
                        zone, dC, ldam);

                /* Remaining tile rows: same update, with two extra entries
                   (fake1, fake2) appearing only in the depend clauses. */
                for (m = k+2; m < A.mt; m++)
                {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);

                    double *dA = A(m , k);
                    double *dB = A(k , n);
                    double *dC = A(m , n);
                    double *fake1 = A(k+1, n);
                    double *fake2 = (double *)fakedep;
hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb], fake2[0:1]) depend(inout:dC[0:A.mb*A.mb], fake1[0:A.mb*A.nb])", "pragma110_omp_task");
                        cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                                tempmm, tempnn, A.nb,
                                mzone, dA, ldam,
                                dB, ldak,
                                zone, dC, ldam);
                }
            }
        }
    }

    /* Second pass: row interchanges on the tile columns left of each panel. */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        int mintmp;
        tempk  = k * A.mb;
        tempm  = A.m - tempk;
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
        mintmp = min(tempkm, tempkn);   /* number of pivots applied for step k */
        ldak = BLKLDD(A, k);

        /*
         * Apply row interchange behind the panel (work on the panel)
         */
        fakedep = (void*)(intptr_t)k;
        for (n = 0; n < k; n++)
        {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            double *Aij = A(k, n);
            /* k >= 1 here because n < k, so tile row k-1 exists. */
            double *prevSwap = A(k-1, n);
            int *dipiv = IPIV(k);
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
hclib_pragma_marker("omp", "task depend(inout:Aij[0:1],fakedep) depend(in:dipiv[0:mintmp], prevSwap[0:A.lm*A.nb])", "pragma142_omp_task");
            CORE_dlaswp_ontile(descA, 1, mintmp, dipiv, 1);
        }
    }
}
コード例 #8
0
ファイル: pcsyr2k.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile symmetric rank-2k update - dynamic scheduling
 *
 *  For every diagonal tile C(n, n) one csyr2k task is submitted per tile of
 *  the inner dimension.  For every off-diagonal tile of the triangle
 *  selected by uplo, two cgemm tasks are submitted per inner tile: the
 *  first pairs A with B, the second pairs B with A.  beta is applied only by
 *  the first task that touches a given C tile (k == 0); all later tasks
 *  accumulate with a factor of one.
 *
 *  Each tile is passed with the leading dimension of the matrix it belongs
 *  to; in particular B(n, k) uses ldbn in every branch.
 *
 *  @param[in] uplo      PlasmaLower updates tiles C(m, n) with m > n; any
 *                       other value updates tiles C(n, m) with m > n.
 *  @param[in] trans     PlasmaNoTrans iterates over the tile columns of A;
 *                       any other value iterates over its tile rows.
 *  @param[in] alpha     Scalar applied to both products.
 *  @param[in] A         Descriptor of A.
 *  @param[in] B         Descriptor of B.
 *  @param[in] beta      Scalar applied to C.
 *  @param[in] C         Descriptor of the matrix being updated.
 *  @param[in] sequence  Sequence the tasks are attached to; nothing is
 *                       submitted when its status is not PLASMA_SUCCESS.
 *  @param[in] request   Request identifier (not used in this body).
 **/
void plasma_pcsyr2k_quark(PLASMA_enum uplo, PLASMA_enum trans,
                          PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B,
                          PLASMA_Complex32_t beta,  PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n, k;
    int ldak, ldam, ldan, ldcm, ldcn;
    int ldbk, ldbm, ldbn;
    int tempnn, tempmm, tempkn, tempkm;

    PLASMA_Complex32_t zone   = (PLASMA_Complex32_t)1.0;
    PLASMA_Complex32_t zbeta;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (n = 0; n < C.nt; n++) {
        tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
        ldan = BLKLDD(A, n);
        ldbn = BLKLDD(B, n);
        ldcn = BLKLDD(C, n);
        /*
         *  PlasmaNoTrans
         */
        if (trans == PlasmaNoTrans) {
            /* Diagonal tile C(n, n). */
            for (k = 0; k < A.nt; k++) {
                tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                zbeta = k == 0 ? beta : zone;
                QUARK_CORE_csyr2k(
                    plasma->quark, &task_flags,
                    uplo, trans,
                    tempnn, tempkn, A.mb,
                    alpha, A(n, k), ldan,
                           B(n, k), ldbn,
                    zbeta, C(n, n), ldcn);
            }
            /*
             *  PlasmaNoTrans / PlasmaLower
             */
            if (uplo == PlasmaLower) {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    ldcm = BLKLDD(C, m);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,
                                   B(n, k), ldbn,
                            zbeta, C(m, n), ldcm);

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, B(m, k), ldbm,
                                   A(n, k), ldan,
                            zone,  C(m, n), ldcm);
                    }
                }
            }
            /*
             *  PlasmaNoTrans / PlasmaUpper
             */
            else {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn, A.mb,
                            alpha, A(n, k), ldan,
                                   B(m, k), ldbm,
                            zbeta, C(n, m), ldcn);

                        /* B(n, k) is a tile of B, so it takes B's leading
                           dimension (ldbn), not A's. */
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn, A.mb,
                            alpha, B(n, k), ldbn,
                                   A(m, k), ldam,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
        }
        /*
         *  Plasma[Conj]Trans
         */
        else {
            /* Diagonal tile C(n, n). */
            for (k = 0; k < A.mt; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                zbeta = k == 0 ? beta : zone;
                QUARK_CORE_csyr2k(
                    plasma->quark, &task_flags,
                    uplo, trans,
                    tempnn, tempkm, A.mb,
                    alpha, A(k, n), ldak,
                           B(k, n), ldbk,
                    zbeta, C(n, n), ldcn);
            }
            /*
             *  Plasma[Conj]Trans / PlasmaLower
             */
            if (uplo == PlasmaLower) {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldcm = BLKLDD(C, m);
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,
                                   B(k, n), ldbk,
                            zbeta, C(m, n), ldcm);

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, B(k, m), ldbk,
                                   A(k, n), ldak,
                            zone,  C(m, n), ldcm);
                    }
                }
            }
            /*
             *  Plasma[Conj]Trans / PlasmaUpper
             */
            else {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm, A.mb,
                            alpha, A(k, n), ldak,
                                   B(k, m), ldbk,
                            zbeta, C(n, m), ldcn);

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm, A.mb,
                            alpha, B(k, n), ldbk,
                                   A(k, m), ldak,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
        }
    }
}
コード例 #9
0
ファイル: pssyrbt.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
 *
 *  Submits the reduction as a stream of QUARK tasks operating on the tiles of
 *  A.  Every task is tagged with the QUARK sequence taken from `sequence`.
 *
 *  uplo     - PlasmaLower selects the branch built on the sgeqrt / sormqr /
 *             stsqrt / stsmqr kernels; any other value selects the branch
 *             built on the sgelqt / sormlq / stslqt / stsmlq kernels.
 *  A        - descriptor of the tiled matrix the kernels operate on.
 *  T        - descriptor of the tiles handed to the kernels next to A
 *             (presumably the block-reflector factors - TODO confirm against
 *             the kernel documentation).
 *  sequence - nothing is submitted when its status is not PLASMA_SUCCESS.
 *  request  - not referenced anywhere in this routine.
 **/
void plasma_pssyrbt_quark(PLASMA_enum uplo, 
                          PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n, i, j;
    int ldak, ldam, ldan, ldaj, ldai;
    int tempkn, tempmm, tempnn, tempjj;
    int ib;

    plasma = plasma_context_self();
    /* Do not queue anything on a sequence that has already failed */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Attach every task submitted below to the caller's QUARK sequence */
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /* Inner blocking size, forwarded to every kernel below */
    ib = PLASMA_IB;
    if (uplo == PlasmaLower) {
       /* Step k works on tile column k and on the tiles of rows/columns > k */
       for (k = 0; k < A.nt-1; k++){
           /* Size of tile k+1 along n: A.nb, or the remainder when k+1 is the last tile */
           tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
           /* Leading dimension reported by BLKLDD for tile row k+1 */
           ldak = BLKLDD(A, k+1);
           /* sgeqrt on the tile directly below the diagonal of tile column k */
           QUARK_CORE_sgeqrt(
               plasma->quark, &task_flags,
               tempkn, A.nb, ib, T.nb,
               A(k+1, k), ldak,
               T(k+1, k), T.mb);

           /* LEFT and RIGHT on the symmetric diagonal block */
           QUARK_CORE_ssyrfb(
               plasma->quark, &task_flags,
               PlasmaLower,
               tempkn, tempkn, ib, T.nb,
               A(k+1,   k), ldak,
               T(k+1,   k), T.mb,
               A(k+1, k+1), ldak);

           /* RIGHT on the remaining tiles until the bottom */
           for (m = k+2; m < A.mt ; m++) {
               tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
               ldam = BLKLDD(A, m);
               QUARK_CORE_sormqr(
                   plasma->quark, &task_flags,
                   PlasmaRight, PlasmaNoTrans,
                   tempmm, A.nb, tempkn, ib, T.nb,
                   A(k+1,   k), ldak,
                   T(k+1,   k), T.mb,
                   A(m  , k+1), ldam);
           }

           /*
            * For every tile row m below k+1: stsqrt receives A(k+1,k) together
            * with A(m,k); the three groups of updates that follow all receive
            * A(m,k) and T(m,k).
            */
           for (m = k+2; m < A.mt; m++) {
               tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
               ldam = BLKLDD(A, m);
               QUARK_CORE_stsqrt(
                   plasma->quark, &task_flags,
                   tempmm, A.nb, ib, T.nb,
                   A(k+1, k), ldak,
                   A(m  , k), ldam,
                   T(m  , k), T.mb);

               /* LEFT */
               /* Tiles strictly between tile rows k+1 and m (i = k+2 .. m-1) */
               for (i = k+2; i < m; i++) {
                   ldai = BLKLDD(A, i);
                   QUARK_CORE_stsmqr_sytra1(
                       plasma->quark, &task_flags,
                       PlasmaLeft, PlasmaTrans,
                       A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb,
                       A(i, k+1), ldai,
                       A(m,   i), ldam,
                       A(m,   k), ldam,
                       T(m,   k), T.mb);
               }

               /* RIGHT */
               /* Tile rows below m (j = m+1 .. A.mt-1); tempjj is the size of tile row j */
               for (j = m+1; j < A.mt ; j++) {
                   tempjj = j == A.mt-1 ? A.m-j*A.mb : A.mb;
                   ldaj = BLKLDD(A, j);
                   QUARK_CORE_stsmqr(
                       plasma->quark, &task_flags,
                       PlasmaRight, PlasmaNoTrans,
                       tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb,
                       A(j, k+1), ldaj,
                       A(j,   m), ldaj,
                       A(m,   k), ldam,
                       T(m,   k), T.mb);
               }
       
               /* LEFT->RIGHT */
               /* Corner update on the diagonal tiles (k+1,k+1), (m,m) and the tile (m,k+1) joining them */
               QUARK_CORE_stsmqr_corner(
                   plasma->quark, &task_flags,
                   A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb,
                   A(k+1, k+1), ldak,
                   A(m  , k+1), ldam,
                   A(m  ,   m), ldam,
                   A(m  ,   k), ldam,
                   T(m  ,   k), T.mb);
           }
       }
    }
    else {
       /*
        * Same sequence of steps as the branch above with row and column tile
        * indices exchanged and the LQ-named kernels.  Tiles of tile row k are
        * passed with leading dimension A.nb; tiles of tile row k+1 use ldak.
        */
       for (k = 0; k < A.nt-1; k++){
           /* Size of tile k+1 along n: A.nb, or the remainder when k+1 is the last tile */
           tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
           /* Leading dimension reported by BLKLDD for tile row k+1 */
           ldak = BLKLDD(A, k+1);
           /* sgelqt on the tile directly right of the diagonal of tile row k */
           QUARK_CORE_sgelqt(
               plasma->quark, &task_flags,
               A.nb, tempkn, ib, T.nb,
               A(k, k+1), A.nb,
               T(k, k+1), T.mb);

           /* RIGHT and LEFT on the symmetric diagonal block             */
           QUARK_CORE_ssyrfb(
               plasma->quark, &task_flags,
               PlasmaUpper,
               tempkn, tempkn, ib, T.nb,
               A(k,   k+1), A.nb,
               T(k,   k+1), T.mb,
               A(k+1, k+1), ldak);

           /* LEFT on the remaining tiles of tile row k+1 (n = k+2 .. A.nt-1) */
           for (n = k+2; n < A.nt ; n++) {
               tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
               QUARK_CORE_sormlq(
                   plasma->quark, &task_flags,
                   PlasmaLeft, PlasmaNoTrans,
                   A.nb, tempnn, tempkn, ib, T.nb,
                   A(k,   k+1), A.nb,
                   T(k,   k+1), T.mb,
                   A(k+1,   n), ldak);
           }

           /*
            * For every tile column n right of k+1: stslqt receives A(k,k+1)
            * together with A(k,n); the three groups of updates that follow all
            * receive A(k,n) and T(k,n).
            */
           for (n = k+2; n < A.nt; n++) {
               tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
               ldan = BLKLDD(A, n);
               QUARK_CORE_stslqt(
                   plasma->quark, &task_flags,
                   A.nb, tempnn, ib, T.nb,
                   A(k, k+1), A.nb,
                   A(k,   n), A.nb,
                   T(k,   n), T.mb);

               /* RIGHT */
               /* Tiles strictly between tile columns k+1 and n (i = k+2 .. n-1) */
               for (i = k+2; i < n; i++) {
                   ldai = BLKLDD(A, i);
                   QUARK_CORE_stsmlq_sytra1(
                       plasma->quark, &task_flags,
                       PlasmaRight, PlasmaTrans,
                       A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb,
                       A(k+1, i), ldak,
                       A(i,   n), ldai,
                       A(k,   n), A.nb,
                       T(k,   n), T.mb);
               }

               /* LEFT */
               /* Tile columns right of n (j = n+1 .. A.nt-1); tempjj is the size of tile column j */
               for (j = n+1; j < A.nt ; j++) {
                   tempjj = j == A.nt-1 ? A.n-j*A.nb : A.nb;
                   ldaj = BLKLDD(A, j);
                   QUARK_CORE_stsmlq(
                       plasma->quark, &task_flags,
                       PlasmaLeft, PlasmaNoTrans,
                       A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb,
                       A(k+1, j), ldak,
                       A(n,   j), ldan,
                       A(k,   n), A.nb,
                       T(k,   n), T.mb);
               }
       
               /* RIGHT->LEFT */
               /* Corner update on the diagonal tiles (k+1,k+1), (n,n) and the tile (k+1,n) joining them */
               QUARK_CORE_stsmlq_corner(
                   plasma->quark, &task_flags,
                   A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb,
                   A(k+1, k+1), ldak,
                   A(k+1,   n), ldak,
                   A(n  ,   n), ldan,
                   A(k  ,   n), A.nb,
                   T(k  ,   n), T.mb);
           }
       }
    }
}
コード例 #10
0
ファイル: pztrtri.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile triangular matrix inverse - dynamic scheduling
 *
 *  Queues ztrsm, zgemm and ztrtri tasks on the tiles of A, walking the
 *  diagonal tiles in increasing order.  All tasks are attached to the QUARK
 *  sequence carried by `sequence`.
 *
 *  uplo     - PlasmaLower walks tile columns; any other value takes the
 *             upper branch and walks tile rows.
 *  diag     - forwarded unchanged to the ztrsm and ztrtri kernels.
 *  A        - descriptor of the tiled triangular matrix.
 *  sequence - nothing is queued when its status is not PLASMA_SUCCESS; it is
 *             also handed to every ztrtri task.
 *  request  - handed to every ztrtri task together with the element offset
 *             of the diagonal tile being processed.
 **/
void plasma_pztrtri_quark(PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    const PLASMA_Complex64_t pos_one = (PLASMA_Complex64_t) 1.0;
    const PLASMA_Complex64_t neg_one = (PLASMA_Complex64_t)-1.0;

    int k, m, n;

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /*
     *  PlasmaLower: one pass per tile column n
     */
    if (uplo == PlasmaLower) {
        for (n = 0; n < A.nt; n++) {
            const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
            const int ld_n   = BLKLDD(A, n);

            /* Tiles below the diagonal tile: triangular solve from the right, scaled by -1 */
            for (m = n+1; m < A.mt; m++) {
                const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
                const int ld_m   = BLKLDD(A, m);

                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaNoTrans, diag,
                    rows_m, cols_n, A.mb,
                    neg_one, A(n, n), ld_n,
                             A(m, n), ld_m);
            }

            /* Accumulate A(m,n) * A(n,k) into the tiles left of column n */
            for (m = n+1; m < A.mt; m++) {
                const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
                const int ld_m   = BLKLDD(A, m);

                for (k = 0; k < n; k++) {
                    const int cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;

                    QUARK_CORE_zgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaNoTrans,
                        rows_m, cols_k, cols_n, A.mb,
                        pos_one, A(m, n), ld_m,
                                 A(n, k), ld_n,
                        pos_one, A(m, k), ld_m);
                }
            }

            /* Tiles of tile row n left of the diagonal: triangular solve from the left */
            for (m = 0; m < n; m++) {
                const int size_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;

                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaNoTrans, diag,
                    cols_n, size_m, A.mb,
                    pos_one, A(n, n), ld_n,
                             A(n, m), ld_n);
            }

            /* Finally the diagonal tile itself */
            QUARK_CORE_ztrtri(
                plasma->quark, &task_flags,
                uplo, diag,
                cols_n, A.mb,
                A(n, n), ld_n,
                sequence, request, A.nb*n);
        }
        return;
    }

    /*
     *  PlasmaUpper: one pass per tile row m
     */
    for (m = 0; m < A.mt; m++) {
        const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
        const int ld_m   = BLKLDD(A, m);

        /* Tiles right of the diagonal tile: triangular solve from the left, scaled by -1 */
        for (n = m+1; n < A.nt; n++) {
            const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaLeft, uplo, PlasmaNoTrans, diag,
                rows_m, cols_n, A.mb,
                neg_one, A(m, m), ld_m,
                         A(m, n), ld_m);
        }

        /* Tiles above the diagonal tile: accumulate to the right, then solve from the right */
        for (n = 0; n < m; n++) {
            const int size_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
            const int ld_n   = BLKLDD(A, n);

            for (k = m+1; k < A.nt; k++) {
                const int cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;

                QUARK_CORE_zgemm(
                    plasma->quark, &task_flags,
                    PlasmaNoTrans, PlasmaNoTrans,
                    size_n, cols_k, rows_m, A.mb,
                    pos_one, A(n, m), ld_n,
                             A(m, k), ld_m,
                    pos_one, A(n, k), ld_n);
            }

            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaRight, uplo, PlasmaNoTrans, diag,
                size_n, rows_m, A.mb,
                pos_one, A(m, m), ld_m,
                         A(n, m), ld_n);
        }

        /* Finally the diagonal tile itself */
        QUARK_CORE_ztrtri(
            plasma->quark, &task_flags,
            uplo, diag,
            rows_m, A.mb,
            A(m, m), ld_m,
            sequence, request, A.mb*m);
    }
}
コード例 #11
0
ファイル: pdsymm.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling
 *
 *  For every tile C(m,n) a chain of QUARK tasks is queued, one per index k:
 *  a dsymm task when k hits the diagonal tile of A and a dgemm task
 *  otherwise.  The first task of each chain scales C(m,n) by `beta`, the
 *  following ones accumulate with a factor of 1.
 *
 *  Off-diagonal tiles of A are always read from the triangle named by `uplo`:
 *  when the tile that is needed lies in the other triangle, the mirrored tile
 *  is passed with PlasmaTrans instead.
 *
 *  side     - PlasmaLeft puts A on the left of B; any other value is
 *             handled as the right-hand case.
 *  uplo     - PlasmaLower, or any other value for the upper case.
 *  alpha    - scaling factor forwarded to every kernel.
 *  A, B, C  - tile descriptors of the operands; C is updated.
 *  beta     - scaling factor applied to C by the first task of each chain.
 *  sequence - nothing is queued when its status is not PLASMA_SUCCESS.
 *  request  - not referenced in this routine.
 **/
void plasma_pdsymm_quark(PLASMA_enum side, PLASMA_enum uplo,
                          double alpha, PLASMA_desc A, PLASMA_desc B,
                          double beta, PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma = plasma_context_self();
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    const double one   = 1.0;
    const int    lower = (uplo == PlasmaLower);

    int k, m, n;

    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < C.mt; m++) {
        const int rows_m = (m == C.mt-1) ? C.m - m*C.mb : C.mb;
        const int ldc_m  = BLKLDD(C, m);

        for (n = 0; n < C.nt; n++) {
            const int cols_n = (n == C.nt-1) ? C.n - n*C.nb : C.nb;

            if (side == PlasmaLeft) {
                /*
                 *  PlasmaLeft: k runs over the tile rows of B
                 */
                const int lda_m = BLKLDD(A, m);

                for (k = 0; k < C.mt; k++) {
                    const int    inner = (k == C.mt-1) ? C.m - k*C.mb : C.mb;
                    const int    lda_k = BLKLDD(A, k);
                    const int    ldb_k = BLKLDD(B, k);
                    const double scale = (k == 0) ? beta : one;

                    if (k == m) {
                        /* Diagonal tile of A */
                        QUARK_CORE_dsymm(
                            plasma->quark, &task_flags,
                            side, uplo,
                            rows_m, cols_n, A.mb,
                            alpha, A(k, k), lda_k,
                                   B(k, n), ldb_k,
                            scale, C(m, n), ldc_m);
                    }
                    else if (lower ? (k < m) : (k > m)) {
                        /* A(m,k) lies in the referenced triangle: use it as is */
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            PlasmaNoTrans, PlasmaNoTrans,
                            rows_m, cols_n, inner, A.mb,
                            alpha, A(m, k), lda_m,
                                   B(k, n), ldb_k,
                            scale, C(m, n), ldc_m);
                    }
                    else {
                        /* Otherwise use the mirrored tile A(k,m), transposed */
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            PlasmaTrans, PlasmaNoTrans,
                            rows_m, cols_n, inner, A.mb,
                            alpha, A(k, m), lda_k,
                                   B(k, n), ldb_k,
                            scale, C(m, n), ldc_m);
                    }
                }
            }
            else {
                /*
                 *  PlasmaRight: k runs over the tile columns of B
                 */
                const int lda_n = BLKLDD(A, n);
                const int ldb_m = BLKLDD(B, m);

                for (k = 0; k < C.nt; k++) {
                    const int    inner = (k == C.nt-1) ? C.n - k*C.nb : C.nb;
                    const int    lda_k = BLKLDD(A, k);
                    const double scale = (k == 0) ? beta : one;

                    if (k == n) {
                        /* Diagonal tile of A */
                        QUARK_CORE_dsymm(
                            plasma->quark, &task_flags,
                            side, uplo,
                            rows_m, cols_n, A.mb,
                            alpha, A(k, k), lda_k,
                                   B(m, k), ldb_m,
                            scale, C(m, n), ldc_m);
                    }
                    else if (lower ? (k < n) : (k > n)) {
                        /* A(k,n) is outside the referenced triangle: use A(n,k), transposed */
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            PlasmaNoTrans, PlasmaTrans,
                            rows_m, cols_n, inner, A.mb,
                            alpha, B(m, k), ldb_m,
                                   A(n, k), lda_n,
                            scale, C(m, n), ldc_m);
                    }
                    else {
                        /* A(k,n) lies in the referenced triangle: use it as is */
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            PlasmaNoTrans, PlasmaNoTrans,
                            rows_m, cols_n, inner, A.mb,
                            alpha, B(m, k), ldb_m,
                                   A(k, n), lda_k,
                            scale, C(m, n), ldc_m);
                    }
                }
            }
        }
    }
}
コード例 #12
0
ファイル: pdsymm.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile symmetric matrix-matrix multiplication - static scheduling
 *
 *  Arguments (side, uplo, alpha, A, B, beta, C, sequence, request) are
 *  unpacked from the context.  The calling thread starts at the tile of C
 *  whose column-major position is PLASMA_RANK and then advances by
 *  PLASMA_SIZE positions at a time.  For each of its tiles C(m,n) it runs
 *  the full k loop itself: CORE_dsymm on the diagonal tile of A, CORE_dgemm
 *  on every other tile.  The first call of the k loop scales C(m,n) by
 *  `beta`; the following ones accumulate with a factor of 1.
 *
 *  Off-diagonal tiles of A are always read from the triangle named by `uplo`:
 *  when the tile that is needed lies in the other triangle, the mirrored tile
 *  is passed with PlasmaTrans instead.
 *
 *  Returns without doing any work when the sequence status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_pdsymm(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum uplo;
    double alpha;
    PLASMA_desc A;
    PLASMA_desc B;
    double beta;
    PLASMA_desc C;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    const double one = 1.0;
    int k, m, n;
    int next_m, next_n;

    plasma_unpack_args_9(side, uplo, alpha, A, B, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Locate this thread's first tile: fold the rank into a (row, column) pair */
    n = 0;
    for (m = PLASMA_RANK; m >= C.mt && n < C.nt; m -= C.mt)
        n++;

    for (; n < C.nt; m = next_m, n = next_n) {
        /* Work out the tile after this one before touching the current tile */
        next_n = n;
        next_m = m + PLASMA_SIZE;
        while (next_m >= C.mt && next_n < C.nt) {
            next_m -= C.mt;
            next_n++;
        }

        const int lower  = (uplo == PlasmaLower);
        const int rows_m = (m == C.mt-1) ? C.m - m*C.mb : C.mb;
        const int cols_n = (n == C.nt-1) ? C.n - n*C.nb : C.nb;
        const int ldc_m  = BLKLDD(C, m);

        if (side == PlasmaLeft) {
            /*
             *  PlasmaLeft: k runs over the tile rows of B
             */
            const int lda_m = BLKLDD(A, m);

            for (k = 0; k < C.mt; k++) {
                const int    inner = (k == C.mt-1) ? C.m - k*C.mb : C.mb;
                const int    lda_k = BLKLDD(A, k);
                const int    ldb_k = BLKLDD(B, k);
                const double scale = (k == 0) ? beta : one;

                if (k == m) {
                    /* Diagonal tile of A */
                    CORE_dsymm(
                        side, uplo,
                        rows_m, cols_n,
                        alpha, A(k, k), lda_k,
                               B(k, n), ldb_k,
                        scale, C(m, n), ldc_m);
                }
                else if (lower ? (k < m) : (k > m)) {
                    /* A(m,k) lies in the referenced triangle: use it as is */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaNoTrans,
                        rows_m, cols_n, inner,
                        alpha, A(m, k), lda_m,
                               B(k, n), ldb_k,
                        scale, C(m, n), ldc_m);
                }
                else {
                    /* Otherwise use the mirrored tile A(k,m), transposed */
                    CORE_dgemm(
                        PlasmaTrans, PlasmaNoTrans,
                        rows_m, cols_n, inner,
                        alpha, A(k, m), lda_k,
                               B(k, n), ldb_k,
                        scale, C(m, n), ldc_m);
                }
            }
        }
        else {
            /*
             *  PlasmaRight: k runs over the tile columns of B
             */
            const int lda_n = BLKLDD(A, n);
            const int ldb_m = BLKLDD(B, m);

            for (k = 0; k < C.nt; k++) {
                const int    inner = (k == C.nt-1) ? C.n - k*C.nb : C.nb;
                const int    lda_k = BLKLDD(A, k);
                const double scale = (k == 0) ? beta : one;

                if (n == k) {
                    /* Diagonal tile of A */
                    CORE_dsymm(
                        side, uplo,
                        rows_m, cols_n,
                        alpha, A(k, k), lda_k,
                               B(m, k), ldb_m,
                        scale, C(m, n), ldc_m);
                }
                else if (lower ? (k < n) : (k > n)) {
                    /* A(k,n) is outside the referenced triangle: use A(n,k), transposed */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaTrans,
                        rows_m, cols_n, inner,
                        alpha, B(m, k), ldb_m,
                               A(n, k), lda_n,
                        scale, C(m, n), ldc_m);
                }
                else {
                    /* A(k,n) lies in the referenced triangle: use it as is */
                    CORE_dgemm(
                        PlasmaNoTrans, PlasmaNoTrans,
                        rows_m, cols_n, inner,
                        alpha, B(m, k), ldb_m,
                               A(k, n), lda_k,
                        scale, C(m, n), ldc_m);
                }
            }
        }
    }
}
コード例 #13
0
/***************************************************************************//**
 *  Parallel tile LU factorization - static scheduling
 *
 *  Arguments (A, L, IPIV, sequence, request) are unpacked from the context.
 *  Work is enumerated as triples (k, m, n): step k, tile row m, tile column
 *  n.  The calling thread starts in tile column PLASMA_RANK, walks the tile
 *  rows m = k .. A.mt-1 of that column, then moves PLASMA_SIZE columns on,
 *  wrapping into the next step when it runs past the last tile column.
 *
 *  Ordering between threads goes through the ss_cond_wait / ss_cond_set
 *  progress table, initialised to -1 for every tile.
 *
 *  A nonzero `info` from a panel kernel on the last tile row fails the
 *  request with `info + A.nb*k` and raises the abort flag, which ends the
 *  main loop.
 *
 *  Returns without doing any work when the sequence status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_psgetrf_incpiv(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k, next_m, next_n;
    int info;
    int ib = PLASMA_IB;
    float *work;

    plasma_unpack_args_5(A, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (float*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
    ss_init(A.mt, A.nt, -1);

    /* Locate this thread's first (step, column) pair from its rank */
    n = PLASMA_RANK;
    for (k = 0; n >= A.nt; n += k - A.nt)
        k++;
    m = k;

    for (; k < min(A.mt, A.nt) && n < A.nt && !ss_aborted();
           n = next_n, m = next_m, k = next_k) {
        /* Successor of (k, m, n): next tile row, or next column / step */
        next_k = k;
        next_n = n;
        next_m = m + 1;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n += next_k - A.nt;
            }
            next_m = next_k;
        }

        /* Tile extents (last tile row / column may be partial) and leading dimensions */
        const int rows_m = (m == A.mt-1) ? A.m - m*A.mb : A.mb;
        const int rows_k = (k == A.mt-1) ? A.m - k*A.mb : A.mb;
        const int cols_k = (k == A.nt-1) ? A.n - k*A.nb : A.nb;
        const int cols_n = (n == A.nt-1) ? A.n - n*A.nb : A.nb;
        const int ld_k   = BLKLDD(A, k);
        const int ld_m   = BLKLDD(A, m);

        if (n == k && m == k) {
            /* Panel column, diagonal tile */
            ss_cond_wait(k, k, k-1);
            CORE_sgetrf_incpiv(
                rows_k, cols_k, ib,
                A(k, k), ld_k,
                IPIV(k, k), &info);
            if (info != 0 && m == A.mt-1) {
                plasma_request_fail(sequence, request, info + A.nb*k);
                ss_abort();
            }
            ss_cond_set(k, k, k);
        }
        else if (n == k) {
            /* Panel column, tile below the diagonal */
            ss_cond_wait(m, k, k-1);
            CORE_ststrf(
                rows_m, cols_k, ib, A.nb,
                A(k, k), ld_k,
                A(m, k), ld_m,
                L(m, k), L.mb,
                IPIV(m, k),
                work, L.nb, &info);
            if (info != 0 && m == A.mt-1) {
                plasma_request_fail(sequence, request, info + A.nb*k);
                ss_abort();
            }
            ss_cond_set(m, k, k);
        }
        else if (m == k) {
            /* Trailing column, tile in row k: needs the diagonal tile of step k */
            ss_cond_wait(k, k, k);
            ss_cond_wait(k, n, k-1);
            CORE_sgessm(
                rows_k, cols_n, rows_k, ib,
                IPIV(k, k),
                A(k, k), ld_k,
                A(k, n), ld_k);
        }
        else {
            /* Trailing column, tile below row k: needs panel tile (m,k) of step k */
            ss_cond_wait(m, k, k);
            ss_cond_wait(m, n, k-1);
            CORE_sssssm(
                A.nb, cols_n, rows_m, cols_n, A.nb, ib,
                A(k, n), ld_k,
                A(m, n), ld_m,
                L(m, k), L.mb,
                A(m, k), ld_m,
                IPIV(m, k));
            ss_cond_set(m, n, k);
        }
    }

    plasma_private_free(plasma, work);
    ss_finalize();
}
コード例 #14
0
ファイル: pztrsmpl.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel forward substitution for tile LU - static scheduling
 *
 *  The arguments A, B, L, IPIV, sequence and request are unpacked from the
 *  PLASMA context.  Tiles of B are passed to CORE_zgessm / CORE_zssssm
 *  together with the matching tiles of A, L and IPIV.
 *
 *  Work distribution (as coded below): each thread starts on tile column
 *  n = PLASMA_RANK of B and walks down the rows m = k .. A.mt-1 of that
 *  column; it then jumps PLASMA_SIZE columns ahead, wrapping into the next
 *  step k whenever the column index runs past B.nt.
 *
 *  NOTE(review): ss_init / ss_cond_wait / ss_cond_set / ss_finalize are
 *  defined outside this view; they presumably maintain a per-tile progress
 *  table - confirm before restructuring the loop around them.
 **/
void plasma_pztrsmpl(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldbk, ldam, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int ib;

    plasma_unpack_args_6(A, B, L, IPIV, sequence, request);
    /* Do nothing unless the sequence is still in the PLASMA_SUCCESS state. */
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* One progress entry per tile of B, all starting at -1. */
    ss_init(B.mt, B.nt, -1);

    ib = PLASMA_IB;
    /* Map this thread's rank onto its first (step k, column n) pair. */
    k = 0;
    n = PLASMA_RANK;
    while (n >= B.nt) {
        k++;
        n = n-B.nt;
    }
    /* Each column sweep for step k starts on the diagonal row m == k. */
    m = k;

    while (k < min(A.mt, A.nt) && n < B.nt) {
        /* Pre-compute the coordinates of the tile handled after this one. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            /* Column finished: move PLASMA_SIZE columns on, wrapping to the
             * next step k each time the index passes the last column. */
            next_n += PLASMA_SIZE;
            while (next_n >= B.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-B.nt;
            }
            next_m = next_k;
        }

        /* Actual tile extents; the last tile row/column may be smaller. */
        tempkm   = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn   = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempkmin = k == min(A.mt, A.nt)-1 ? min(A.m, A.n)-k*A.mb : A.mb;
        tempnn   = n == B.nt-1 ? B.n-n*B.nb : B.nb;
        tempmm   = m == A.mt-1 ? A.m-m*A.mb : A.mb;

        /* Leading dimensions of the tiles in rows k and m of A and B. */
        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        if (m == k) {
            /* Diagonal row: B(k, n) must have completed step k-1 first. */
            ss_cond_wait(k, n, k-1);
            CORE_zgessm(
                tempkm, tempnn, tempkmin, ib,
                IPIV(k, k),
                A(k, k), ldak,
                B(k, n), ldbk);
            ss_cond_set(k, n, k);
        }
        else {
            /* Row below the diagonal: B(m, n) must have completed step k-1;
             * the kernel receives both B(k, n) and B(m, n). */
            ss_cond_wait(m, n, k-1);
            CORE_zssssm(
                A.nb, tempnn, tempmm, tempnn, tempkn, ib,
                B(k, n), ldbk,
                B(m, n), ldbm,
                L(m, k), L.mb,
                A(m, k), ldam,
                IPIV(m, k));
            ss_cond_set(m, n, k);
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    ss_finalize();
}
コード例 #15
0
ファイル: pdormqr.c プロジェクト: adcastel/ULT_work
/***************************************************************************//**
 *  Application of Q using tile V - QR factorization.
 *
 *  Walks the reflector panels stored in A (with their T factors) and applies
 *  them to the tiles of B: CORE_dormqr for the diagonal tile A(k,k)/T(k,k),
 *  CORE_dtsmqr for every off-diagonal tile A(m,k)/T(m,k) (left side) or
 *  A(n,k)/T(n,k) (right side).  In this version the kernels are called
 *  directly from the loops below; no task insertion is visible here despite
 *  the _quark suffix.
 *
 *  side   PlasmaLeft selects the left-side loops; any other value selects
 *         the right-side loops.
 *  trans  PlasmaTrans selects the transposed ordering; any other value
 *         selects the non-transposed ordering.
 *  A, T   descriptors supplying the reflector tiles and their T factors.
 *  B      descriptor whose tiles are handed to the kernels for update.
 *  ib     inner blocking size forwarded to the kernels.
 **/
void plasma_pdormqr_quark(PLASMA_enum side, PLASMA_enum trans,
                          PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int ib)
{
    int k, m, n;
    int lda_k, ldb_k, lda_m, lda_n, ldb_m;
    int rows_k, cols_k, rows_m, cols_n, refl_k;
    int panels, mindim;
    int ldw_ts;
    double *work;
    double *c1, *c2, *v, *t;

    /* The number of panels follows the smaller dimension of A. */
    panels = A.mt;
    mindim = A.m;
    if (A.m > A.n) {
        panels = A.nt;
        mindim = A.n;
    }

    /* One stack scratch buffer shared by every kernel call. */
    work = (double *)alloca(sizeof(double) * T.nb * ib);

    /* Leading dimension of the scratch buffer as seen by CORE_dtsmqr. */
    if (side == PlasmaLeft)
        ldw_ts = ib;
    else
        ldw_ts = T.nb;

    if (side == PlasmaLeft && trans == PlasmaTrans) {
        /*
         *  PlasmaLeft / PlasmaTrans: panels first to last, diagonal tile first
         */
        for (k = 0; k < panels; k++) {
            rows_k = (k != B.mt-1)   ? B.mb : B.m - k*B.mb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);

            for (n = 0; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                v  = A(k, k);
                t  = T(k, k);
                c1 = B(k, n);
                CORE_dormqr(side, trans, rows_k, cols_n, refl_k, ib,
                            v,  lda_k,
                            t,  T.mb,
                            c1, ldb_k, work, T.nb);
            }

            for (m = k+1; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                lda_m  = BLKLDD(A, m);
                ldb_m  = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                    c1 = B(k, n);
                    c2 = B(m, n);
                    v  = A(m, k);
                    t  = T(m, k);
                    CORE_dtsmqr(side, trans,
                                B.mb, cols_n, rows_m, cols_n, refl_k, ib,
                                c1, ldb_k,
                                c2, ldb_m,
                                v,  lda_m,
                                t,  T.mb, work, ldw_ts);
                }
            }
        }
    }
    else if (side == PlasmaLeft) {
        /*
         *  PlasmaLeft / PlasmaNoTrans: panels last to first, diagonal tile last
         */
        for (k = panels-1; k >= 0; k--) {
            rows_k = (k != B.mt-1)   ? B.mb : B.m - k*B.mb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);

            for (m = B.mt-1; m > k; m--) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                lda_m  = BLKLDD(A, m);
                ldb_m  = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                    c1 = B(k, n);
                    c2 = B(m, n);
                    v  = A(m, k);
                    t  = T(m, k);
                    CORE_dtsmqr(side, trans,
                                B.mb, cols_n, rows_m, cols_n, refl_k, ib,
                                c1, ldb_k,
                                c2, ldb_m,
                                v,  lda_m,
                                t,  T.mb, work, ldw_ts);
                }
            }

            for (n = 0; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                v  = A(k, k);
                t  = T(k, k);
                c1 = B(k, n);
                CORE_dormqr(side, trans, rows_k, cols_n, refl_k, ib,
                            v,  lda_k,
                            t,  T.mb,
                            c1, ldb_k, work, T.nb);
            }
        }
    }
    else if (trans == PlasmaTrans) {
        /*
         *  PlasmaRight / PlasmaTrans: panels last to first, diagonal tile last
         */
        for (k = panels-1; k >= 0; k--) {
            cols_k = (k != B.nt-1)   ? B.nb : B.n - k*B.nb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);   /* computed as before; no kernel in this branch reads it */

            for (n = B.nt-1; n > k; n--) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                lda_n  = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                    ldb_m  = BLKLDD(B, m);
                    c1 = B(m, k);
                    c2 = B(m, n);
                    v  = A(n, k);
                    t  = T(n, k);
                    CORE_dtsmqr(side, trans,
                                rows_m, B.nb, rows_m, cols_n, refl_k, ib,
                                c1, ldb_m,
                                c2, ldb_m,
                                v,  lda_n,
                                t,  T.mb, work, ldw_ts);
                }
            }

            for (m = 0; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                ldb_m  = BLKLDD(B, m);
                v  = A(k, k);
                t  = T(k, k);
                c1 = B(m, k);
                CORE_dormqr(side, trans, rows_m, cols_k, refl_k, ib,
                            v,  lda_k,
                            t,  T.mb,
                            c1, ldb_m, work, T.nb);
            }
        }
    }
    else {
        /*
         *  PlasmaRight / PlasmaNoTrans: panels first to last, diagonal tile first
         */
        for (k = 0; k < panels; k++) {
            cols_k = (k != B.nt-1)   ? B.nb : B.n - k*B.nb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);

            for (m = 0; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                ldb_m  = BLKLDD(B, m);
                v  = A(k, k);
                t  = T(k, k);
                c1 = B(m, k);
                CORE_dormqr(side, trans, rows_m, cols_k, refl_k, ib,
                            v,  lda_k,
                            t,  T.mb,
                            c1, ldb_m, work, T.nb);
            }

            for (n = k+1; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                lda_n  = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                    ldb_m  = BLKLDD(B, m);
                    c1 = B(m, k);
                    c2 = B(m, n);
                    v  = A(n, k);
                    t  = T(n, k);
                    CORE_dtsmqr(side, trans,
                                rows_m, B.nb, rows_m, cols_n, refl_k, ib,
                                c1, ldb_m,
                                c2, ldb_m,
                                v,  lda_n,
                                t,  T.mb, work, ldw_ts);
                }
            }
        }
    }
}
コード例 #16
0
ファイル: pdgeqrf.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile QR factorization - static scheduling
 *
 *  The arguments A, T, sequence and request are unpacked from the PLASMA
 *  context.  Tiles of A are passed to CORE_dgeqrt / CORE_dtsqrt (panel
 *  column n == k) and to CORE_dormqr / CORE_dtsmqr (trailing columns
 *  n > k); the tiles of T receive the matching T factors.
 *
 *  Work distribution (as coded below): each thread starts on tile column
 *  n = PLASMA_RANK and walks down the rows m = k .. A.mt-1 of that column;
 *  it then jumps PLASMA_SIZE columns ahead.  Whenever the column index runs
 *  past A.nt the step k is incremented and the index wraps to
 *  n - A.nt + k, so only columns n >= k are visited at step k.
 *
 *  NOTE(review): ss_init / ss_cond_wait / ss_cond_set / ss_finalize are
 *  defined outside this view; they presumably maintain a per-tile progress
 *  table - confirm before restructuring the loop around them.
 **/
void plasma_pdgeqrf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int tempkm, tempkn, tempnn, tempmm;
    int ib = PLASMA_IB;
    double *work, *tau;

    plasma_unpack_args_4(A, T, sequence, request);
    /* Do nothing unless the sequence is still in the PLASMA_SUCCESS state. */
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Per-thread scratch: ib*T.nb elements for the kernels, A.nb for tau. */
    work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    /* One progress entry per tile of A, all starting at -1. */
    ss_init(A.mt, A.nt, -1);

    /* Map this thread's rank onto its first (step k, column n) pair. */
    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt) {
        k++;
        n = n-A.nt+k;
    }
    /* Each column sweep for step k starts on the diagonal row m == k. */
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt) {
        /* Pre-compute the coordinates of the tile handled after this one. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            /* Column finished: move PLASMA_SIZE columns on; on overflow go
             * to the next step and skip the columns left of its diagonal. */
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        /* Actual tile extents; the last tile row/column may be smaller. */
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;

        /* Leading dimensions of the tiles in rows k and m of A. */
        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            /* Panel column of step k. */
            if (m == k) {
                /* Diagonal tile: needs A(k, k) finished with step k-1. */
                ss_cond_wait(k, k, k-1);
                CORE_dgeqrt(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    tau, work);
                ss_cond_set(k, k, k);
            }
            else {
                /* Tile below the diagonal, processed together with A(k, k). */
                ss_cond_wait(m, k, k-1);
                CORE_dtsqrt(
                    tempmm, tempkn, ib,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    tau, work);
                ss_cond_set(m, k, k);
            }
        }
        else {
            /* Trailing column n > k: apply the step-k panel results. */
            if (m == k) {
                /* Needs the diagonal panel tile at step k and A(k, n) at
                 * step k-1.
                 * NOTE(review): no ss_cond_set follows this kernel, unlike
                 * the other branches - presumably fine because the later
                 * CORE_dtsmqr calls on column n run on this same thread;
                 * confirm. */
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_dormqr(
                    PlasmaLeft, PlasmaTrans,
                    tempkm, tempnn, tempkm, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    A(k, n), ldak,
                    work, T.nb);
            }
            else {
                /* Needs panel tile (m, k) at step k and A(m, n) at step k-1. */
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_dtsmqr(
                    PlasmaLeft, PlasmaTrans,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    work, ib);
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
コード例 #17
0
ファイル: psgeqrfrh.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
 *
 *  For every panel k the tile rows are cut into subdomains of BS rows.  Each
 *  subdomain is factored from its head row downwards (sgeqrt + sormqr on the
 *  head, stsqrt + stsmqr on the rows below it); the subdomain heads are then
 *  merged pairwise at doubling distances (sttqrt + sttmqr).  A lone bottom
 *  tile row is never a subdomain of its own: it is absorbed by the subdomain
 *  above it.  All kernels are inserted as QUARK tasks tagged with the
 *  sequence's quark_sequence.
 *
 *  Returns without inserting anything when sequence->status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_psgeqrfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int steps, head, dist, mate;
    int ld_head, ld_m, ld_mate;
    int cols_k, cols_n, rows_head, rows_m, rows_mate;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib    = PLASMA_IB;
    steps = min(A.mt, A.nt);

    for (k = 0; k < steps; k++) {
        cols_k = (k != A.nt-1) ? A.nb : A.n - k*A.nb;

        /*
         *  Flat factorization inside each subdomain.  The first subdomain
         *  (head == k) always runs; later ones only while their head is above
         *  the last tile row, so there is no bottom single-row subdomain.
         */
        head = k;
        do {
            rows_head = (head != A.mt-1) ? A.mb : A.m - head*A.mb;
            ld_head   = BLKLDD(A, head);

            QUARK_CORE_sgeqrt(
                plasma->quark, &flags,
                rows_head, cols_k, ib, T.nb,
                A(head, k), ld_head,
                T(head, k), T.mb);

            for (n = k+1; n < A.nt; n++) {
                cols_n = (n != A.nt-1) ? A.nb : A.n - n*A.nb;
                QUARK_CORE_sormqr(
                    plasma->quark, &flags,
                    PlasmaLeft, PlasmaTrans,
                    rows_head, cols_n, rows_head, ib, T.nb,
                    A(head, k), ld_head,
                    T(head, k), T.mb,
                    A(head, n), ld_head);
            }

            /* Rows of this subdomain, plus the last tile row if it directly
             * follows the subdomain. */
            for (m = head+1;
                 (m < head+BS && m < A.mt) || m == A.mt-1;
                 m++) {
                rows_m = (m != A.mt-1) ? A.mb : A.m - m*A.mb;
                ld_m   = BLKLDD(A, m);

                QUARK_CORE_stsqrt(
                    plasma->quark, &flags,
                    rows_m, cols_k, ib, T.nb,
                    A(head, k), ld_head,
                    A(m,    k), ld_m,
                    T(m,    k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    cols_n = (n != A.nt-1) ? A.nb : A.n - n*A.nb;
                    QUARK_CORE_stsmqr(
                        plasma->quark, &flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, cols_n, rows_m, cols_n, A.nb, ib, T.nb,
                        A(head, n), ld_head,
                        A(m,    n), ld_m,
                        A(m,    k), ld_m,
                        T(m,    k), T.mb);
                }
            }

            head += BS;
        } while (head < A.mt-1 || head == k);

        /*
         *  Pairwise merge of subdomain heads at distance BS, 2*BS, 4*BS, ...
         *  A partner sitting on the last tile row is never merged here.
         */
        for (dist = BS; dist < A.mt-k; dist *= 2) {
            for (head = k; head+dist < A.mt-1; head += 2*dist) {
                mate      = head + dist;
                rows_mate = (mate != A.mt-1) ? A.mb : A.m - mate*A.mb;
                ld_head   = BLKLDD(A, head);
                ld_mate   = BLKLDD(A, mate);

                QUARK_CORE_sttqrt(
                    plasma->quark, &flags,
                    rows_mate, cols_k, ib, T.nb,
                    A (head, k), ld_head,
                    A (mate, k), ld_mate,
                    T2(mate, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    cols_n = (n != A.nt-1) ? A.nb : A.n - n*A.nb;
                    QUARK_CORE_sttmqr(
                        plasma->quark, &flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, cols_n, rows_mate, cols_n, A.nb, ib, T.nb,
                        A (head, n), ld_head,
                        A (mate, n), ld_mate,
                        A (mate, k), ld_mate,
                        T2(mate, k), T.mb);
                }
            }
        }
    }
}
コード例 #18
0
ファイル: pcsyr2k.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile symmetric rank-2k update - static scheduling
 *
 *  The arguments uplo, trans, alpha, A, B, beta, C, sequence and request are
 *  unpacked from the PLASMA context.  Every tile of the selected triangle of
 *  C is updated by accumulating over the inner tile index k: diagonal tiles
 *  through CORE_csyr2k, off-diagonal tiles through two CORE_cgemm calls
 *  (A*B' then B*A', or their transposed counterparts).  beta is applied on
 *  the first inner step only; later steps accumulate with 1.
 *
 *  Work distribution: the (m, n) pairs with m >= n are enumerated column by
 *  column; each thread starts at position PLASMA_RANK and advances by
 *  PLASMA_SIZE.  For uplo == PlasmaLower the pair addresses tile C(m, n);
 *  otherwise it addresses the mirrored tile C(n, m).
 *
 *  Returns without doing anything when sequence->status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_pcsyr2k(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_enum trans;
    PLASMA_Complex32_t alpha;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_Complex32_t beta;
    PLASMA_desc C;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int m, n, k;
    int next_m;
    int next_n;
    int ldam, ldan, ldak;
    int ldbm, ldbn, ldbk;
    int ldcm, ldcn;
    int tempkn, tempkm, tempmm, tempnn;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;
    PLASMA_Complex32_t zbeta;

    plasma_unpack_args_9(uplo, trans, alpha, A, B, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Map this thread's rank onto its first (m, n) pair with m >= n. */
    n = 0;
    m = PLASMA_RANK;
    while (m >= C.mt && n < C.nt) {
        n++;
        m = m-C.mt+n;
    }

    while (n < C.nt) {
        /* Pre-compute the pair handled after this one. */
        next_n = n;
        next_m = m + PLASMA_SIZE;
        while (next_m >= C.mt && next_n < C.nt) {
            next_n++;
            next_m = next_m - C.mt + next_n;
        }

        /* Actual tile extents; the last tile row/column may be smaller. */
        tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
        tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;

        /* Leading dimensions of the C tiles in tile rows n and m. */
        ldcn = BLKLDD(C, n);
        ldcm = BLKLDD(C, m);

        if (m == n) {
            /*
             *  Diagonal tile / PlasmaNoTrans
             */
            if (trans == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                ldbm = BLKLDD(B, m);
                for (k = 0; k < A.nt; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    zbeta = k == 0 ? beta : zone;
                    CORE_csyr2k(
                        uplo, trans,
                        tempnn, tempkn,
                        alpha, A(m, k), ldam,
                               B(m, k), ldbm,
                        zbeta, C(m, m), ldcm);
                }
            }
            /*
             *  Diagonal tile / Plasma[Conj]Trans
             */
            else {
                for (k = 0; k < A.mt; k++) {
                    tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                    ldak = BLKLDD(A, k);
                    ldbk = BLKLDD(B, k);
                    zbeta = k == 0 ? beta : zone;
                    CORE_csyr2k(
                        uplo, trans,
                        tempnn, tempkm,
                        alpha, A(k, m), ldak,
                               B(k, m), ldbk,
                        zbeta, C(m, m), ldcm);
                }
            }
        }
        else {
            if (trans == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                ldan = BLKLDD(A, n);
                ldbm = BLKLDD(B, m);
                ldbn = BLKLDD(B, n);
                /*
                 *  PlasmaNoTrans / PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn,
                            alpha, A(m, k), ldam,
                                   B(n, k), ldbn,
                            zbeta, C(m, n), ldcm);

                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn,
                            alpha, B(m, k), ldbm,
                                   A(n, k), ldan,
                            zone,  C(m, n), ldcm);
                    }
                }
                /*
                 *  PlasmaNoTrans / PlasmaUpper
                 */
                else {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn,
                            alpha, A(n, k), ldan,
                                   B(m, k), ldbm,
                            zbeta, C(n, m), ldcn);

                        CORE_cgemm(
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn,
                            alpha, B(n, k), ldbn,
                                   A(m, k), ldam,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
            else {
                /*
                 *  Plasma[Conj]Trans / PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    for (k = 0; k < A.mt; k++) {
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        zbeta = k == 0 ? beta : zone;
                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm,
                            alpha, A(k, m), ldak,
                                   B(k, n), ldbk,
                            zbeta, C(m, n), ldcm);

                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm,
                            alpha, B(k, m), ldbk,
                                   A(k, n), ldak,
                            zone,  C(m, n), ldcm);
                    }
                }
                /*
                 *  Plasma[Conj]Trans / PlasmaUpper
                 */
                else {
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        /* C(n, m) lives in tile row n, so its leading
                         * dimension is ldcn for both products (the first
                         * call previously passed ldcm, which differs when
                         * m is the short last tile row). */
                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm,
                            alpha, A(k, n), ldak,
                                   B(k, m), ldbk,
                            zbeta, C(n, m), ldcn);

                        CORE_cgemm(
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm,
                            alpha, B(k, n), ldbk,
                                   A(k, m), ldak,
                            zone,  C(n, m), ldcn);
                    }
                }
            }
        }
        m = next_m;
        n = next_n;
    }
}
コード例 #19
0
ファイル: pzlaset2.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel initialization of a 2-D tile array A to ALPHA on the
 *  off-diagonals - dynamic scheduling.
 *
 *  One QUARK_CORE_zlaset2 task is inserted per tile of the selected part:
 *    - PlasmaLower: diagonal tiles with PlasmaLower, tiles below the
 *      diagonal with PlasmaUpperLower;
 *    - PlasmaUpper: tiles above the diagonal with PlasmaUpperLower, then
 *      diagonal tiles with PlasmaUpper;
 *    - any other uplo: every tile with PlasmaUpperLower.
 *
 *  Returns without inserting anything when sequence->status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_pzlaset2_quark(PLASMA_enum uplo, PLASMA_Complex64_t alpha,
                           PLASMA_desc A,
                           PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags flags = Quark_Task_Flags_Initializer;

    int i, j, ilim;
    int ld_i, ld_j;
    int rows_i, rows_j, cols_j;
    int ndiag;

    ndiag = min(A.mt, A.nt);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /* Neither triangle requested: sweep every tile, row by row. */
    if (uplo != PlasmaLower && uplo != PlasmaUpper) {
        for (i = 0; i < A.mt; i++) {
            rows_i = (i != A.mt-1) ? A.mb : A.m - i*A.mb;
            ld_i   = BLKLDD(A, i);
            for (j = 0; j < A.nt; j++) {
                cols_j = (j != A.nt-1) ? A.nb : A.n - j*A.nb;
                QUARK_CORE_zlaset2(
                    plasma->quark, &flags,
                    PlasmaUpperLower, rows_i, cols_j, alpha,
                    A(i, j), ld_i);
            }
        }
        return;
    }

    /* Lower triangle: per tile column, the diagonal tile then the tiles below. */
    if (uplo == PlasmaLower) {
        for (j = 0; j < ndiag; j++) {
            rows_j = (j != A.mt-1) ? A.mb : A.m - j*A.mb;
            cols_j = (j != A.nt-1) ? A.nb : A.n - j*A.nb;
            ld_j   = BLKLDD(A, j);
            QUARK_CORE_zlaset2(
                plasma->quark, &flags,
                PlasmaLower, rows_j, cols_j, alpha,
                A(j, j), ld_j);

            for (i = j+1; i < A.mt; i++) {
                rows_i = (i != A.mt-1) ? A.mb : A.m - i*A.mb;
                ld_i   = BLKLDD(A, i);
                QUARK_CORE_zlaset2(
                    plasma->quark, &flags,
                    PlasmaUpperLower, rows_i, cols_j, alpha,
                    A(i, j), ld_i);
            }
        }
        return;
    }

    /* Upper triangle: first all tiles strictly above the diagonal ... */
    for (j = 1; j < A.nt; j++) {
        cols_j = (j != A.nt-1) ? A.nb : A.n - j*A.nb;
        ilim   = min(j, A.mt);
        for (i = 0; i < ilim; i++) {
            rows_i = (i != A.mt-1) ? A.mb : A.m - i*A.mb;
            ld_i   = BLKLDD(A, i);
            QUARK_CORE_zlaset2(
                plasma->quark, &flags,
                PlasmaUpperLower, rows_i, cols_j, alpha,
                A(i, j), ld_i);
        }
    }
    /* ... then the diagonal tiles. */
    for (j = 0; j < ndiag; j++) {
        rows_j = (j != A.mt-1) ? A.mb : A.m - j*A.mb;
        cols_j = (j != A.nt-1) ? A.nb : A.n - j*A.nb;
        ld_j   = BLKLDD(A, j);
        QUARK_CORE_zlaset2(
            plasma->quark, &flags,
            PlasmaUpper, rows_j, cols_j, alpha,
            A(j, j), ld_j);
    }
}
コード例 #20
0
ファイル: pcunmqr.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
 *
 *  Walks the reflector panels stored in A (with their T factors) and inserts
 *  one QUARK task per tile of B: QUARK_CORE_cunmqr for the diagonal tile
 *  A(k,k)/T(k,k), QUARK_CORE_ctsmqr for every off-diagonal tile
 *  A(m,k)/T(m,k) (left side) or A(n,k)/T(n,k) (right side).
 *
 *  side   PlasmaLeft selects the left-side loops; any other value selects
 *         the right-side loops.
 *  trans  PlasmaConjTrans selects the conjugate-transposed ordering; any
 *         other value selects the non-transposed ordering.
 *
 *  Returns without inserting anything when sequence->status is not
 *  PLASMA_SUCCESS.
 **/
void plasma_pcunmqr_quark(PLASMA_enum side, PLASMA_enum trans,
                          PLASMA_desc A, PLASMA_desc B, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int lda_k, ldb_k, lda_m, lda_n, ldb_m;
    int rows_k, cols_k, rows_m, cols_n, refl_k;
    int ib, panels, mindim;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;

    /* The number of panels follows the smaller dimension of A. */
    panels = A.mt;
    mindim = A.m;
    if (A.m > A.n) {
        panels = A.nt;
        mindim = A.n;
    }

    if (side == PlasmaLeft && trans == PlasmaConjTrans) {
        /*
         *  PlasmaLeft / PlasmaConjTrans: panels first to last, diagonal tile first
         */
        for (k = 0; k < panels; k++) {
            rows_k = (k != B.mt-1)   ? B.mb : B.m - k*B.mb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);

            for (n = 0; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                QUARK_CORE_cunmqr(
                    plasma->quark, &flags,
                    side, trans,
                    rows_k, cols_n, refl_k, ib, T.nb,
                    A(k, k), lda_k,
                    T(k, k), T.mb,
                    B(k, n), ldb_k);
            }

            for (m = k+1; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                lda_m  = BLKLDD(A, m);
                ldb_m  = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                    QUARK_CORE_ctsmqr(
                        plasma->quark, &flags,
                        side, trans,
                        B.mb, cols_n, rows_m, cols_n, refl_k, ib, T.nb,
                        B(k, n), ldb_k,
                        B(m, n), ldb_m,
                        A(m, k), lda_m,
                        T(m, k), T.mb);
                }
            }
        }
    }
    else if (side == PlasmaLeft) {
        /*
         *  PlasmaLeft / PlasmaNoTrans: panels last to first, diagonal tile last
         */
        for (k = panels-1; k >= 0; k--) {
            rows_k = (k != B.mt-1)   ? B.mb : B.m - k*B.mb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);

            for (m = B.mt-1; m > k; m--) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                lda_m  = BLKLDD(A, m);
                ldb_m  = BLKLDD(B, m);
                for (n = 0; n < B.nt; n++) {
                    cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                    QUARK_CORE_ctsmqr(
                        plasma->quark, &flags,
                        side, trans,
                        B.mb, cols_n, rows_m, cols_n, refl_k, ib, T.nb,
                        B(k, n), ldb_k,
                        B(m, n), ldb_m,
                        A(m, k), lda_m,
                        T(m, k), T.mb);
                }
            }

            for (n = 0; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                QUARK_CORE_cunmqr(
                    plasma->quark, &flags,
                    side, trans,
                    rows_k, cols_n, refl_k, ib, T.nb,
                    A(k, k), lda_k,
                    T(k, k), T.mb,
                    B(k, n), ldb_k);
            }
        }
    }
    else if (trans == PlasmaConjTrans) {
        /*
         *  PlasmaRight / PlasmaConjTrans: panels last to first, diagonal tile last
         */
        for (k = panels-1; k >= 0; k--) {
            cols_k = (k != B.nt-1)   ? B.nb : B.n - k*B.nb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);
            ldb_k  = BLKLDD(B, k);   /* computed as before; no task in this branch reads it */

            for (n = B.nt-1; n > k; n--) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                lda_n  = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                    ldb_m  = BLKLDD(B, m);
                    QUARK_CORE_ctsmqr(
                        plasma->quark, &flags,
                        side, trans,
                        rows_m, B.nb, rows_m, cols_n, refl_k, ib, T.nb,
                        B(m, k), ldb_m,
                        B(m, n), ldb_m,
                        A(n, k), lda_n,
                        T(n, k), T.mb);
                }
            }

            for (m = 0; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                ldb_m  = BLKLDD(B, m);
                QUARK_CORE_cunmqr(
                    plasma->quark, &flags,
                    side, trans,
                    rows_m, cols_k, refl_k, ib, T.nb,
                    A(k, k), lda_k,
                    T(k, k), T.mb,
                    B(m, k), ldb_m);
            }
        }
    }
    else {
        /*
         *  PlasmaRight / PlasmaNoTrans: panels first to last, diagonal tile first
         */
        for (k = 0; k < panels; k++) {
            cols_k = (k != B.nt-1)   ? B.nb : B.n - k*B.nb;
            refl_k = (k != panels-1) ? A.nb : mindim - k*A.nb;
            lda_k  = BLKLDD(A, k);

            for (m = 0; m < B.mt; m++) {
                rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                ldb_m  = BLKLDD(B, m);
                QUARK_CORE_cunmqr(
                    plasma->quark, &flags,
                    side, trans,
                    rows_m, cols_k, refl_k, ib, T.nb,
                    A(k, k), lda_k,
                    T(k, k), T.mb,
                    B(m, k), ldb_m);
            }

            for (n = k+1; n < B.nt; n++) {
                cols_n = (n != B.nt-1) ? B.nb : B.n - n*B.nb;
                lda_n  = BLKLDD(A, n);
                for (m = 0; m < B.mt; m++) {
                    rows_m = (m != B.mt-1) ? B.mb : B.m - m*B.mb;
                    ldb_m  = BLKLDD(B, m);
                    QUARK_CORE_ctsmqr(
                        plasma->quark, &flags,
                        side, trans,
                        rows_m, B.nb, rows_m, cols_n, refl_k, ib, T.nb,
                        B(m, k), ldb_m,
                        B(m, n), ldb_m,
                        A(n, k), lda_n,
                        T(n, k), T.mb);
                }
            }
        }
    }
}
コード例 #21
0
ファイル: pspotrf.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - dynamic scheduling
 *
 *  Inserts the tile kernels of the factorization (spotrf, strsm, ssyrk,
 *  sgemm) as QUARK tasks, one panel after the other. Nothing is inserted
 *  when the sequence status is not PLASMA_SUCCESS.
 *
 *  uplo     - PlasmaLower selects the sweep over the A.mt tile rows; every
 *             other value selects the upper sweep over the A.nt tile columns.
 *  A        - descriptor of the tiled matrix.
 *  sequence - provides the status check and the QUARK sequence attached to
 *             every inserted task.
 *  request  - passed to each spotrf task together with the offset A.nb*p.
 **/
void plasma_pspotrf_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int p, i, j;        /* panel index, outer tile index, inner tile index */
    int ld_p, ld_i;     /* leading dimensions of tile rows p and i */
    int dim_p, dim_i;   /* extents of tiles p and i (last tile may be short) */

    float one       = (float) 1.0;
    float minus_one = (float)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (p = 0; p < A.mt; p++) {
            dim_p = A.mb;
            if (p == A.mt-1)
                dim_p = A.m-p*A.mb;
            ld_p = BLKLDD(A, p);

            /* Factorize the diagonal tile of the panel. */
            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaLower, dim_p, A.mb,
                A(p, p), ld_p,
                sequence, request, A.nb*p);

            /* Triangular solves for the tiles below the diagonal. */
            for (i = p+1; i < A.mt; i++) {
                dim_i = A.mb;
                if (i == A.mt-1)
                    dim_i = A.m-i*A.mb;
                ld_i = BLKLDD(A, i);
                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                    dim_i, A.mb, A.mb,
                    one, A(p, p), ld_p,
                         A(i, p), ld_i);
            }

            /* Trailing update: ssyrk on diagonal tiles, sgemm elsewhere. */
            for (i = p+1; i < A.mt; i++) {
                dim_i = A.mb;
                if (i == A.mt-1)
                    dim_i = A.m-i*A.mb;
                ld_i = BLKLDD(A, i);
                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaLower, PlasmaNoTrans,
                    dim_i, A.mb, A.mb,
                    -1.0, A(i, p), ld_i,
                     1.0, A(i, i), ld_i);

                for (j = p+1; j < i; j++) {
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaTrans,
                        dim_i, A.mb, A.mb, A.mb,
                        minus_one, A(i, p), ld_i,
                                   A(j, p), A.mb,
                        one,       A(i, j), ld_i);
                }
            }
        }
        return;
    }

    /*
     *  PlasmaUpper
     */
    for (p = 0; p < A.nt; p++) {
        dim_p = A.nb;
        if (p == A.nt-1)
            dim_p = A.n-p*A.nb;
        ld_p = BLKLDD(A, p);

        /* Factorize the diagonal tile of the panel. */
        QUARK_CORE_spotrf(
            plasma->quark, &task_flags,
            PlasmaUpper,
            dim_p, A.mb,
            A(p, p), ld_p,
            sequence, request, A.nb*p);

        /* Triangular solves for the tiles right of the diagonal. */
        for (i = p+1; i < A.nt; i++) {
            dim_i = A.nb;
            if (i == A.nt-1)
                dim_i = A.n-i*A.nb;
            QUARK_CORE_strsm(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                A.nb, dim_i, A.mb,
                one, A(p, p), ld_p,
                     A(p, i), ld_p);
        }

        /* Trailing update: ssyrk on diagonal tiles, sgemm elsewhere. */
        for (i = p+1; i < A.nt; i++) {
            dim_i = A.nb;
            if (i == A.nt-1)
                dim_i = A.n-i*A.nb;
            ld_i = BLKLDD(A, i);
            QUARK_CORE_ssyrk(
                plasma->quark, &task_flags,
                PlasmaUpper, PlasmaTrans,
                dim_i, A.mb, A.mb,
                -1.0, A(p, i), ld_p,
                 1.0, A(i, i), ld_i);

            for (j = p+1; j < i; j++) {
                QUARK_CORE_sgemm(
                    plasma->quark, &task_flags,
                    PlasmaTrans, PlasmaNoTrans,
                    A.mb, dim_i, A.mb, A.mb,
                    minus_one, A(p, j), ld_p,
                               A(p, i), ld_p,
                    one,       A(j, i), A.mb);
            }
        }
    }
}
コード例 #22
0
ファイル: pzgelqfrh.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 *
 *  For every panel k < min(A.mt, A.nt) the tile columns k..A.nt-1 are cut into
 *  domains of BS tile columns. Each domain is factorized on its own
 *  (zgelqt + ztslqt, with zunmlq/ztsmlq applied to the tile rows below k), and
 *  the domain heads are then merged pairwise at distances RD = BS, 2*BS, ...
 *  (zttlqt + zttmlq). All kernels are inserted as QUARK tasks tagged with the
 *  sequence's QUARK sequence; nothing is inserted if the sequence status is
 *  not PLASMA_SUCCESS.
 *
 *  A        - descriptor of the tiled matrix being factorized.
 *  T        - descriptor of the auxiliary tiles written by the kernels; the
 *             reduction step addresses it through T2 (macro defined outside
 *             this function).
 *  BS       - domain size in tile columns; used as loop stride, so it is
 *             assumed to be positive (not checked here).
 *  sequence - provides the status check and the QUARK sequence.
 *  request  - not used in this function body.
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {

        /* Row extent of tile row k; the last tile row may be shorter. */
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
        // Plain domain loop would be: for (N = k; N < A.nt; N += BS)
        // The condition below additionally refuses to start a domain at the
        // last tile column unless that column is the panel column itself.
        for (N = k;
             N < A.nt-1 || N == k;  // No rightmost single-column subdomain
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            /* Factorize the head tile of the domain. */
            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                tempkm, tempNn, ib, T.nb,
                A(k, N), ldak,
                T(k, N), T.mb);

            /* Apply the head transformation to the tile rows below k. */
            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam);
            }
            // Plain inner loop would be: for (n = N+1; n < N+BS && n < A.nt; n++)
            // The extra clause lets the domain also take the last tile column
            // when it would otherwise have been a domain of its own.
            for (n = N+1;
                 (n < N+BS && n < A.nt) || n == A.nt-1; // Suck in rightmost single-column domain
                 n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                /* Couple tile (k, n) with the domain head (k, N). */
                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    tempkm, tempnn, ib, T.nb,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb);

                /* Apply that coupling to the tile rows below k. */
                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
            }
        }
        /* Pairwise reduction of the domain heads; RD doubles every round. */
        for (RD = BS; RD < A.nt-k; RD *= 2) {
            // Plain reduction loop would be: for (N = k; N+RD < A.nt; N += 2*RD)
            for (N = k;
                 N+RD < A.nt-1; // No reduction with rightmost single-column subdomain
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                /* Merge head (k, N+RD) into head (k, N). */
                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    tempkm, tempNRDn, ib, T.nb,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb);

                /* Apply the merge to the tile rows below k. */
                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, A.mb, ib, T.nb,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }
    }
}
コード例 #23
0
ファイル: pspotrf.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - static scheduling
 *
 *  Every thread runs this function and walks its own sequence of (k, m, n)
 *  work items: k is the tile column (lower) / tile row (upper) being
 *  finalized, m >= k the tile being updated inside it, and n = 0..k the
 *  step (n < k: apply the update coming from step n, n == k: final kernel).
 *  The starting item is derived from PLASMA_RANK and m advances by
 *  PLASMA_SIZE. Cross-thread ordering goes through the ss_* progress table
 *  (macros defined outside this function).
 *
 *  Arguments are unpacked from the context: uplo, A, sequence, request.
 *  A failing diagonal factorization is reported with plasma_request_fail()
 *  using info + A.nb*k and followed by ss_abort().
 **/
void plasma_pspotrf(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam, ldan;
    int info;
    int tempkn, tempmn;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma_unpack_args_4(uplo, A, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(A.nt, A.nt, 0);

    /* First work item of this thread: fold the rank into (k, m) with m >= k. */
    k = 0;
    m = PLASMA_RANK;
    while (m >= A.nt) {
        k++;
        m = m-A.nt+k;
    }
    n = 0;

    while (k < A.nt && m < A.nt && !ss_aborted()) {
        /* Compute the following work item before doing the current one. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_n++;
        if (next_n > next_k) {
            /* Steps 0..k done for (k, m): jump PLASMA_SIZE tiles ahead,
               wrapping into the next k where m restarts at k. */
            next_m += PLASMA_SIZE;
            while (next_m >= A.nt && next_k < A.nt) {
                next_k++;
                next_m = next_m-A.nt+next_k;
            }
            next_n = 0;
        }

        /* Tile extents; the last tile may be shorter than A.nb. */
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempmn = m == A.nt-1 ? A.n-m*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldan = BLKLDD(A, n);
        ldam = BLKLDD(A, m);

        if (m == k) {
            if (n == k) {
                /* Diagonal tile, final step: factorize it. */
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_spotrf(
                        PlasmaLower,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_spotrf(
                        PlasmaUpper,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                if (info != 0) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                /* Mark (k, k) even after a failure, right after ss_abort(). */
                ss_cond_set(k, k, 1);
            }
            else {
                /* Diagonal tile, step n < k: symmetric rank-k update using
                   tile (k, n) (lower) or (n, k) (upper), after waiting on
                   progress entry (k, n). */
                ss_cond_wait(k, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_ssyrk(
                         PlasmaLower, PlasmaNoTrans,
                         tempkn, A.nb,
                         -1.0, A(k, n), ldak,
                          1.0, A(k, k), ldak);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_ssyrk(
                         PlasmaUpper, PlasmaTrans,
                         tempkn, A.nb,
                         -1.0, A(n, k), ldan,
                          1.0, A(k, k), ldak);
                }
            }
        }
        else {
            if (n == k) {
                /* Off-diagonal tile, final step: triangular solve against the
                   diagonal tile, after waiting on progress entry (k, k). */
                ss_cond_wait(k, k, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_strsm(
                        PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                        tempmn, A.nb,
                        zone, A(k, k), ldak,
                              A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_strsm(
                        PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                        A.nb, tempmn,
                        zone, A(k, k), ldak,
                              A(k, m), ldak);
                }
                /* Mark tile (m, k) in the progress table. */
                ss_cond_set(m, k, 1);
            }
            else {
                /* Off-diagonal tile, step n < k: gemm update using the two
                   tiles of step n, after waiting on both progress entries. */
                ss_cond_wait(k, n, 1);
                ss_cond_wait(m, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_sgemm(
                        PlasmaNoTrans, PlasmaTrans,
                        tempmn, A.nb, A.nb,
                        mzone, A(m, n), ldam,
                               A(k, n), ldak,
                         zone, A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_sgemm(
                        PlasmaTrans, PlasmaNoTrans,
                        A.nb, tempmn, A.nb,
                        mzone, A(n, k), ldan,
                               A(n, m), ldan,
                         zone, A(k, m), ldak);
                }
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    ss_finalize();
}
コード例 #24
0
ファイル: psormlqrh.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization (reduction
 *  Householder) - dynamic scheduling
 *
 *  Inserts, as QUARK tasks, the kernels that apply to B the transformations
 *  stored in A/T: sormlq for the domain head tiles, stsmlq for the remaining
 *  tiles of each domain, and sttmlq for the pairwise reduction of domain
 *  heads (addressed through T2, a macro defined outside this function).
 *  The (side, trans) pair selects one of four traversal orders; the two
 *  orders that walk k upwards run domains first and reduction second, the
 *  two that walk k downwards run them in the opposite order.
 *  Nothing is inserted if the sequence status is not PLASMA_SUCCESS.
 *
 *  side     - PlasmaLeft, or any other value for the right-side branches.
 *  trans    - PlasmaNoTrans, or any other value for the transposed branches.
 *  A, T     - descriptors holding the reflectors and auxiliary tiles.
 *  B        - descriptor of the matrix being updated.
 *  BS       - domain size in tiles; used as loop stride, so it is assumed
 *             to be positive (not checked here).
 *  sequence - provides the status check and the QUARK sequence.
 *  request  - not used in this function body.
 **/
void plasma_psormlqrh_quark(PLASMA_enum side, PLASMA_enum trans,
        PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS,
        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD, lastRD;
    int ldak;
    int ldbN, ldbm, ldbNRD;
    int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    if (side == PlasmaLeft ) {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k;
                     N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
                     N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldbN = BLKLDD(B, N);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_sormlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                    for (m = N+1;
                           (m < N+BS && m < A.nt) || m == A.nt-1; /* Suck in rightmost single-column domain */
                           m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            /* NOTE(review): B.nb is passed here while the
                               Left/Trans branch passes B.mb in the same
                               position; identical only for square tiles. */
                            QUARK_CORE_stsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                }
                /* Pairwise reduction of domain heads; RD doubles every round. */
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k;
                         N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                         N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_sttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        } else {
            /*
             *  PlasmaLeft / PlasmaTrans
             */
            for (k = K-1; k >= 0; k--) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                /* Find the largest reduction distance the forward sweep
                   would reach, then replay the rounds from it down to BS. */
                lastRD = 0;
                for (RD = BS; RD < A.nt-k; RD *= 2)
                    lastRD = RD;
                for (RD = lastRD; RD >= BS; RD /= 2) {
                    for (N = k;
                         N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                         N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_sttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
                for (N = k;
                     N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
                     N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldbN = BLKLDD(B, N);
                    for (m = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */
                         m >= N+1;
                         m--) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_stsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_sormlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                }
            }

        }
    } else {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
            for (k = K-1; k >= 0; k--) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                /* Largest reduction distance first, then down to BS. */
                lastRD = 0;
                for (RD = BS; RD < A.nt-k; RD *= 2)
                    lastRD = RD;
                for (RD = lastRD; RD >= BS; RD /= 2) {
                    for (N = k;
                         N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                         N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        for (m = 0; m < B.mt; m++) {
                            ldbm   = BLKLDD(B, m);
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            QUARK_CORE_sttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempNRDn,
                                    tempkm, ib, T.nb,
                                    B (m, N   ), ldbm,
                                    B (m, N+RD), ldbm,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
                for (N = k;
                     N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
                     N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */
                         n >= N+1;
                         n--) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm = BLKLDD(B, m);
                            QUARK_CORE_stsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(m, N), ldbm,
                                    B(m, n), ldbm,
                                    A(k, n), ldak,
                                    T(k, n), T.mb);
                        }
                    }
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        QUARK_CORE_sormlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, tempNn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(m, N), ldbm);
                    }
                }
            }
        } else {
            /*
             *  PlasmaRight / PlasmaTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k;
                     N < A.nt-1 || N == k;  /* No rightmost single-column subdomain */
                     N += BS) {
                    tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    for (m = 0; m < B.mt; m++) {
                        ldbm = BLKLDD(B, m);
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        /* A(k, N) lies in tile row k, so its leading
                           dimension is that of row k (ldak), exactly as in
                           the three other branches; a value derived from the
                           column index N is not the stride of this tile. */
                        QUARK_CORE_sormlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, tempNn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(m, N), ldbm);
                    }
                    for (n = N+1;
                         (n < N+BS && n < A.nt) || n == A.nt-1; /* Suck in rightmost single-column domain */
                         n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm = BLKLDD(B, m);
                            /* NOTE(review): tempNn is passed here while the
                               Right/NoTrans branch passes B.nb in the same
                               position; confirm which one is intended. */
                            QUARK_CORE_stsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, tempNn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(m, N), ldbm,
                                    B(m, n), ldbm,
                                    A(k, n), ldak,
                                    T(k, n), T.mb);
                        }
                    }
                }
                /* Pairwise reduction of domain heads; RD doubles every round. */
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k;
                         N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                         N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm   = BLKLDD(B, m);
                            QUARK_CORE_sttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempNRDn,
                                    tempkm, ib, T.nb,
                                    B (m, N   ), ldbm,
                                    B (m, N+RD), ldbm,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        }
    }
}
コード例 #25
0
ファイル: pdlacpy.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile copy of A into B - dynamic scheduling
 *
 *  Inserts one dlacpy task per tile of the selected part of A, row of tiles
 *  by row of tiles. Nothing is inserted if the sequence status is not
 *  PLASMA_SUCCESS.
 *
 *  uplo     - PlasmaUpper: diagonal tile (copied with PlasmaUpper) followed
 *             by the tiles to its right; PlasmaLower: diagonal tile (copied
 *             with PlasmaLower) followed by the tiles to its left; any other
 *             value: every tile of the row.
 *  A, B     - source and destination descriptors.
 *  sequence - provides the status check and the QUARK sequence.
 *  request  - not used in this function body.
 **/
void plasma_pdlacpy_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int tile_rows, tile_cols;   /* extents of the current tile */
    int i, j;                   /* tile row / tile column */
    int lda_i, ldb_i;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (i = 0; i < A.mt; i++) {
        /* Per-row quantities are the same for all three layouts. */
        tile_rows = A.mb;
        if (i == A.mt-1)
            tile_rows = A.m-i*A.mb;
        lda_i = BLKLDD(A, i);
        ldb_i = BLKLDD(B, i);

        if (uplo == PlasmaUpper) {
            /*
             *  PlasmaUpper
             */
            if (i < A.nt) {
                tile_cols = A.nb;
                if (i == A.nt-1)
                    tile_cols = A.n-i*A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpper,
                    tile_rows, tile_cols, A.mb,
                    A(i, i), lda_i,
                    B(i, i), ldb_i);
            }
            for (j = i+1; j < A.nt; j++) {
                tile_cols = A.nb;
                if (j == A.nt-1)
                    tile_cols = A.n-j*A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    tile_rows, tile_cols, A.mb,
                    A(i, j), lda_i,
                    B(i, j), ldb_i);
            }
        }
        else if (uplo == PlasmaLower) {
            /*
             *  PlasmaLower
             */
            if (i < A.nt) {
                tile_cols = A.nb;
                if (i == A.nt-1)
                    tile_cols = A.n-i*A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaLower,
                    tile_rows, tile_cols, A.mb,
                    A(i, i), lda_i,
                    B(i, i), ldb_i);
            }
            /* Tiles strictly left of the diagonal, clipped to A.nt columns. */
            for (j = 0; j < i && j < A.nt; j++) {
                tile_cols = A.nb;
                if (j == A.nt-1)
                    tile_cols = A.n-j*A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    tile_rows, tile_cols, A.mb,
                    A(i, j), lda_i,
                    B(i, j), ldb_i);
            }
        }
        else {
            /*
             *  PlasmaUpperLower (and any other value)
             */
            for (j = 0; j < A.nt; j++) {
                tile_cols = A.nb;
                if (j == A.nt-1)
                    tile_cols = A.n-j*A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    tile_rows, tile_cols, A.mb,
                    A(i, j), lda_i,
                    B(i, j), ldb_i);
            }
        }
    }
}
コード例 #26
0
ファイル: pcgelqf.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile LQ factorization - dynamic scheduling
 *
 *  For every panel p < min(A.mt, A.nt), inserts as QUARK tasks: cgelqt on the
 *  diagonal tile, cunmlq on the tiles below it, then for each tile column to
 *  the right ctslqt on row p followed by ctsmlq on the tile rows below.
 *  Nothing is inserted if the sequence status is not PLASMA_SUCCESS.
 *
 *  A        - descriptor of the tiled matrix being factorized.
 *  T        - descriptor of the auxiliary tiles written by the kernels.
 *  sequence - provides the status check and the QUARK sequence.
 *  request  - not used in this function body.
 **/
void plasma_pcgelqf_quark(PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int p, i, j;            /* panel, tile row below it, tile column right of it */
    int panels;
    int ld_p, ld_i;
    int rows_p, cols_p, rows_i, cols_j;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    panels = A.mt < A.nt ? A.mt : A.nt;

    for (p = 0; p < panels; p++) {
        /* Extents of the diagonal tile; the last tile may be shorter. */
        rows_p = A.mb;
        if (p == A.mt-1)
            rows_p = A.m-p*A.mb;
        cols_p = A.nb;
        if (p == A.nt-1)
            cols_p = A.n-p*A.nb;
        ld_p = BLKLDD(A, p);

        QUARK_CORE_cgelqt(
            plasma->quark, &task_flags,
            rows_p, cols_p, ib, T.nb,
            A(p, p), ld_p,
            T(p, p), T.mb);

        /* Apply the diagonal-tile transformation to the rows below. */
        for (i = p+1; i < A.mt; i++) {
            rows_i = A.mb;
            if (i == A.mt-1)
                rows_i = A.m-i*A.mb;
            ld_i = BLKLDD(A, i);
            QUARK_CORE_cunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaConjTrans,
                rows_i, cols_p, cols_p, ib, T.nb,
                A(p, p), ld_p,
                T(p, p), T.mb,
                A(i, p), ld_i);
        }

        /* Couple every tile right of the diagonal with the diagonal tile. */
        for (j = p+1; j < A.nt; j++) {
            cols_j = A.nb;
            if (j == A.nt-1)
                cols_j = A.n-j*A.nb;
            QUARK_CORE_ctslqt(
                plasma->quark, &task_flags,
                rows_p, cols_j, ib, T.nb,
                A(p, p), ld_p,
                A(p, j), ld_p,
                T(p, j), T.mb);

            for (i = p+1; i < A.mt; i++) {
                rows_i = A.mb;
                if (i == A.mt-1)
                    rows_i = A.m-i*A.mb;
                ld_i = BLKLDD(A, i);
                QUARK_CORE_ctsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    rows_i, A.nb, rows_i, cols_j, A.mb, ib, T.nb,
                    A(i, p), ld_i,
                    A(i, j), ld_i,
                    A(p, j), ld_p,
                    T(p, j), T.mb);
            }
        }
    }
}
コード例 #27
0
ファイル: pdlacpy.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel tile copy of all or part of A into B - static scheduling
 *
 *  The arguments (uplo, A, B, sequence, request) are unpacked from the
 *  context. Nothing is done if the sequence status is not PLASMA_SUCCESS.
 *
 *  Every caller starts at tile number PLASMA_RANK of the selected region and
 *  then advances by PLASMA_SIZE tiles per iteration (presumably the calling
 *  thread's rank and the total number of threads), so the tiles are dealt out
 *  cyclically. Each visited tile (m, n) is handed to CORE_dlacpy with the
 *  matching tiles of A and B.
 *
 *  uplo selects the region:
 *    - PlasmaUpper: tiles with n >= m, walked row by row;
 *    - PlasmaLower: tiles with m >= n, walked column by column;
 *    - anything else: all tiles, walked column by column.
 *  For the triangular cases only the diagonal tiles are copied with uplo;
 *  off-diagonal tiles are copied in full (PlasmaUpperLower).
 **/
void plasma_pdlacpy(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int X, Y;
    int m, n;
    int next_m;
    int next_n;
    int ldam, ldbm;

    plasma_unpack_args_5(uplo, A, B, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    switch (uplo) {
    /*
     *  PlasmaUpper
     */
    case PlasmaUpper:
        /* Locate this caller's first tile. Row m of the upper triangle
         * starts at column m, hence the "+ m" when wrapping to the next row.
         * The m < A.mt bound guarantees termination when there are fewer
         * tiles than PLASMA_RANK; without it the loop could only stop through
         * signed overflow. */
        m = 0;
        n = PLASMA_RANK;
        while (n >= A.nt && m < A.mt) {
            m++;
            n = n - A.nt + m;
        }

        while (m < A.mt) {
            next_m = m;
            next_n = n;

            /* Tile this caller will process on the next iteration. */
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_m < A.mt) {
                next_m++;
                next_n = next_n - A.nt + next_m;
            }

            /* The last tile row/column may be smaller than a full tile. */
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                m == n ? uplo : PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;
    /*
     *  PlasmaLower
     */
    case PlasmaLower:
        /* Locate this caller's first tile. Column n of the lower triangle
         * starts at row n, hence the "+ n" when wrapping to the next column.
         * The n < A.nt bound guarantees termination (see PlasmaUpper). */
        n = 0;
        m = PLASMA_RANK;
        while (m >= A.mt && n < A.nt) {
            n++;
            m = m - A.mt + n;
        }

        while (n < A.nt) {
            next_m = m;
            next_n = n;

            /* Tile this caller will process on the next iteration. */
            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_n < A.nt) {
                next_n++;
                next_m = next_m - A.mt + next_n;
            }

            /* The last tile row/column may be smaller than a full tile. */
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                m == n ? uplo : PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;
    /*
     *  PlasmaUpperLower
     */
    case PlasmaUpperLower:
    default:
        /* Locate this caller's first tile in a plain column-by-column walk.
         * The n < A.nt bound guarantees termination even when A.mt is 0. */
        n = 0;
        m = PLASMA_RANK;
        while (m >= A.mt && n < A.nt) {
            n++;
            m = m - A.mt;
        }

        while (n < A.nt) {
            next_m = m;
            next_n = n;

            /* Tile this caller will process on the next iteration. */
            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_n < A.nt) {
                next_n++;
                next_m = next_m - A.mt;
            }

            /* The last tile row/column may be smaller than a full tile. */
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            CORE_dlacpy(
                PlasmaUpperLower,
                X, Y,
                A(m, n), ldam,
                B(m, n), ldbm);

            n = next_n;
            m = next_m;
        }
        break;
    }
}
コード例 #28
0
ファイル: pclauum.c プロジェクト: joao-lima/plasma-kaapi
/***************************************************************************//**
 *  Parallel UU' or L'L operation - dynamic scheduling
 *
 *  Walks the diagonal tiles of A and, for each step m, submits QUARK tasks in
 *  this order: a cherk plus trailing cgemm tasks for every n < m, then a
 *  ctrmm for every n < m, and finally a clauum on the diagonal tile A(m, m).
 *  uplo == PlasmaLower works on the tiles of block row m; any other value
 *  works on the tiles of block column m.
 *
 *  No task is submitted when the sequence status is not PLASMA_SUCCESS.
 *  The request argument is not used by this routine.
 **/
void plasma_pclauum_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n, k;
    int ldam;
    int mrows, ncols, krows;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Attach every task submitted below to the caller's sequence. */
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        /* Only the last block row may be shorter than a full tile. */
        mrows = (m != A.mt-1) ? A.mb : A.m-m*A.mb;
        ldam  = BLKLDD(A, m);

        if (uplo == PlasmaLower) {
            /*
             *  PlasmaLower: tiles A(m, n), n < m
             */
            for (n = 0; n < m; n++) {
                ncols = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaConjTrans,
                    ncols, mrows, A.mb,
                    1.0, A(m, n), ldam,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    krows = (k != A.mt-1) ? A.mb : A.m-k*A.mb;
                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaConjTrans, PlasmaNoTrans,
                        krows, ncols, mrows, A.mb,
                        zone, A(m, k), ldam,
                              A(m, n), ldam,
                        zone, A(k, n), A.mb);
                }
            }
            /* Submitted only after all updates that read row m. */
            for (n = 0; n < m; n++) {
                ncols = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    mrows, ncols, A.mb,
                    zone, A(m, m), ldam,
                          A(m, n), ldam);
            }
        }
        else {
            /*
             *  PlasmaUpper: tiles A(n, m), n < m
             */
            for (n = 0; n < m; n++) {
                ncols = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaNoTrans,
                    ncols, mrows, A.mb,
                    1.0, A(n, m), A.mb,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++) {
                    krows = (k != A.mt-1) ? A.mb : A.m-k*A.mb;
                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaConjTrans,
                        ncols, krows, mrows, A.mb,
                        zone, A(n, m), A.mb,
                              A(k, m), A.mb,
                        zone, A(n, k), A.mb);
                }
            }
            /* Submitted only after all updates that read column m. */
            for (n = 0; n < m; n++) {
                ncols = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    ncols, mrows, A.mb,
                    zone, A(m, m), ldam,
                          A(n, m), A.mb);
            }
        }

        /* Diagonal tile is handled last in both cases. */
        QUARK_CORE_clauum(
            plasma->quark, &task_flags,
            uplo,
            mrows,
            A.mb, A(m, m), ldam);
    }
}
コード例 #29
0
ファイル: pzgetrf_nopiv.c プロジェクト: gpichon/eigenproblems
/***************************************************************************//**
 *  Parallel tile LU factorization with no pivoting - dynamic scheduling
 *
 *  For every diagonal step k < min(A.mt, A.nt) the following QUARK tasks are
 *  submitted, in this order:
 *    1. zgetrf_nopiv on the diagonal tile A(k, k);
 *    2. ztrsm (Right/Upper/NonUnit) on each tile A(m, k) below the diagonal;
 *    3. for each tile column n > k: ztrsm (Left/Lower/Unit) on A(k, n),
 *       followed by a zgemm on every A(m, n), m > k, with alpha = -1 and
 *       beta = 1.
 *
 *  No task is submitted when the sequence status is not PLASMA_SUCCESS.
 **/
void plasma_pzgetrf_nopiv_quark(PLASMA_desc A,
                                PLASMA_sequence *sequence,
                                PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ib, nsteps;
    int ldak, ldam;
    int rows_k, cols_k, rows_m, cols_n;

    PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Attach every task submitted below to the caller's sequence. */
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib     = PLASMA_IB;
    nsteps = min(A.mt, A.nt);

    for (k = 0; k < nsteps; k++) {
        /* Only the last tile row/column may be smaller than a full tile. */
        rows_k = (k != A.mt-1) ? A.mb : A.m-k*A.mb;
        cols_k = (k != A.nt-1) ? A.nb : A.n-k*A.nb;
        ldak   = BLKLDD(A, k);

        /* Factor the diagonal tile. */
        QUARK_CORE_zgetrf_nopiv(
            plasma->quark, &task_flags,
            rows_k, cols_k, ib, A.mb,
            A(k, k), ldak,
            sequence, request, A.mb*k);

        /* Tiles below the diagonal in tile column k. */
        for (m = k+1; m < A.mt; m++) {
            rows_m = (m != A.mt-1) ? A.mb : A.m-m*A.mb;
            ldam   = BLKLDD(A, m);
            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit,
                rows_m, cols_k, A.mb,
                zone, A(k, k), ldak,
                      A(m, k), ldam);
        }

        /* Tiles right of the diagonal in tile row k, then the trailing update. */
        for (n = k+1; n < A.nt; n++) {
            cols_n = (n != A.nt-1) ? A.nb : A.n-n*A.nb;
            QUARK_CORE_ztrsm(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaLower, PlasmaNoTrans, PlasmaUnit,
                rows_k, cols_n, A.mb,
                zone, A(k, k), ldak,
                      A(k, n), ldak);

            for (m = k+1; m < A.mt; m++) {
                rows_m = (m != A.mt-1) ? A.mb : A.m-m*A.mb;
                ldam   = BLKLDD(A, m);
                QUARK_CORE_zgemm(
                    plasma->quark, &task_flags,
                    PlasmaNoTrans, PlasmaNoTrans,
                    rows_m, cols_n, A.mb, A.mb,
                    mzone, A(m, k), ldam,
                           A(k, n), ldak,
                    zone,  A(m, n), ldam);
            }
        }
    }
}