예제 #1
0
/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity;
 *  reduction Householder) - dynamic scheduling
 **/
void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD, lastRD;
    int ldak;
    int ldqm;
    int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = K-1; k >= 0; k--) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
        lastRD = 0;
        for (RD = BS; RD < A.nt-k; RD *= 2)
            lastRD = RD;
        for (RD = lastRD; RD >= BS; RD /= 2) {
            for (N = k;
                 N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                for (m = 0; m < Q.mt; m++) {
                    tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                    ldqm   = BLKLDD(Q, m   );
                    QUARK_CORE_zttmlq(
                                      plasma->quark, &task_flags,
                                      PlasmaRight, PlasmaNoTrans,
                                      tempmm, Q.nb, tempmm, tempNRDn,
                                      tempkm, ib, T.nb,
                                      Q (m, N   ), ldqm,
                                      Q (m, N+RD), ldqm,
                                      A (k, N+RD), ldak,
                                      T2(k, N+RD), T.mb); 
                }
            }
        }
        for (N = k;
             N < A.nt-1 || N == k;  /* No rightmost single-column */
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            tempkmin = min(tempkm, tempNn);
            for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */
                 n >= N+1;
                 n--) {
                tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
                
                for (m = 0; m < Q.mt; m++) {
                    tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                    ldqm = BLKLDD(Q, m);
                    QUARK_CORE_ztsmlq(
                                      plasma->quark, &task_flags,
                                      PlasmaRight, PlasmaNoTrans,
                                      tempmm, Q.nb, tempmm, tempnn,
                                      tempkm, ib, T.nb,
                                      Q(m, N), ldqm,
                                      Q(m, n), ldqm,
                                      A(k, n), ldak,
                                      T(k, n), T.mb);
                }
            }
            for (m = 0; m < Q.mt; m++) {
                tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                ldqm = BLKLDD(Q, m);
                QUARK_CORE_zunmlq(
                                  plasma->quark, &task_flags,
                                  PlasmaRight, PlasmaNoTrans,
                                  tempmm, tempNn, 
                                  tempkmin, ib, T.nb,
                                  A(k, N), ldak,
                                  T(k, N), T.mb,
                                  Q(m, N), ldqm);
            }
        }
    }
}
예제 #2
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization (reduction
 *  Householder) - dynamic scheduling
 **/
void plasma_pzunmlqrh_quark(PLASMA_enum side, PLASMA_enum trans,
        PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS,
        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD, lastRD;
    int ldaN, ldak;
    int ldbN, ldbm, ldbNRD;
    int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    if (side == PlasmaLeft ) {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k; N < A.nt; N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    ldbN = BLKLDD(B, N);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                    for (m = N+1; m < min(N+BS, A.nt); m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        } else {
            /*
             *  PlasmaLeft / PlasmaConjTrans
             */
            for (k = K-1; k >= 0; k--) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                lastRD = 0;
                for (RD = BS; RD < A.nt-k; RD *= 2)
                    lastRD = RD;
                for (RD = lastRD; RD >= BS; RD /= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
                for (N = k; N < A.nt; N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    ldbN = BLKLDD(B, N);
                    for (m = min(N+BS, A.nt)-1; m > N; m--) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                }
            }

        }
    } else {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
              for (k = K-1; k >= 0; k--) {
                  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                  ldak = BLKLDD(A, k);
                  lastRD = 0;
                  for (RD = BS; RD < A.nt-k; RD *= 2)
                      lastRD = RD;
                  for (RD = lastRD; RD >= BS; RD /= 2) {
                      for (N = k; N+RD < A.nt; N += 2*RD) {
                          tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                          for (m = 0; m < B.mt; m++) {
                              ldbm   = BLKLDD(B, m);
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              QUARK_CORE_zttmlq(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, B.nb, tempmm, tempNRDn,
                                      tempkm, ib, T.nb,
                                      B (m, N   ), ldbm,
                                      B (m, N+RD), ldbm,
                                      A (k, N+RD), ldak,
                                      T2(k, N+RD), T.mb);
                          }
                      }
                  }
                  for (N = k; N < A.nt; N += BS) {
                      tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                      tempkmin = min(tempkm,tempNn);
                      for (n = min(N+BS, A.nt)-1; n > N; n--) {
                          tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                          for (m = 0; m < B.mt; m++) {
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              ldbm = BLKLDD(B, m);
                              QUARK_CORE_ztsmlq(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, B.nb, tempmm, tempnn,
                                      tempkm, ib, T.nb,
                                      B(m, N), ldbm,
                                      B(m, n), ldbm,
                                      A(k, n), ldak,
                                      T(k, n), T.mb);
                          }
                      }
                      for (m = 0; m < B.mt; m++) {
                          tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                          ldbm = BLKLDD(B, m);
                          QUARK_CORE_zunmlq(
                                  plasma->quark, &task_flags,
                                  side, trans,
                                  tempmm, tempNn,
                                  tempkmin, ib, T.nb,
                                  A(k, N), ldak,
                                  T(k, N), T.mb,
                                  B(m, N), ldbm);
                      }
                  }
              }
        } else {
            /*
             *  PlasmaRight / PlasmaConjTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k; N < A.nt; N += BS) {
                    tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    for (m = 0; m < B.mt; m++) {
                        ldbm = BLKLDD(B, m);
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, tempNn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldaN,
                                T(k, N), T.mb,
                                B(m, N), ldbm);
                    }
                    for (n = N+1; n < min(N+BS, A.nt); n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm = BLKLDD(B, m);
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, tempNn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(m, N), ldbm,
                                    B(m, n), ldbm,
                                    A(k, n), ldak,
                                    T(k, n), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm   = BLKLDD(B, m);
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempNRDn,
                                    tempkm, ib, T.nb,
                                    B (m, N   ), ldbm,
                                    B (m, N+RD), ldbm,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        }
    }
}
예제 #3
0
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {

        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
//      for (N = k; N < A.nt; N += BS) {
        for (N = k;
             N < A.nt-1 || N == k;  // No rightmost single-column subdomain
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                tempkm, tempNn, ib, T.nb,
                A(k, N), ldak,
                T(k, N), T.mb);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam);
            }
//          for (n = N+1; n < N+BS && n < A.nt; n++) {
            for (n = N+1;
                 (n < N+BS && n < A.nt) || n == A.nt-1; // Suck in rightmost single-column domain
                 n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    tempkm, tempnn, ib, T.nb,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
            }
        }
        for (RD = BS; RD < A.nt-k; RD *= 2) {
//          for (N = k; N+RD < A.nt; N += 2*RD) {
            for (N = k;
                 N+RD < A.nt-1; // No reduction with rightmost single-column subdomain
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    tempkm, tempNRDn, ib, T.nb,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, A.mb, ib, T.nb,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }
    }
}