int CORE_dtsmqr_sytra1( int side, int trans,
                        int m1, int n1, int m2, int n2,
                        int k, int ib,
                        double *A1, int lda1,
                        double *A2, int lda2,
                        double *V, int ldv,
                        double *T, int ldt,
                        double *WORK, int ldwork)
{
    int i, j;

    if ( (m1 != n1) ) {
        coreblas_error(3, "Illegal value of M1, N1");
        return -3;
    }

    /* in-place transposition of A1 */
    for (j = 0; j < n1; j++){
        A1[j + j*lda1] = (A1[j + j*lda1]);

        for (i = j+1; i < m1; i++){
            *WORK = *(A1 + i + j*lda1);
            *(A1 + i + j*lda1) = (*(A1 + j + i*lda1));
            *(A1 + j + i*lda1) = (*WORK);
        }
    }

    CORE_dtsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);

    /* in-place transposition of A1 */
    for (j = 0; j < n1; j++){
        A1[j + j*lda1] = (A1[j + j*lda1]);

        for (i = j+1; i < m1; i++){
            *WORK = *(A1 + i + j*lda1);
            *(A1 + i + j*lda1) = (*(A1 + j + i*lda1));
            *(A1 + j + i*lda1) = (*WORK);
        }
    }

    return PLASMA_SUCCESS;
}
int CORE_dtsmqr_corner( int m1, int n1, int m2, int n2, int m3, int n3,
                        int k, int ib, int nb,
                        double *A1, int lda1,
                        double *A2, int lda2,
                        double *A3, int lda3,
                        double *V, int ldv,
                        double *T, int ldt,
                        double *WORK, int ldwork)
{
    int i, j;
    PLASMA_enum side, trans;

    if ( m1 != n1 ) {
        coreblas_error(1, "Illegal value of M1, N1");
        return -1;
    }

    /*  Rebuild the symmetric block: WORK <- A1 */
    for (j = 0; j < n1; j++)
        for (i = j; i < m1; i++){
            *(WORK + i + j*ldwork) = *(A1 + i + j*lda1);
            if (i > j){
                *(WORK + j + i*ldwork) =  ( *(WORK + i + j*ldwork) );
            }
        }
    
    /*  Copy the transpose of A2: WORK+nb*ldwork <- A2' */
    for (j = 0; j < n2; j++)
        for (i = 0; i < m2; i++){
            *(WORK + j + (i + nb) * ldwork) = ( *(A2 + i + j*lda2) );
        }

    side  = PlasmaLeft;
    trans = PlasmaTrans;

    /*  Left application on |A1| */
    /*                      |A2| */
    CORE_dtsmqr(side, trans, m1, n1, m2, n2, k, ib, 
                WORK, ldwork, A2, lda2, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    /*  Rebuild the symmetric block: WORK+2*nb*ldwork <- A3 */
    for (j = 0; j < n3; j++)
        for (i = j; i < m3; i++){
            *(WORK + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
            if (i != j){
                *(WORK + j + (i + 2*nb) * ldwork) =  ( *(WORK + i + (j + 2*nb) * ldwork) );
            }
        }
    /*  Left application on | A2'| */
    /*                      | A3 | */
    CORE_dtsmqr(side, trans, n2, m2, m3, n3, k, ib, 
                WORK+nb*ldwork, ldwork, WORK+2*nb*ldwork, ldwork, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    side  = PlasmaRight;
    trans = PlasmaNoTrans;

    /*  Right application on | A1 A2' | */
    CORE_dtsmqr(side, trans, m1, n1, n2, m2, k, ib, 
                WORK, ldwork, WORK+nb*ldwork, ldwork, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    /*  Copy back the final result to the lower part of A1 */
    /*  A1 = WORK */
    for (j = 0; j < n1; j++)
        for (i = j; i < m1; i++)
            *(A1 + i + j*lda1) = *(WORK + i + j*ldwork);

    /*  Right application on | A2 A3 | */
    CORE_dtsmqr(side, trans, m2, n2, m3, n3, k, ib, 
                A2, lda2, WORK+2*nb*ldwork, ldwork, 
                V,  ldv,  T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    /*  Copy back the final result to the lower part of A3 */
    /*  A3 = WORK+2*nb*ldwork */
    for (j = 0; j < n3; j++)
        for (i = j; i < m3; i++)
            *(A3 + i + j*lda3) = *(WORK + i + (j+ 2*nb) * ldwork);

    return PLASMA_SUCCESS;
}
Exemple #3
0
/***************************************************************************//**
 *  Parallel tile QR factorization - dynamic scheduling
 **/
void plasma_pdgeqrf_quark(PLASMA_desc A, PLASMA_desc T, int ib)
{

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempkn, tempnn, tempmm;

    for (k = 0; k < min(A.mt, A.nt); k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);
        double *dA = A(k, k);
        double *dT = T(k, k);
#if defined(USE_OMPEXT)
omp_set_task_priority(1);
#endif
#pragma omp task depend(inout: dA[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb])
        {
            double tau[T.nb];
            double work[ib * T.nb];
            CORE_dgeqrt(tempkm, tempkn, ib, dA, ldak, dT, T.mb, &tau[0], &work[0]);
        }

        for (n = k+1; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            double *dA = A(k, k);
            double *dT = T(k, k);
            double *dC = A(k, n);
#pragma omp task depend(in: dA[0:T.nb*T.nb], dT[0:ib*T.nb]) depend(inout:dC[0:T.nb*T.nb])
            {
                double work[T.nb * ib];
                CORE_dormqr(PlasmaLeft, PlasmaTrans,
                        tempkm, tempnn, tempkm, ib,
                        dA, ldak,
                        dT, T.mb,
                        dC, ldak,
                        &work[0], T.nb);
            }
        }
        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            double *dA = A(k, k);
            double *dB = A(m, k);
            double *dT = T(m, k);
#pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(out:dT[0:ib*T.nb])
            {
                double tau[T.nb];
                double work[ib * T.nb];
                CORE_dtsqrt(tempmm, tempkn, ib,
                        dA, ldak,
                        dB, ldam,
                        dT, T.mb, &tau[0], &work[0]);
            }

            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                double *dA = A(k, n);
                double *dB = A(m, n);
                double *dV = A(m, k);
                double *dT = T(m, k);
#pragma omp task depend(inout:dA[0:T.nb*T.nb], dB[0:T.nb*T.nb]) depend(in:dV[0:T.nb*T.nb], dT[0:ib*T.nb])
                {
                    double work[ib * T.nb];
                    CORE_dtsmqr(PlasmaLeft, PlasmaTrans,
                            A.mb, tempnn, tempmm, tempnn, A.nb, ib,
                            dA, ldak,
                            dB, ldam,
                            dV, ldam,
                            dT, T.mb, &work[0], ib);
                }
            }
        }
    }
}
Exemple #4
0
/***************************************************************************//**
 *  Parallel tile QR factorization - static scheduling
 **/
void plasma_pdgeqrf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int tempkm, tempkn, tempnn, tempmm;
    int ib = PLASMA_IB;
    double *work, *tau;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt) {
        k++;
        n = n-A.nt+k;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            if (m == k) {
                ss_cond_wait(k, k, k-1);
                CORE_dgeqrt(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    tau, work);
                ss_cond_set(k, k, k);
            }
            else {
                ss_cond_wait(m, k, k-1);
                CORE_dtsqrt(
                    tempmm, tempkn, ib,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    tau, work);
                ss_cond_set(m, k, k);
            }
        }
        else {
            if (m == k) {
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_dormqr(
                    PlasmaLeft, PlasmaTrans,
                    tempkm, tempnn, tempkm, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    A(k, n), ldak,
                    work, T.nb);
            }
            else {
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_dtsmqr(
                    PlasmaLeft, PlasmaTrans,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    work, ib);
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
Exemple #5
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
 **/
void plasma_pdormqr_quark(PLASMA_enum side, PLASMA_enum trans,
                          PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int ib)
{
    int k, m, n;
    int ldak, ldbk, ldam, ldan, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int minMT, minM;

    if (A.m > A.n) {
        minM  = A.n;
        minMT = A.nt;
    } else {
        minM  = A.m;
        minMT = A.mt;
    }

    double *work = (double *)alloca(sizeof(double) * T.nb * ib);
    /*
     *  PlasmaLeft / PlasmaTrans
     */
    if (side == PlasmaLeft ) {
        if (trans == PlasmaTrans) {
            for (k = 0; k < minMT; k++) {
                tempkm   = k == B.mt-1 ? B.m-k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    double *dA = A(k, k);
                    double *dT = T(k, k);
                    double *dB = B(k, n);
                    {
                        CORE_dormqr(side, trans, tempkm, tempnn, tempkmin, ib,
                                dA, ldak,
                                dT, T.mb,
                                dB, ldbk, work, T.nb);
                    }
                }
                for (m = k+1; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        double *dA = B(k, n);
                        double *dB = B(m, n);
                        double *dV = A(m, k);
                        double *dT = T(m, k);
                        {
                            CORE_dtsmqr(side, trans,
                                    B.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                                    dA, ldbk,
                                    dB, ldbm,
                                    dV, ldam,
                                    dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb);
                        }
                    }
                }
            }
        }
        /*
         *  PlasmaLeft / PlasmaNoTrans
         */
        else {
            for (k = minMT-1; k >= 0; k--) {
                tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (m = B.mt-1; m > k; m--) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        double *dA = B(k, n);
                        double *dB = B(m, n);
                        double *dV = A(m, k);
                        double *dT = T(m, k);
                        {
                            CORE_dtsmqr(side, trans,
                                    B.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                                    dA, ldbk,
                                    dB, ldbm,
                                    dV, ldam,
                                    dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb);
                        }
                    }
                }
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    double *dA = A(k, k);
                    double *dT = T(k, k);
                    double *dB = B(k, n);
                    {
                        CORE_dormqr(side, trans, tempkm, tempnn, tempkmin, ib,
                                dA, ldak,
                                dT, T.mb,
                                dB, ldbk, work, T.nb);
                    }
                }
            }
        }
    }
    /*
     *  PlasmaRight / PlasmaTrans
     */
    else {
        if (trans == PlasmaTrans) {
            for (k = minMT-1; k >= 0; k--) {
                tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (n = B.nt-1; n > k; n--) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        double *dA = B(m, k);
                        double *dB = B(m, n);
                        double *dV = A(n, k);
                        double *dT = T(n, k);
                        {
                            CORE_dtsmqr(side, trans,
                                    tempmm, B.nb, tempmm, tempnn, tempkmin, ib,
                                    dA, ldbm,
                                    dB, ldbm,
                                    dV, ldan,
                                    dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb);
                        }
                    }
                }
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    double *dA = A(k, k);
                    double *dT = T(k, k);
                    double *dB = B(m, k);
                    {
                        CORE_dormqr(side, trans, tempmm, tempkn, tempkmin, ib,
                                dA, ldak,
                                dT, T.mb,
                                dB, ldbm, work, T.nb);
                    }
                }
            }
        }
        /*
         *  PlasmaRight / PlasmaNoTrans
         */
        else {
            for (k = 0; k < minMT; k++) {
                tempkn   = k == B.nt-1 ? B.n-k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k); 
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    double *dA = A(k, k);
                    double *dT = T(k, k);
                    double *dB = B(m, k);
                    {
                        CORE_dormqr(side, trans, tempmm, tempkn, tempkmin, ib,
                                dA, ldak,
                                dT, T.mb,
                                dB, ldbm, work, T.nb);
                    }
                }
                for (n = k+1; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        double *dA = B(m, k);
                        double *dB = B(m, n);
                        double *dV = A(n, k);
                        double *dT = T(n, k);
                        {
                            CORE_dtsmqr(side, trans,
                                    tempmm, B.nb, tempmm, tempnn, tempkmin, ib,
                                    dA, ldbm,
                                    dB, ldbm,
                                    dV, ldan,
                                    dT, T.mb, work, (side == PlasmaLeft)?ib:T.nb);
                        }
                    }
                }
            }
        }
    }
}