예제 #1
0
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - dynamic scheduling
 **/
void plasma_pdplrnt_quark( PLASMA_desc A, unsigned long long int seed,
                           PLASMA_sequence *sequence, PLASMA_request *request )
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n;
    int ldam;
    int tempmm, tempnn;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        ldam = BLKLDD(A, m);

        for (n = 0; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;

            QUARK_CORE_dplrnt( 
                plasma->quark, &task_flags,
                tempmm, tempnn, A(m, n), ldam,
                A.m, m*A.mb, n*A.nb, seed );
        }
    }
}
예제 #2
0
파일: oocdag.cpp 프로젝트: efdazedo/oocdag
void warmup(Quark *q){
    int NB = 200;
    double *H = (double*) malloc(NB*NB*OOC_NTHREADS*sizeof(double));
    double *D = (double*) offload_Alloc(NB*NB*OOC_NTHREADS*sizeof(double), 0);
    
    {
        Quark_Task_Flags tflags = Quark_Task_Flags_Initializer;
//      for(int r = 0; r < OOC_NTHREADS; r++){
        for(int r = 0; r < 2; r++){
            QUARK_Task_Flag_Set(&tflags, TASK_LOCK_TO_THREAD, r);
//          QUARK_Task_Flag_Set(&tflags, THREAD_SET_TO_MANUAL_SCHEDULING, (r==0)||(r==1));
            QUARK_Insert_Task(q, CORE_H2D, &tflags,
                sizeof(int),                &NB,        VALUE,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             H+r*NB*NB,  INPUT,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             D+r*NB*NB,  OUTPUT,
                sizeof(int),                &NB,        VALUE,
                0);
            QUARK_Insert_Task(q, CORE_D2H, &tflags,
                sizeof(int),                &NB,        VALUE,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             D+r*NB*NB,  INPUT,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             H+r*NB*NB,  OUTPUT,
                sizeof(int),                &NB,        VALUE,
                0);
        }
    }
    QUARK_Barrier(q);
    offload_Free(D, 0);
    free(H);
}
예제 #3
0
/***************************************************************************//**
 *
 **/
void plasma_pslag2d_quark(PLASMA_desc SA, PLASMA_desc B,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int X, Y;
    int m, n;
    int ldam, ldbm;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for(m = 0; m < SA.mt; m++) {
        X = m == SA.mt-1 ? SA.m-m*SA.mb : SA.mb;
        ldam = BLKLDD(SA, m);
        ldbm = BLKLDD(B, m);
        for(n = 0; n < SA.nt; n++) {
            Y = n == SA.nt-1 ? SA.n-n*SA.nb : SA.nb;
            QUARK_CORE_slag2d(
                plasma->quark, &task_flags,
                X, Y, SA.mb,
                SA(m, n), ldam,
                B(m, n), ldbm);
        }
    }
}
예제 #4
0
/***************************************************************************//**
 *
 **/
void plasma_pdaxpy_quark(double alpha, PLASMA_desc A, PLASMA_desc B,
                         PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int X, Y;
    int m, n;
    int ldam, ldbm;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        for (n = 0; n < A.nt; n++) {
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_CORE_daxpy(
                plasma->quark, &task_flags,
                X, Y, A.mb,
                alpha, A(m, n), ldam,
                       B(m, n), ldbm);
        }
    }
}
예제 #5
0
/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
 **/
void plasma_pdorgqr_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldqk, ldam, ldqm;
    int tempmm, tempnn, tempkmin, tempkm;
    int tempAkm, tempAkn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (k = min(A.mt, A.nt)-1; k >= 0; k--) {
        tempAkm  = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempAkn  = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempkmin = min( tempAkn, tempAkm );
        tempkm   = k == Q.mt-1 ? Q.m-k*Q.mb : Q.mb;
        ldak = BLKLDD(A, k);
        ldqk = BLKLDD(Q, k);
        for (m = Q.mt - 1; m > k; m--) {
            tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
            ldam = BLKLDD(A, m);
            ldqm = BLKLDD(Q, m);
            for (n = 0; n < Q.nt; n++) {
                tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
                QUARK_CORE_dtsmqr(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaNoTrans,
                    Q.mb, tempnn, tempmm, tempnn, tempAkn, ib, T.nb,
                    Q(k, n), ldqk,
                    Q(m, n), ldqm,
                    A(m, k), ldam,
                    T(m, k), T.mb);
            }
        }
        for (n = 0; n < Q.nt; n++) {
            tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
            QUARK_CORE_dormqr(
                plasma->quark, &task_flags,
                PlasmaLeft, PlasmaNoTrans,
                tempkm, tempnn, tempkmin, ib, T.nb,
                A(k, k), ldak,
                T(k, k), T.mb,
                Q(k, n), ldqk);
        }
    }
}
예제 #6
0
/***************************************************************************//**
 *  Parallel forward substitution for tile LU - dynamic scheduling
 **/
void plasma_pztrsmpl_quark(PLASMA_desc A, PLASMA_desc B, PLASMA_desc L, int *IPIV,
                           PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam, ldbk, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (k = 0; k < min(A.mt, A.nt); k++) {
        tempkm   = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn   = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempkmin = k == min(A.mt, A.nt)-1 ? min(A.m, A.n)-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        for (n = 0; n < B.nt; n++) {
            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
            QUARK_CORE_zgessm(
                plasma->quark, &task_flags,
                tempkm, tempnn, tempkmin, ib, L.nb,
                IPIV(k, k),
                A(k, k), ldak,
                B(k, n), ldbk);
        }
        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            for (n = 0; n < B.nt; n++) {
                tempnn  = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                QUARK_CORE_zssssm(
                    plasma->quark, &task_flags,
                    A.nb, tempnn, tempmm, tempnn, tempkn, ib, L.nb,
                    B(k, n), ldbk,
                    B(m, n), ldbm,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
            }
        }
    }
}
예제 #7
0
/***************************************************************************//**
 *  Parallel tile row interchanges - dynamic scheduling
 **/
void plasma_pclaswp_quark(PLASMA_desc B, int *IPIV, int inc,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n;
    int tempi, tempm, tempmm, tempnn;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    if ( inc > 0 ) 
    {
        for (m = 0; m < B.mt; m++) {
            tempi = m * B.mb;
            tempm = B.m - tempi;
            tempmm = m == B.mt-1 ? tempm : B.mb;

            for (n = 0; n < B.nt; n++) {
                tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
                
                QUARK_CORE_claswp_ontile(
                    plasma->quark, &task_flags,
                    plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
                    B(m, n), 1, tempmm, IPIV(m), inc, B(B.mt-1, n) );
            }
        }
    } 
    else 
    {
        for (m = B.mt-1; m > -1; m--) {
            tempi = m * B.mb;
            tempm = B.m - tempi;
            tempmm = m == B.mt-1 ? tempm : B.mb;

            for (n = 0; n < B.nt; n++) {
                tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
                
                QUARK_CORE_claswp_ontile(
                    plasma->quark, &task_flags,
                    plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
                    B(m, n), 1, tempmm, IPIV(m), inc, B(0, n) );
            }
        }
    } 
}
예제 #8
0
/***************************************************************************//**
 *  Zeroes a submatrix in tile layout - dynamic scheduling
 **/
void plasma_pztile_zero_quark(PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_Complex64_t *bdl;
    plasma_context_t *plasma;
    int X1, Y1;
    int X2, Y2;
    int n, m, ldt;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++)
    {
        ldt = BLKLDD(A, m);
        for (n = 0; n < A.nt; n++)
        {
            X1 = n == 0 ? A.j%A.nb : 0;
            Y1 = m == 0 ? A.i%A.mb : 0;
            X2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
            Y2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;

            bdl = ABDL(m, n);
            QUARK_Insert_Task(plasma->quark, CORE_ztile_zero_quark, &task_flags,
                sizeof(int),                       &X1,  VALUE,
                sizeof(int),                       &X2,  VALUE,
                sizeof(int),                       &Y1,  VALUE,
                sizeof(int),                       &Y2,  VALUE,
                sizeof(PLASMA_Complex64_t)*A.bsiz, bdl,      OUTPUT | LOCALITY,
                sizeof(int),                       &ldt, VALUE,
                0);
        }
    }
}
예제 #9
0
/***************************************************************************//**
 *  Conversion from LAPACK F77 matrix layout to tile layout - dynamic scheduling
 **/
void plasma_pzlapack_to_tile_quark(PLASMA_Complex64_t *Af77, int lda, PLASMA_desc A,
                                   PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_Complex64_t *f77;
    PLASMA_Complex64_t *bdl;
    plasma_context_t *plasma;
    int X1, Y1;
    int X2, Y2;
    int n, m, ldt;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++)
    {
        ldt = BLKLDD(A, m);
        for (n = 0; n < A.nt; n++)
        {
            X1 = n == 0 ? A.j%A.nb : 0;
            Y1 = m == 0 ? A.i%A.mb : 0;
            X2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
            Y2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;

            f77 = AF77(m, n);
            bdl = ABDL(m, n);
            QUARK_CORE_zlacpy(
                plasma->quark, &task_flags,
                PlasmaUpperLower, (Y2-Y1), (X2-X1), A.mb,
                &(f77[X1*lda+Y1]), lda, 
                &(bdl[X1*lda+Y1]), ldt);
        }
    }
}
예제 #10
0
/***************************************************************************//**
 *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
 **/
void plasma_psgeqrfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, M, RD;
    int ldaM, ldam, ldaMRD;
    int tempkn, tempMm, tempnn, tempmm, tempMRDm;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        for (M = k;
             M < A.mt-1 || M == k;  /* No bottom single-row subdomain */
             M += BS) {
            tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb;
            ldaM = BLKLDD(A, M);
            QUARK_CORE_sgeqrt(
                plasma->quark, &task_flags,
                tempMm, tempkn, ib, T.nb,
                A(M, k), ldaM,
                T(M, k), T.mb);

            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_sormqr(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaTrans,
                    tempMm, tempnn, tempMm, ib, T.nb,
                    A(M, k), ldaM,
                    T(M, k), T.mb,
                    A(M, n), ldaM);
            }
            for (m = M+1;
                 (m < M+BS && m < A.mt) || m == A.mt-1; /* Suck in bottom single-row domain */
                 m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_stsqrt(
                    plasma->quark, &task_flags,
                    tempmm, tempkn, ib, T.nb,
                    A(M, k), ldaM,
                    A(m, k), ldam,
                    T(m, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                    QUARK_CORE_stsmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
                        A(M, n), ldaM,
                        A(m, n), ldam,
                        A(m, k), ldam,
                        T(m, k), T.mb);
                }
            }
        }
        for (RD = BS; RD < A.mt-k; RD *= 2) {
            for (M = k;
                 M+RD < A.mt-1; /* No reduction with bottom single-row subdomain */
                 M += 2*RD) {
                tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                ldaM   = BLKLDD(A, M   );
                ldaMRD = BLKLDD(A, M+RD);
                QUARK_CORE_sttqrt(
                    plasma->quark, &task_flags,
                    tempMRDm, tempkn, ib, T.nb,
                    A (M   , k), ldaM,
                    A (M+RD, k), ldaMRD,
                    T2(M+RD, k), T.mb);

                for (n = k+1; n < A.nt; n++) {
                    tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                    QUARK_CORE_sttmqr(
                        plasma->quark, &task_flags,
                        PlasmaLeft, PlasmaTrans,
                        A.nb, tempnn, tempMRDm, tempnn, A.nb, ib, T.nb,
                        A (M,    n), ldaM,
                        A (M+RD, n), ldaMRD,
                        A (M+RD, k), ldaMRD,
                        T2(M+RD, k), T.mb);
                }
            }
        }
    }
}
예제 #11
0
/***************************************************************************//**
 *  Parallel tile triangular matrix inverse - dynamic scheduling
 **/
void plasma_pztrtri_quark(PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldam, ldan;
    int tempkn, tempmm, tempnn;

    PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (n = 0; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            ldan = BLKLDD(A, n);
            for (m = n+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaNoTrans, diag,
                    tempmm, tempnn, A.mb,
                    mzone, A(n, n), ldan,
                           A(m, n), ldam);
            }
            for (m = n+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                for (k = 0; k < n; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    QUARK_CORE_zgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaNoTrans,
                        tempmm, tempkn, tempnn, A.mb,
                        zone, A(m, n), ldam,
                              A(n, k), ldan,
                        zone, A(m, k), ldam);
                }
            }
            for (m = 0; m < n; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaNoTrans, diag,
                    tempnn, tempmm, A.mb,
                    zone, A(n, n), ldan,
                          A(n, m), ldan);
            }
            QUARK_CORE_ztrtri(
                plasma->quark, &task_flags,
                uplo, diag,
                tempnn, A.mb,
                A(n, n), ldan,
                sequence, request, A.nb*n);
        }
    }
    /*
     *  PlasmaUpper
     */
    else {
        for (m = 0; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            for (n = m+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaNoTrans, diag,
                    tempmm, tempnn, A.mb,
                    mzone, A(m, m), ldam,
                           A(m, n), ldam);
            }
            for (n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                ldan = BLKLDD(A, n);
                for (k = m+1; k < A.nt; k++) {
                    tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                    QUARK_CORE_zgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaNoTrans,
                        tempnn, tempkn, tempmm, A.mb,
                        zone, A(n, m), ldan,
                              A(m, k), ldam,
                        zone, A(n, k), ldan);
                }
                QUARK_CORE_ztrsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaNoTrans, diag,
                    tempnn, tempmm, A.mb,
                    zone, A(m, m), ldam,
                          A(n, m), ldan);
            }
            QUARK_CORE_ztrtri(
                plasma->quark, &task_flags,
                uplo, diag,
                tempmm, A.mb,
                A(m, m), ldam,
                sequence, request, A.mb*m);
        }
    }
}
예제 #12
0
/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 * plasma_pcgetmi2_quark - realises nprob independant transpositions. Each
 * subproblem is a tile of mb-by-nb elements.
 * This function use an extra space of PLASMA_SIZE*(mb*nb). This is a
 * maximum in case of dynamic scheduling.
 * 
 *
 *******************************************************************************
 *
 * @param[in] idep
 *       PlasmaIPT_Nodep: No fake dependencies are added.
 *       PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb.
 *       PlasmaIPT_All:   A gatherv is added on the whole matrix.
 *
 * @param[in] odep
 *       PlasmaIPT_Nodep: No fake dependencies are added.
 *       PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb.
 *       PlasmaIPT_All:   A gatherv is added on the whole matrix.
 *
 * @param[in] storev
 *       PlasmaColumnWise: Data stored in column major.
 *       PlasmaRowWise: Data stored in row major.
 *
 * @param[in] m
 *       Number of row of A if tiles are sorted in column major format,
 *       number of columns otherwise.
 *
 * @param[in] n
 *       Number of columns of A if tiles are sorted in column major format,
 *       number of rows otherwise.
 *
 * @param[in] mb
 *       Number of rows in each individual subproblem if storev == PlasmaColumnWise, 
 *       number of columns otherwise. m%mb must be 0.
 *
 * @param[in] nb
 *       Number of columns in each individual subproblem if storev == PlasmaColumnWise, 
 *       number of rows otherwise. n%nb must be 0.
 *
 * @param[in,out] A
 *       Matrix of size m*n.
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @sa plasma_pcgetmi2
 *
 ******************************************************************************/
void plasma_pcgetmi2_quark(PLASMA_enum idep, PLASMA_enum odep, PLASMA_enum storev, 
                           int m, int n, int mb, int nb, PLASMA_Complex32_t *A,
                           PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    PLASMA_Complex32_t *Al, *Ap;
    int i, j, nprob, mt, nt;
    int bsiz, psiz, size;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* quick return */
    if( (mb < 2) || (nb < 2) ) {
        return ;
    }

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    bsiz  = mb*nb;
    if ( storev == PlasmaColumnwise ) {
        psiz = m*nb;
        mt = ( m / mb );
        nt = ( n / nb );
    } else {
        psiz = n*mb;
        mt = ( n / nb );
        nt = ( m / mb );    
    }

    size = m*n;

    switch ( idep ) {
/*
 * Dependencies on each panel as input
 */
    case PlasmaIPT_Panel:
        switch ( odep ) {
        case PlasmaIPT_Panel:
            for (j=0; j<nt; j++) {
                Ap = A + (psiz*j);
                for (i=0; i<mt; i++) {
                    Al = Ap + i*bsiz;
                    QUARK_CORE_cgetrip_f1(
                        plasma->quark, &task_flags,
                        mb, nb, Al, bsiz, 
                        Ap, psiz, INOUT|GATHERV);
                }
            }
            break;
              
        case PlasmaIPT_All:
            for (j=0; j<nt; j++) {
                Ap = A + (psiz*j);
                for (i=0; i<mt; i++) {
                    Al = Ap + i*bsiz;
                    
                    QUARK_CORE_cgetrip_f2(plasma->quark, &task_flags,
                                          mb, nb, Al, bsiz, 
                                          Ap, size, INPUT,
                                          A,  size, INOUT|GATHERV);
                }
            }
            break;

        case PlasmaIPT_NoDep:
        default:
            for (j=0; j<nt; j++) {
                Ap = A + (psiz*j);
                for (i=0; i<mt; i++) {
                    Al = Ap + i*bsiz;
                    QUARK_CORE_cgetrip_f1(
                        plasma->quark, &task_flags,
                        mb, nb, Al, bsiz, 
                        Ap, psiz, INPUT);
                }
            }
        }
        break;
        
/*
 * Dependency on all the matrix as input
 */
    case PlasmaIPT_All:
        switch ( odep ) {
        case PlasmaIPT_Panel:
            for (j=0; j<nt; j++) {
                Ap = A + (psiz*j);
                for (i=0; i<mt; i++) {
                    Al = Ap + i*bsiz;
                    QUARK_CORE_cgetrip_f2(
                        plasma->quark, &task_flags,
                        mb, nb, Al, bsiz, 
                        A,  size, INPUT,
                        Ap, psiz, INOUT|GATHERV);
                }
            }
            break;
              
        case PlasmaIPT_All:
            nprob = mt*nt;
            for (i=0; i<nprob; i++) {
                QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
                                      mb, nb, &(A[ i*bsiz ]), bsiz, 
                                      A, size, INOUT|GATHERV);
            }
            break;
            
        case PlasmaIPT_NoDep:
        default:
            nprob = mt*nt;
            for (i=0; i<nprob; i++) {
                QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
                                      mb, nb, &(A[ i*bsiz ]), bsiz, 
                                      A, size, INPUT);
            }
        }
        break;
        
/*
 * No Dependencies as input
 */
    case PlasmaIPT_NoDep:
    default:
        switch ( odep ) {
        case PlasmaIPT_Panel:
            for (j=0; j<nt; j++) {
                Ap = A + (psiz*j);
                for (i=0; i<mt; i++) {
                    Al = Ap + i*bsiz;
                    QUARK_CORE_cgetrip_f1(
                        plasma->quark, &task_flags,
                        mb, nb, Al, bsiz, 
                        Ap, psiz, INOUT|GATHERV);
                }
            }
            break;
              
        case PlasmaIPT_All:
            nprob = mt*nt;
            for (i=0; i<nprob; i++) {
                QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
                                      mb, nb, &(A[ i*bsiz ]), bsiz, 
                                      A, size, INOUT|GATHERV);
            }
            break;

        case PlasmaIPT_NoDep:
        default:
            nprob = mt*nt;
            for (i=0; i<nprob; i++) {
                QUARK_CORE_cgetrip(plasma->quark, &task_flags,
                                   mb, nb,  &(A[ i*bsiz ]), bsiz);
            }
        }
    }
}
예제 #13
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization (reduction
 *  Householder) - dynamic scheduling
 **/
void plasma_pzunmlqrh_quark(PLASMA_enum side, PLASMA_enum trans,
        PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS,
        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD, lastRD;
    int ldaN, ldak;
    int ldbN, ldbm, ldbNRD;
    int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    if (side == PlasmaLeft ) {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k; N < A.nt; N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    ldbN = BLKLDD(B, N);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                    for (m = N+1; m < min(N+BS, A.nt); m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        } else {
            /*
             *  PlasmaLeft / PlasmaConjTrans
             */
            for (k = K-1; k >= 0; k--) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                lastRD = 0;
                for (RD = BS; RD < A.nt-k; RD *= 2)
                    lastRD = RD;
                for (RD = lastRD; RD >= BS; RD /= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        ldbN   = BLKLDD(B, N   );
                        ldbNRD = BLKLDD(B, N+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.nb, tempnn, tempNRDn, tempnn,
                                    tempkm, ib, T.nb,
                                    B (N,    n), ldbN,
                                    B (N+RD, n), ldbNRD,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
                for (N = k; N < A.nt; N += BS) {
                    tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    ldbN = BLKLDD(B, N);
                    for (m = min(N+BS, A.nt)-1; m > N; m--) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    B.mb, tempnn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(N, n), ldbN,
                                    B(m, n), ldbm,
                                    A(k, m), ldak,
                                    T(k, m), T.mb);
                        }
                    }
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempNn, tempnn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldak,
                                T(k, N), T.mb,
                                B(N, n), ldbN);
                    }
                }
            }

        }
    } else {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
              for (k = K-1; k >= 0; k--) {
                  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                  ldak = BLKLDD(A, k);
                  lastRD = 0;
                  for (RD = BS; RD < A.nt-k; RD *= 2)
                      lastRD = RD;
                  for (RD = lastRD; RD >= BS; RD /= 2) {
                      for (N = k; N+RD < A.nt; N += 2*RD) {
                          tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                          for (m = 0; m < B.mt; m++) {
                              ldbm   = BLKLDD(B, m);
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              QUARK_CORE_zttmlq(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, B.nb, tempmm, tempNRDn,
                                      tempkm, ib, T.nb,
                                      B (m, N   ), ldbm,
                                      B (m, N+RD), ldbm,
                                      A (k, N+RD), ldak,
                                      T2(k, N+RD), T.mb);
                          }
                      }
                  }
                  for (N = k; N < A.nt; N += BS) {
                      tempNn   = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                      tempkmin = min(tempkm,tempNn);
                      for (n = min(N+BS, A.nt)-1; n > N; n--) {
                          tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                          for (m = 0; m < B.mt; m++) {
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              ldbm = BLKLDD(B, m);
                              QUARK_CORE_ztsmlq(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, B.nb, tempmm, tempnn,
                                      tempkm, ib, T.nb,
                                      B(m, N), ldbm,
                                      B(m, n), ldbm,
                                      A(k, n), ldak,
                                      T(k, n), T.mb);
                          }
                      }
                      for (m = 0; m < B.mt; m++) {
                          tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                          ldbm = BLKLDD(B, m);
                          QUARK_CORE_zunmlq(
                                  plasma->quark, &task_flags,
                                  side, trans,
                                  tempmm, tempNn,
                                  tempkmin, ib, T.nb,
                                  A(k, N), ldak,
                                  T(k, N), T.mb,
                                  B(m, N), ldbm);
                      }
                  }
              }
        } else {
            /*
             *  PlasmaRight / PlasmaConjTrans
             */
            for (k = 0; k < K; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (N = k; N < A.nt; N += BS) {
                    tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
                    tempkmin = min(tempkm,tempNn);
                    ldaN = BLKLDD(A, N);
                    for (m = 0; m < B.mt; m++) {
                        ldbm = BLKLDD(B, m);
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        QUARK_CORE_zunmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, tempNn,
                                tempkmin, ib, T.nb,
                                A(k, N), ldaN,
                                T(k, N), T.mb,
                                B(m, N), ldbm);
                    }
                    for (n = N+1; n < min(N+BS, A.nt); n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm = BLKLDD(B, m);
                            QUARK_CORE_ztsmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, tempNn, tempmm, tempnn,
                                    tempkm, ib, T.nb,
                                    B(m, N), ldbm,
                                    B(m, n), ldbm,
                                    A(k, n), ldak,
                                    T(k, n), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.nt-k; RD *= 2) {
                    for (N = k; N+RD < A.nt; N += 2*RD) {
                        tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm   = BLKLDD(B, m);
                            QUARK_CORE_zttmlq(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempNRDn,
                                    tempkm, ib, T.nb,
                                    B (m, N   ), ldbm,
                                    B (m, N+RD), ldbm,
                                    A (k, N+RD), ldak,
                                    T2(k, N+RD), T.mb);
                        }
                    }
                }
            }
        }
    }
}
예제 #14
0
/***************************************************************************//**
 *  Parallel initializztion a 2-D array A to 
 *  ALPHA on the offdiagonals.
 **/
void plasma_pzlaset2_quark(PLASMA_enum uplo, PLASMA_Complex64_t alpha, 
                           PLASMA_desc A,
                           PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int i, j;
    int ldai, ldaj;
    int tempim;
    int tempjm, tempjn;
    int minmn = min(A.mt, A.nt);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    if (uplo == PlasmaLower) {
       for (j = 0; j < minmn; j++){
           tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
           tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
           ldaj = BLKLDD(A, j);
           QUARK_CORE_zlaset2(
               plasma->quark, &task_flags,
               PlasmaLower, tempjm, tempjn, alpha,
               A(j, j), ldaj);

           for (i = j+1; i < A.mt; i++){
               tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
               ldai = BLKLDD(A, i);
               QUARK_CORE_zlaset2(
                   plasma->quark, &task_flags,
                   PlasmaUpperLower, tempim, tempjn, alpha,
                   A(i, j), ldai);
           }
       }
    }
    else if (uplo == PlasmaUpper) {
       for (j = 1; j < A.nt; j++){
           tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
           for (i = 0; i < min(j, A.mt); i++){
               tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
               ldai = BLKLDD(A, i);
               QUARK_CORE_zlaset2(
                   plasma->quark, &task_flags,
                   PlasmaUpperLower, tempim, tempjn, alpha,
                   A(i, j), ldai);
           }
       }
       for (j = 0; j < minmn; j++){
           tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
           tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
           ldaj = BLKLDD(A, j);
           QUARK_CORE_zlaset2(
               plasma->quark, &task_flags,
               PlasmaUpper, tempjm, tempjn, alpha,
               A(j, j), ldaj);
       }
    }
    else {
       for (i = 0; i < A.mt; i++){
           tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
           ldai = BLKLDD(A, i);
           for (j = 0; j < A.nt; j++){
               tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
               QUARK_CORE_zlaset2(
                   plasma->quark, &task_flags,
                   PlasmaUpperLower, tempim, tempjn, alpha,
                   A(i, j), ldai);
           }
       }
    } 
}
예제 #15
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
 **/
void plasma_pzunmlq_quark(PLASMA_enum side, PLASMA_enum trans,
        PLASMA_desc A, PLASMA_desc B, PLASMA_desc T,
        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldan, ldbk, ldbm;
    int tempmm, tempnn, tempkn, tempkm, tempkmin;
    int ib, minMT, minM;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    if (A.m > A.n) {
        minM  = A.n;
        minMT = A.nt;
    } else {
        minM  = A.m;
        minMT = A.mt;
    }

    if (side == PlasmaLeft ) {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = 0; k < minMT; k++) {
                tempkm   = k == B.mt -1 ? B.m -k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    QUARK_CORE_zunmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempkm, tempnn, tempkmin, ib, T.nb,
                            A(k, k), ldak,
                            T(k, k), T.mb,
                            B(k, n), ldbk);
                }
                for (m = k+1; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_ztsmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
                                B(k, n), ldbk,
                                B(m, n), ldbm,
                                A(k, m), ldak,
                                T(k, m), T.mb);
                    }
                }
            }
        }
        else {
            /*
             *  PlasmaLeft / PlasmaConjTrans
             */
            for (k = minMT-1; k >= 0; k--) {
                tempkm   = k == B.mt -1 ? B.m -k*B.mb : B.mb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (m = B.mt-1; m > k; m--) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B.nt; n++) {
                        tempnn   = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_ztsmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
                                B(k, n), ldbk,
                                B(m, n), ldbm,
                                A(k, m), ldak,
                                T(k, m), T.mb);
                    }
                }
                for (n = 0; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    QUARK_CORE_zunmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempkm, tempnn, tempkmin, ib, T.nb,
                            A(k, k), ldak,
                            T(k, k), T.mb,
                            B(k, n), ldbk);
                }
            }
        }
    }
    else {
        if (trans == PlasmaNoTrans) {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
            for (k = minMT-1; k >= 0; k--) {
                tempkn   = k == B.nt -1 ? B.n -k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
                ldak = BLKLDD(A, k);
                for (n = B.nt-1; n > k; n--) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        QUARK_CORE_ztsmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
                                B(m, k), ldbm,
                                B(m, n), ldbm,
                                A(k, n), ldak,
                                T(k, n), T.mb);
                    }
                }
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    QUARK_CORE_zunmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempmm, tempkn, tempkmin, ib, T.nb,
                            A(k, k), ldak,
                            T(k, k), T.mb,
                            B(m, k), ldbm);
                }
            }
        }
        else {
            /*
             *  PlasmaRight / PlasmaConjTrans
             */
            for (k = 0; k < minMT; k++) {
                tempkn   = k == B.nt -1 ? B.n -k*B.nb : B.nb;
                tempkmin = k == minMT-1 ? minM-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                for (m = 0; m < B.mt; m++) {
                    tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                    ldbm = BLKLDD(B, m);
                    QUARK_CORE_zunmlq(
                            plasma->quark, &task_flags,
                            side, trans,
                            tempmm, tempkn, tempkmin, ib, T.nb,
                            A(k, k), ldak,
                            T(k, k), T.mb,
                            B(m, k), ldbm);
                }
                for (n = k+1; n < B.nt; n++) {
                    tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                    for (m = 0; m < B.mt; m++) {
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        ldbm = BLKLDD(B, m);
                        QUARK_CORE_ztsmlq(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
                                B(m, k), ldbm,
                                B(m, n), ldbm,
                                A(k, n), ldak,
                                T(k, n), T.mb);
                    }
                }
            }
        }
    }
}
예제 #16
0
/***************************************************************************//**
 *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
 **/
void plasma_pzherbt_quark(PLASMA_enum uplo, 
                          PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n, i, j;
    int ldak, ldam, ldan, ldaj, ldai;
    int tempkn, tempmm, tempnn, tempjj;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    if (uplo == PlasmaLower) {
       for (k = 0; k < A.nt-1; k++){
           tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
           ldak = BLKLDD(A, k+1);
           QUARK_CORE_zgeqrt(
               plasma->quark, &task_flags,
               tempkn, A.nb, ib, T.nb,
               A(k+1, k), ldak,
               T(k+1, k), T.mb);

           /* LEFT and RIGHT on the symmetric diagonal block */
           QUARK_CORE_zherfb(
               plasma->quark, &task_flags,
               PlasmaLower,
               tempkn, tempkn, ib, T.nb,
               A(k+1,   k), ldak,
               T(k+1,   k), T.mb,
               A(k+1, k+1), ldak);

           /* RIGHT on the remaining tiles until the bottom */
           for (m = k+2; m < A.mt ; m++) {
               tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
               ldam = BLKLDD(A, m);
               QUARK_CORE_zunmqr(
                   plasma->quark, &task_flags,
                   PlasmaRight, PlasmaNoTrans,
                   tempmm, A.nb, tempkn, ib, T.nb,
                   A(k+1,   k), ldak,
                   T(k+1,   k), T.mb,
                   A(m  , k+1), ldam);
           }

           for (m = k+2; m < A.mt; m++) {
               tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
               ldam = BLKLDD(A, m);
               QUARK_CORE_ztsqrt(
                   plasma->quark, &task_flags,
                   tempmm, A.nb, ib, T.nb,
                   A(k+1, k), ldak,
                   A(m  , k), ldam,
                   T(m  , k), T.mb);

               /* LEFT */
               for (i = k+2; i < m; i++) {
                   ldai = BLKLDD(A, i);
                   QUARK_CORE_ztsmqr_hetra1(
                       plasma->quark, &task_flags,
                       PlasmaLeft, PlasmaConjTrans,
                       A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb,
                       A(i, k+1), ldai,
                       A(m,   i), ldam,
                       A(m,   k), ldam,
                       T(m,   k), T.mb);
               }

               /* RIGHT */
               for (j = m+1; j < A.mt ; j++) {
                   tempjj = j == A.mt-1 ? A.m-j*A.mb : A.mb;
                   ldaj = BLKLDD(A, j);
                   QUARK_CORE_ztsmqr(
                       plasma->quark, &task_flags,
                       PlasmaRight, PlasmaNoTrans,
                       tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb,
                       A(j, k+1), ldaj,
                       A(j,   m), ldaj,
                       A(m,   k), ldam,
                       T(m,   k), T.mb);
               }
       
               /* LEFT->RIGHT */
               QUARK_CORE_ztsmqr_corner(
                   plasma->quark, &task_flags,
                   A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb,
                   A(k+1, k+1), ldak,
                   A(m  , k+1), ldam,
                   A(m  ,   m), ldam,
                   A(m  ,   k), ldam,
                   T(m  ,   k), T.mb);
           }
       }
    }
    else {
       for (k = 0; k < A.nt-1; k++){
           tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
           ldak = BLKLDD(A, k+1);
           QUARK_CORE_zgelqt(
               plasma->quark, &task_flags,
               A.nb, tempkn, ib, T.nb,
               A(k, k+1), A.nb,
               T(k, k+1), T.mb);

           /* RIGHT and LEFT on the symmetric diagonal block             */
           QUARK_CORE_zherfb(
               plasma->quark, &task_flags,
               PlasmaUpper,
               tempkn, tempkn, ib, T.nb,
               A(k,   k+1), A.nb,
               T(k,   k+1), T.mb,
               A(k+1, k+1), ldak);

           /* LEFT on the remaining tiles until the left side */
           for (n = k+2; n < A.nt ; n++) {
               tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
               QUARK_CORE_zunmlq(
                   plasma->quark, &task_flags,
                   PlasmaLeft, PlasmaNoTrans,
                   A.nb, tempnn, tempkn, ib, T.nb,
                   A(k,   k+1), A.nb,
                   T(k,   k+1), T.mb,
                   A(k+1,   n), ldak);
           }

           for (n = k+2; n < A.nt; n++) {
               tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
               ldan = BLKLDD(A, n);
               QUARK_CORE_ztslqt(
                   plasma->quark, &task_flags,
                   A.nb, tempnn, ib, T.nb,
                   A(k, k+1), A.nb,
                   A(k,   n), A.nb,
                   T(k,   n), T.mb);

               /* RIGHT */
               for (i = k+2; i < n; i++) {
                   ldai = BLKLDD(A, i);
                   QUARK_CORE_ztsmlq_hetra1(
                       plasma->quark, &task_flags,
                       PlasmaRight, PlasmaConjTrans,
                       A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb,
                       A(k+1, i), ldak,
                       A(i,   n), ldai,
                       A(k,   n), A.nb,
                       T(k,   n), T.mb);
               }

               /* LEFT */
               for (j = n+1; j < A.nt ; j++) {
                   tempjj = j == A.nt-1 ? A.n-j*A.nb : A.nb;
                   ldaj = BLKLDD(A, j);
                   QUARK_CORE_ztsmlq(
                       plasma->quark, &task_flags,
                       PlasmaLeft, PlasmaNoTrans,
                       A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb,
                       A(k+1, j), ldak,
                       A(n,   j), ldan,
                       A(k,   n), A.nb,
                       T(k,   n), T.mb);
               }
       
               /* RIGHT->LEFT */
               QUARK_CORE_ztsmlq_corner(
                   plasma->quark, &task_flags,
                   A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb,
                   A(k+1, k+1), ldak,
                   A(k+1,   n), ldak,
                   A(n  ,   n), ldan,
                   A(k  ,   n), A.nb,
                   T(k  ,   n), T.mb);
           }
       }
    }
}
예제 #17
0
/***************************************************************************//**
 *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
 **/
void plasma_pzgelqfrh_quark(PLASMA_desc A, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD;
    int ldak, ldam;
    int tempkm, tempNn, tempmm, tempnn, tempNRDn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);
    for (k = 0; k < K; k++) {

        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
//      for (N = k; N < A.nt; N += BS) {
        for (N = k;
             N < A.nt-1 || N == k;  // No rightmost single-column subdomain
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            QUARK_CORE_zgelqt(
                plasma->quark, &task_flags,
                tempkm, tempNn, ib, T.nb,
                A(k, N), ldak,
                T(k, N), T.mb);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_zunmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempNn, tempNn, ib, T.nb,
                    A(k, N), ldak,
                    T(k, N), T.mb,
                    A(m, N), ldam);
            }
//          for (n = N+1; n < N+BS && n < A.nt; n++) {
            for (n = N+1;
                 (n < N+BS && n < A.nt) || n == A.nt-1; // Suck in rightmost single-column domain
                 n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ztslqt(
                    plasma->quark, &task_flags,
                    tempkm, tempnn, ib, T.nb,
                    A(k, N), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_ztsmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                        A(m, N), ldam,
                        A(m, n), ldam,
                        A(k, n), ldak,
                        T(k, n), T.mb);
                }
            }
        }
        for (RD = BS; RD < A.nt-k; RD *= 2) {
//          for (N = k; N+RD < A.nt; N += 2*RD) {
            for (N = k;
                 N+RD < A.nt-1; // No reduction with rightmost single-column subdomain
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                QUARK_CORE_zttlqt(
                    plasma->quark, &task_flags,
                    tempkm, tempNRDn, ib, T.nb,
                    A (k, N   ), ldak,
                    A (k, N+RD), ldak,
                    T2(k, N+RD), T.mb);

                for (m = k+1; m < A.mt; m++) {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);
                    QUARK_CORE_zttmlq(
                        plasma->quark, &task_flags,
                        PlasmaRight, PlasmaConjTrans,
                        tempmm, A.nb, tempmm, tempNRDn, A.mb, ib, T.nb,
                        A (m, N   ), ldam,
                        A (m, N+RD), ldam,
                        A (k, N+RD), ldak,
                        T2(k, N+RD), T.mb);
                }
            }
        }
    }
}
예제 #18
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pzhbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    static double             dzero = (double) 0.0;
    PLASMA_Complex64_t ztmp;
    double absztmp;
#endif

    PLASMA_Complex64_t *C, *S;
    int blksweep, lcsweep, blkid, lcNB;
    int N, NB, NT, grsiz, lcgrsiz;
    int i;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    NT = A.nt;
    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }

    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = cabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = cabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  creal( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = cabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        PLASMA_Complex64_t *work, *TTau;
        int info, ldwork = N*N;
        work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
        TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

        info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    C   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);
    S   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    grsiz = 1;
    if( NB > 160 ) {
        grsiz = 1;
    }
    else if( NB > 100 ) {
        grsiz = 1;
        /*
        if( N < 5000 )
            grsiz = 1;
        else
            grsiz = 2;
        */
    } else {
        grsiz = 2;
    }
    grsiz = max(1, grsiz);

    /*grsiz=1;*/
    /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d    A.ln %5d   A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/
    for (blksweep = 0; blksweep<NT; blksweep++){
       lcNB =  blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb;
       /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d blksweep%5d     NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/
       for (lcsweep = 0; lcsweep<lcNB; lcsweep++){
          for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){
              lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid;
              /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d lcgrsiz %5d  blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/
              QUARK_CORE_ztrdalg_v2(
                        plasma->quark, &task_flags,
                        uplo, &A, C, S,
                        lcgrsiz, lcsweep, blkid, blksweep);
             }
       }
    }
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = creal( *A(i,i) );
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = creal(*A(i+1, i));
          else
              E[i] = cabs( *A(i+1, i));
       }
        D[i] = creal( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  creal( *A(i,i) );
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = creal(*A(i, (i+1)));
            else
                E[i] = cabs(*A(i, (i+1)));
        }
        D[i] = creal( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
예제 #19
0
/***************************************************************************//**
 *  Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling
 **/
void plasma_pdsymm_quark(PLASMA_enum side, PLASMA_enum uplo,
                          double alpha, PLASMA_desc A, PLASMA_desc B,
                          double beta, PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int lda, ldak, ldb, ldc;
    int tempmm, tempnn, tempkn, tempkm;

    double zbeta;
    double zone = (double)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < C.mt; m++) {
        tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
        ldc = BLKLDD(C, m);
        for (n = 0; n < C.nt; n++) {
            tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
            /*
             *  PlasmaLeft / PlasmaLower
             */
            if (side == PlasmaLeft) {
                lda = BLKLDD(A, m);
                if (uplo == PlasmaLower) {
                    for (k = 0; k < C.mt; k++) {
                        tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb;
                        ldak = BLKLDD(A, k);
                        ldb  = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        if (k < m) {
                            QUARK_CORE_dgemm(
                                plasma->quark, &task_flags,
                                PlasmaNoTrans, PlasmaNoTrans,
                                tempmm, tempnn, tempkm, A.mb,
                                alpha, A(m, k), lda,  /* lda * K */
                                       B(k, n), ldb,  /* ldb * Y */
                                zbeta, C(m, n), ldc); /* ldc * Y */
                        }
                        else {
                            if (k == m) {
                                QUARK_CORE_dsymm(
                                    plasma->quark, &task_flags,
                                    side, uplo,
                                    tempmm, tempnn, A.mb,
                                    alpha, A(k, k), ldak, /* ldak * X */
                                           B(k, n), ldb,  /* ldb  * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                            else {
                                QUARK_CORE_dgemm(
                                    plasma->quark, &task_flags,
                                    PlasmaTrans, PlasmaNoTrans,
                                    tempmm, tempnn, tempkm, A.mb,
                                    alpha, A(k, m), ldak, /* ldak * X */
                                           B(k, n), ldb,  /* ldb  * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                        }
                    }
                }
                /*
                 *  PlasmaLeft / PlasmaUpper
                 */
                else {
                    for (k = 0; k < C.mt; k++) {
                        tempkm = k == C.mt-1 ? C.m-k*C.mb : C.mb;
                        ldak = BLKLDD(A, k);
                        ldb  = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        if (k < m) {
                            QUARK_CORE_dgemm(
                                plasma->quark, &task_flags,
                                PlasmaTrans, PlasmaNoTrans,
                                tempmm, tempnn, tempkm, A.mb,
                                alpha, A(k, m), ldak, /* ldak * X */
                                       B(k, n), ldb,  /* ldb  * Y */
                                zbeta, C(m, n), ldc); /* ldc  * Y */
                        }
                        else {
                            if (k == m) {
                                QUARK_CORE_dsymm(
                                    plasma->quark, &task_flags,
                                    side, uplo,
                                    tempmm, tempnn, A.mb,
                                    alpha, A(k, k), ldak, /* ldak * K */
                                           B(k, n), ldb,  /* ldb  * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                            else {
                                QUARK_CORE_dgemm(
                                    plasma->quark, &task_flags,
                                    PlasmaNoTrans, PlasmaNoTrans,
                                    tempmm, tempnn, tempkm, A.mb,
                                    alpha, A(m, k), lda,  /* lda * K */
                                           B(k, n), ldb,  /* ldb * Y */
                                    zbeta, C(m, n), ldc); /* ldc * Y */
                            }
                        }
                    }
                }
            }
            /*
             *  PlasmaRight / PlasmaLower
             */
            else {
                lda = BLKLDD(A, n);
                ldb = BLKLDD(B, m);
                if (uplo == PlasmaLower) {
                    for (k = 0; k < C.nt; k++) {
                        tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb;
                        ldak = BLKLDD(A, k);
                        zbeta = k == 0 ? beta : zone;
                        if (k < n) {
                            QUARK_CORE_dgemm(
                                plasma->quark, &task_flags,
                                PlasmaNoTrans, PlasmaTrans,
                                tempmm, tempnn, tempkn, A.mb,
                                alpha, B(m, k), ldb,  /* ldb * K */
                                       A(n, k), lda,  /* lda * K */
                                zbeta, C(m, n), ldc); /* ldc * Y */
                        }
                        else {
                            if (k == n) {
                                QUARK_CORE_dsymm(
                                    plasma->quark, &task_flags,
                                    side, uplo,
                                    tempmm, tempnn, A.mb,
                                    alpha, A(k, k), ldak, /* ldak * Y */
                                           B(m, k), ldb,  /* ldb  * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                            else {
                                QUARK_CORE_dgemm(
                                    plasma->quark, &task_flags,
                                    PlasmaNoTrans, PlasmaNoTrans,
                                    tempmm, tempnn, tempkn, A.mb,
                                    alpha, B(m, k), ldb,  /* ldb  * K */
                                           A(k, n), ldak, /* ldak * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                        }
                    }
                }
                /*
                 *  PlasmaRight / PlasmaUpper
                 */
                else {
                    for (k = 0; k < C.nt; k++) {
                        tempkn = k == C.nt-1 ? C.n-k*C.nb : C.nb;
                        ldak = BLKLDD(A, k);
                        zbeta = k == 0 ? beta : zone;
                        if (k < n) {
                            QUARK_CORE_dgemm(
                                plasma->quark, &task_flags,
                                PlasmaNoTrans, PlasmaNoTrans,
                                tempmm, tempnn, tempkn, A.mb,
                                alpha, B(m, k), ldb,  /* ldb  * K */
                                       A(k, n), ldak, /* ldak * Y */
                                zbeta, C(m, n), ldc); /* ldc  * Y */
                        }
                        else {
                            if (k == n) {
                                QUARK_CORE_dsymm(
                                    plasma->quark, &task_flags,
                                    side, uplo,
                                    tempmm, tempnn, A.mb,
                                    alpha, A(k, k), ldak, /* ldak * Y */
                                           B(m, k), ldb,  /* ldb  * Y */
                                    zbeta, C(m, n), ldc); /* ldc  * Y */
                            }
                            else {
                                QUARK_CORE_dgemm(
                                    plasma->quark, &task_flags,
                                    PlasmaNoTrans, PlasmaTrans,
                                    tempmm, tempnn, tempkn, A.mb,
                                    alpha, B(m, k), ldb,  /* ldb * K */
                                           A(n, k), lda,  /* lda * K */
                                    zbeta, C(m, n), ldc); /* ldc * Y */
                            }
                        }
                    }
                }
            }
        }
    }
}
예제 #20
0
/***************************************************************************//**
 *  Parallel UU' or L'L operation - dynamic scheduling
 **/
void plasma_pclauum_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldam;
    int tempkm, tempmm, tempnn;

    PLASMA_Complex32_t zone = (PLASMA_Complex32_t)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (m = 0; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            for(n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaConjTrans,
                    tempnn, tempmm, A.mb,
                    1.0, A(m, n), ldam,
                    1.0, A(n, n), A.mb);

                for(k = n+1; k < m; k++) {
                    tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaConjTrans, PlasmaNoTrans,
                        tempkm, tempnn, tempmm, A.mb,
                        zone, A(m, k), ldam,
                              A(m, n), ldam,
                        zone, A(k, n), A.mb);
                }
            }
            for (n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    tempmm, tempnn, A.mb,
                    zone, A(m, m), ldam,
                          A(m, n), ldam);
            }
            QUARK_CORE_clauum(
                plasma->quark, &task_flags,
                uplo,
                tempmm,
                A.mb, A(m, m), ldam);
        }
    }
    /*
     *  PlasmaUpper
     */
    else {
        for (m = 0; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            for (n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_cherk(
                    plasma->quark, &task_flags,
                    uplo, PlasmaNoTrans,
                    tempnn, tempmm, A.mb,
                    1.0, A(n, m), A.mb,
                    1.0, A(n, n), A.mb);

                for (k = n+1; k < m; k++){
                    tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                    QUARK_CORE_cgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaConjTrans,
                        tempnn, tempkm, tempmm, A.mb,
                        zone, A(n, m), A.mb,
                              A(k, m), A.mb,
                        zone, A(n, k), A.mb);
                }
            }
            for (n = 0; n < m; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_ctrmm(
                    plasma->quark, &task_flags,
                    PlasmaRight, uplo, PlasmaConjTrans, PlasmaNonUnit,
                    tempnn, tempmm, A.mb,
                    zone, A(m, m), ldam,
                          A(n, m), A.mb);
            }
            QUARK_CORE_clauum(
                plasma->quark, &task_flags,
                uplo,
                tempmm,
                A.mb, A(m, m), ldam);
        }
    }
}
예제 #21
0
/***************************************************************************//**
 *  Parallel tile LQ factorization - dynamic scheduling
 **/
void plasma_pcgelqf_quark(PLASMA_desc A, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (k = 0; k < min(A.mt, A.nt); k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);
        QUARK_CORE_cgelqt(
            plasma->quark, &task_flags,
            tempkm, tempkn, ib, T.nb,
            A(k, k), ldak,
            T(k, k), T.mb);

        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            QUARK_CORE_cunmlq(
                plasma->quark, &task_flags,
                PlasmaRight, PlasmaConjTrans,
                tempmm, tempkn, tempkn, ib, T.nb,
                A(k, k), ldak,
                T(k, k), T.mb,
                A(m, k), ldam);
        }
        for (n = k+1; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_CORE_ctslqt(
                plasma->quark, &task_flags,
                tempkm, tempnn, ib, T.nb,
                A(k, k), ldak,
                A(k, n), ldak,
                T(k, n), T.mb);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ctsmlq(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
                    A(m, k), ldam,
                    A(m, n), ldam,
                    A(k, n), ldak,
                    T(k, n), T.mb);
            }
        }
    }
}
예제 #22
0
/***************************************************************************//**
 *  Parallel construction of Q using tile V (application to identity;
 *  reduction Householder) - dynamic scheduling
 **/
void plasma_pzunglqrh_quark(PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS,
                            PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, N, RD, lastRD;
    int ldak;
    int ldqm;
    int tempkm, tempNn, tempnn, tempmm, tempNRDn, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    for (k = K-1; k >= 0; k--) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        ldak = BLKLDD(A, k);
        lastRD = 0;
        for (RD = BS; RD < A.nt-k; RD *= 2)
            lastRD = RD;
        for (RD = lastRD; RD >= BS; RD /= 2) {
            for (N = k;
                 N+RD < A.nt-1; /* No reduction with rightmost single-column subdomain */
                 N += 2*RD) {
                tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
                for (m = 0; m < Q.mt; m++) {
                    tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                    ldqm   = BLKLDD(Q, m   );
                    QUARK_CORE_zttmlq(
                                      plasma->quark, &task_flags,
                                      PlasmaRight, PlasmaNoTrans,
                                      tempmm, Q.nb, tempmm, tempNRDn,
                                      tempkm, ib, T.nb,
                                      Q (m, N   ), ldqm,
                                      Q (m, N+RD), ldqm,
                                      A (k, N+RD), ldak,
                                      T2(k, N+RD), T.mb); 
                }
            }
        }
        for (N = k;
             N < A.nt-1 || N == k;  /* No rightmost single-column */
             N += BS) {
            tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
            tempkmin = min(tempkm, tempNn);
            for (n = N+BS-1 == A.nt-2 ? A.nt-1 : min(N+BS-1, A.nt-1); /* Suck in rightmost single-column domain */
                 n >= N+1;
                 n--) {
                tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
                
                for (m = 0; m < Q.mt; m++) {
                    tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                    ldqm = BLKLDD(Q, m);
                    QUARK_CORE_ztsmlq(
                                      plasma->quark, &task_flags,
                                      PlasmaRight, PlasmaNoTrans,
                                      tempmm, Q.nb, tempmm, tempnn,
                                      tempkm, ib, T.nb,
                                      Q(m, N), ldqm,
                                      Q(m, n), ldqm,
                                      A(k, n), ldak,
                                      T(k, n), T.mb);
                }
            }
            for (m = 0; m < Q.mt; m++) {
                tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
                ldqm = BLKLDD(Q, m);
                QUARK_CORE_zunmlq(
                                  plasma->quark, &task_flags,
                                  PlasmaRight, PlasmaNoTrans,
                                  tempmm, tempNn, 
                                  tempkmin, ib, T.nb,
                                  A(k, N), ldak,
                                  T(k, N), T.mb,
                                  Q(m, N), ldqm);
            }
        }
    }
}
예제 #23
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pdsbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static double zone  = (double) 1.0;
    static double             dzero = (double) 0.0;
    double ztmp;
    double absztmp;
#endif

    double *C, *S;
    int N, NB, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol, mylastid, grnb, grid;
    int *DEP,*MAXID;
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }
    
    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = fabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = fabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  ( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = fabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        double *work, *TTau;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        TTau = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
        
        info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    DEP   = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    MAXID = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    C     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    memset(MAXID,0,(N+1)*sizeof(int));

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /* 
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( N < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = N;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    if( grsiz   == 0 ) grsiz   = 6;
    if( thgrsiz == 0 ) thgrsiz = N;

    i = shift/grsiz;
    stepercol =  i*grsiz == shift ? i:i+1;

    i       = (N-2)/thgrsiz;
    thgrnb  = i*thgrsiz == (N-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (N-2));
        for (i = stt; i <= N-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL:  dependency on the ID of the master of the group of the previous column.  (Previous Column:PCOL). */
                    /* ACOL:  dependency on the ID of the master of the previous group of my column.   (Acctual  Column:ACOL). (it is 0(NULL) for myid=1) */
                    /* MCOL:  OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
                    myid     = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL     = mylastid+shift-1;  /* to know the dependent ID of the previous column. need to know the master of its group */
                    MAXID[j] = myid;
                    PCOL     = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
                    grnb     = PCOL/grsiz;
                    grid     = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL     = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
                    ACOL     = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL     = myid;
                    
                    QUARK_CORE_dtrdalg(
                        plasma->quark, &task_flags,
                        uplo, N, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );
                    
                    if(mylastid%2 ==0){
                        blklastind      = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt      = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind      = colpt-NB+1;
                        edind      = min(colpt,N);
                        if( (stind>=edind-1) && (edind==N) )
                            blklastind=N;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (N-1))  stt=stt+1;
                } /* END for j=st:ed    */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2      */
    } /* END for thgrid=1:thgrnb     */
    
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    //printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = ( *A(i,i) );               
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = (*A(i+1, i));
          else
              E[i] = fabs( *A(i+1, i));
       }
        D[i] = ( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  ( *A(i,i) );               
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = (*A(i, (i+1)));
            else
                E[i] = fabs(*A(i, (i+1)));
        }
        D[i] = ( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
예제 #24
0
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - dynamic scheduling
 **/
void plasma_pspotrf_quark(PLASMA_enum uplo, PLASMA_desc A,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempmm;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    /*
     *  PlasmaLower
     */
    if (uplo == PlasmaLower) {
        for (k = 0; k < A.mt; k++) {
            tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
            ldak = BLKLDD(A, k);
            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaLower, tempkm, A.mb,
                A(k, k), ldak,
                sequence, request, A.nb*k);

            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                    tempmm, A.mb, A.mb,
                    zone, A(k, k), ldak,
                          A(m, k), ldam);
            }
            for (m = k+1; m < A.mt; m++) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaLower, PlasmaNoTrans,
                    tempmm, A.mb, A.mb,
                    -1.0, A(m, k), ldam,
                     1.0, A(m, m), ldam);

                for (n = k+1; n < m; n++) {
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaNoTrans, PlasmaTrans,
                        tempmm, A.mb, A.mb, A.mb,
                        mzone, A(m, k), ldam,
                               A(n, k), A.mb,
                        zone,  A(m, n), ldam);
                }
            }
        }
    }
    /*
     *  PlasmaUpper
     */
    else {
        for (k = 0; k < A.nt; k++) {
            tempkm = k == A.nt-1 ? A.n-k*A.nb : A.nb;
            ldak = BLKLDD(A, k);
            QUARK_CORE_spotrf(
                plasma->quark, &task_flags,
                PlasmaUpper,
                tempkm, A.mb,
                A(k, k), ldak,
                sequence, request, A.nb*k);

            for (m = k+1; m < A.nt; m++) {
                tempmm = m == A.nt-1 ? A.n-m*A.nb : A.nb;
                QUARK_CORE_strsm(
                    plasma->quark, &task_flags,
                    PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                    A.nb, tempmm, A.mb,
                    zone, A(k, k), ldak,
                          A(k, m), ldak);
            }
            for (m = k+1; m < A.nt; m++) {
                tempmm = m == A.nt-1 ? A.n-m*A.nb : A.nb;
                ldam = BLKLDD(A, m);
                QUARK_CORE_ssyrk(
                    plasma->quark, &task_flags,
                    PlasmaUpper, PlasmaTrans,
                    tempmm, A.mb, A.mb,
                    -1.0, A(k, m), ldak,
                     1.0, A(m, m), ldam);

                for (n = k+1; n < m; n++) {
                    QUARK_CORE_sgemm(
                        plasma->quark, &task_flags,
                        PlasmaTrans, PlasmaNoTrans,
                        A.mb, tempmm, A.mb, A.mb,
                        mzone, A(k, n), ldak,
                               A(k, m), ldak,
                        zone,  A(n, m), A.mb);
                }
            }
        }
    }
}
예제 #25
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - QR factorization (reduction Householder)
 *  - dynamic scheduling
 **/
void plasma_pcunmqrrh_quark(PLASMA_enum side, PLASMA_enum trans,
        PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS,
        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int K, M, RD, lastRD;
    int ldaM, ldam, ldan, ldaMRD;
    int ldbM, ldbm, ldbMRD;
    int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    K = min(A.mt, A.nt);

    if (side == PlasmaLeft ) {
        if (trans == PlasmaConjTrans) {
            /*
             *  PlasmaLeft / PlasmaConjTrans
             */
            for (k = 0; k < K; k++) {
                tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                for (M = k;
                        M < A.mt-1 || M == k;  /*  No bottom single-row subdomain */
                        M += BS) {
                    tempMm   = M == A.mt-1 ? A.m-M*A.mb : A.mb;
                    tempkmin = min(tempMm, tempkn);
                    ldaM = BLKLDD(A, M);
                    ldbM = BLKLDD(B, M);
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_cunmqr(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempMm, tempnn,
                                tempkmin, ib, T.nb,
                                A(M, k), ldaM,
                                T(M, k), T.mb,
                                B(M, n), ldbM);
                    }
                    for (m = M+1;
                            (m < M+BS && m < A.mt) || m == A.mt-1; /*  Suck in bottom single-row domain */
                            m++) {
                        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                        ldbm = BLKLDD(B, m);
                        ldam = BLKLDD(A, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ctsmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    A.nb, tempnn, tempmm, tempnn,
                                    tempkn, ib, T.nb,
                                    B(M, n), ldbM,
                                    B(m, n), ldbm,
                                    A(m, k), ldam,
                                    T(m, k), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.mt-k; RD *= 2) {
                    for (M = k;
                            M+RD < A.mt-1; /*  No reduction with bottom single-row subdomain */
                            M += 2*RD) {
                        tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                        ldbM   = BLKLDD(B, M   );
                        ldbMRD = BLKLDD(B, M+RD);
                        ldaMRD = BLKLDD(A, M+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_cttmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    A.nb, tempnn, tempMRDm, tempnn,
                                    tempkn, ib, T.nb,
                                    B (M,    n), ldbM,
                                    B (M+RD, n), ldbMRD,
                                    A (M+RD, k), ldaMRD,
                                    T2(M+RD, k), T.mb);
                        }
                    }
                }
            }
        } else {
            /*
             *  PlasmaLeft / PlasmaNoTrans
             */
            for (k = K-1; k >= 0; k--) {
                tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                lastRD = 0;
                for (RD = BS; RD < A.mt-k; RD *= 2)
                    lastRD = RD;
                for (RD = lastRD; RD >= BS; RD /= 2) {
                    for (M = k;
                            M+RD < A.mt-1; /*  No reduction with bottom single-row subdomain */
                            M += 2*RD) {
                        tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                        ldbM   = BLKLDD(B, M   );
                        ldbMRD = BLKLDD(B, M+RD);
                        ldaMRD = BLKLDD(A, M+RD);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_cttmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    A.nb, tempnn, tempMRDm, tempnn,
                                    tempkn, ib, T.nb,
                                    B (M,    n), ldbM,
                                    B (M+RD, n), ldbMRD,
                                    A (M+RD, k), ldaMRD,
                                    T2(M+RD, k), T.mb);
                        }
                    }
                }
                for (M = k;
                        M < A.mt-1 || M == k;  /*  No bottom single-row subdomain */
                        M += BS) {
                    tempMm   = M == A.mt-1 ? A.m-M*A.mb : A.mb;
                    tempkmin = min(tempMm, tempkn);
                    ldaM = BLKLDD(A, M);
                    ldbM = BLKLDD(B, M);
                    for (m = M+BS-1 == A.mt-2 ? A.mt-1 : min(M+BS-1, A.mt-1); /*  Suck in bottom single-row domain */
                            m >= M+1;
                            m--) {
                        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                        ldbm = BLKLDD(B, m);
                        ldam = BLKLDD(A, m);
                        for (n = 0; n < B.nt; n++) {
                            tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                            QUARK_CORE_ctsmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    A.nb, tempnn, tempmm, tempnn,
                                    tempkn, ib, T.nb,
                                    B(M, n), ldbM,
                                    B(m, n), ldbm,
                                    A(m, k), ldam,
                                    T(m, k), T.mb);
                        }
                    }
                    for (n = 0; n < B.nt; n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        QUARK_CORE_cunmqr(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempMm, tempnn,
                                tempkmin, ib, T.nb,
                                A(M, k), ldaM,
                                T(M, k), T.mb,
                                B(M, n), ldbM);
                    }
                }
            }
        }
    } else {
        if (trans == PlasmaConjTrans) {
            /*
             *  PlasmaRight / PlasmaConjTrans
             */
              for (k = K-1; k >= 0; k--) {
                  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                  lastRD = 0;
                  for (RD = BS; RD < A.mt-k; RD *= 2)
                      lastRD = RD;
                  for (RD = lastRD; RD >= BS; RD /= 2) {
                      for (M = k;
                              M+RD < A.mt-1; /*  No reduction with bottom single-row subdomain */
                              M += 2*RD) {
                          tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                          ldaMRD = BLKLDD(A, M+RD);
                          for (m = 0; m < B.mt; m++) {
                              ldbm   = BLKLDD(B, m);
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              QUARK_CORE_cttmqr(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, B.nb, tempmm, tempMRDm,
                                      tempkn, ib, T.nb,
                                      B (m, M), ldbm,
                                      B (m, M+RD), ldbm,
                                      A (M+RD, k), ldaMRD,
                                      T2(M+RD, k), T.mb);
                          }
                      }
                  }
                  for (M = k;
                          M < A.mt-1 || M == k;  /*  No bottom single-row subdomain */
                          M += BS) {
                      tempMm   = M == A.mt-1 ? A.m-M*A.mb : A.mb;
                      tempkmin = min(tempMm, tempkn);
                      ldaM = BLKLDD(A, M);
                      ldbM = BLKLDD(B, M);
                      for (n = M+BS-1 == A.mt-2 ? A.mt-1 : min(M+BS-1, A.mt-1); /*  Suck in bottom single-row domain */
                              n >= M+1;
                              n--) {
                          ldan = BLKLDD(A, n);
                          tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                          for (m = 0; m < B.mt; m++) {
                              ldbm = BLKLDD(B, m);
                              tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                              QUARK_CORE_ctsmqr(
                                      plasma->quark, &task_flags,
                                      side, trans,
                                      tempmm, tempMm, tempmm, tempnn,
                                      tempkn, ib, T.nb,
                                      B(m, M), ldbm,
                                      B(m, n), ldbm,
                                      A(n, k), ldan,
                                      T(n, k), T.mb);
                          }
                      }
                      for (m = 0; m < B.mt; m++) {
                          ldbm = BLKLDD(B, m);
                          tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                          QUARK_CORE_cunmqr(
                                  plasma->quark, &task_flags,
                                  side, trans,
                                  tempmm, tempMm,
                                  tempkmin, ib, T.nb,
                                  A(M, k), ldaM,
                                  T(M, k), T.mb,
                                  B(m, M), ldbm);
                      }
                  }
              }
        } else {
            /*
             *  PlasmaRight / PlasmaNoTrans
             */
            for (k = 0; k < K; k++) {
                tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                for (M = k;
                        M < A.mt-1 || M == k;  /*  No bottom single-row subdomain */
                        M += BS) {
                    tempMm   = M == A.mt-1 ? A.m-M*A.mb : A.mb;
                    tempkmin = min(tempMm, tempkn);
                    ldaM = BLKLDD(A, M);
                    for (m = 0; m < B.mt; m++) {
                        ldbm = BLKLDD(B, m);
                        tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                        QUARK_CORE_cunmqr(
                                plasma->quark, &task_flags,
                                side, trans,
                                tempmm, tempMm,
                                tempkmin, ib, T.nb,
                                A(M, k), ldaM,
                                T(M, k), T.mb,
                                B(m, M), ldbm);
                    }
                    for (n = M+1;
                            (n < M+BS && n < A.mt) || n == A.mt-1; /*  Suck in bottom single-row domain */
                            n++) {
                        tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
                        ldan = BLKLDD(A, n);
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm = BLKLDD(B, m);
                            QUARK_CORE_ctsmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, tempMm, tempmm, tempnn,
                                    tempkn, ib, T.nb,
                                    B(m, M), ldbm,
                                    B(m, n), ldbm,
                                    A(n, k), ldan,
                                    T(n, k), T.mb);
                        }
                    }
                }
                for (RD = BS; RD < A.mt-k; RD *= 2) {
                    for (M = k;
                            M+RD < A.mt-1; /*  No reduction with bottom single-row subdomain */
                            M += 2*RD) {
                        tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
                        ldaMRD = BLKLDD(A, M+RD);
                        for (m = 0; m < B.mt; m++) {
                            tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
                            ldbm   = BLKLDD(B, m);
                            QUARK_CORE_cttmqr(
                                    plasma->quark, &task_flags,
                                    side, trans,
                                    tempmm, B.nb, tempmm, tempMRDm,
                                    tempkn, ib, T.nb,
                                    B (m, M   ), ldbm,
                                    B (m, M+RD), ldbm,
                                    A (M+RD, k), ldaMRD,
                                    T2(M+RD, k), T.mb);
                        }
                    }
                }
            }
        }
    }
}
예제 #26
0
/***************************************************************************//**
 *  Parallel tile Hermitian rank-k update - dynamic scheduling
 **/
void plasma_pcsyr2k_quark(PLASMA_enum uplo, PLASMA_enum trans,
                          PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B,
                          PLASMA_Complex32_t beta,  PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n, k;
    int ldak, ldam, ldan, ldcm, ldcn;
    int ldbk, ldbm, ldbn;
    int tempnn, tempmm, tempkn, tempkm;

    PLASMA_Complex32_t zone   = (PLASMA_Complex32_t)1.0;
    PLASMA_Complex32_t zbeta;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (n = 0; n < C.nt; n++) {
        tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
        ldan = BLKLDD(A, n);
        ldbn = BLKLDD(B, n);
        ldcn = BLKLDD(C, n);
        /*
         *  PlasmaNoTrans
         */
        if (trans == PlasmaNoTrans) {
            for (k = 0; k < A.nt; k++) {
                tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                zbeta = k == 0 ? beta : zone;
                QUARK_CORE_csyr2k(
                    plasma->quark, &task_flags,
                    uplo, trans,
                    tempnn, tempkn, A.mb,
                    alpha, A(n, k), ldan, /* ldan * K */
                           B(n, k), ldbn,
                    zbeta, C(n, n), ldcn); /* ldc  * N */
            }
            /*
             *  PlasmaNoTrans / PlasmaLower
             */
            if (uplo == PlasmaLower) {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    ldcm = BLKLDD(C, m);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,  /* ldam * K */
                                   B(n, k), ldbn,  /* ldan * K */
                            zbeta, C(m, n), ldcm); /* ldc  * N */

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, B(m, k), ldbm,  /* ldam * K */
                                   A(n, k), ldan,  /* ldan * K */
                            zone,  C(m, n), ldcm); /* ldc  * N */
                    }
                }
            }
            /*
             *  PlasmaNoTrans / PlasmaUpper
             */
            else {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn, A.mb,
                            alpha, A(n, k), ldan,  /* ldan * K */
                                   B(m, k), ldbm,  /* ldam * M */
                            zbeta, C(n, m), ldcn); /* ldc  * M */

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaTrans,
                            tempnn, tempmm, tempkn, A.mb,
                            alpha, B(n, k), ldan,  /* ldan * K */
                                   A(m, k), ldam,  /* ldam * M */
                            zone,  C(n, m), ldcn); /* ldc  * M */
                    }
                }
            }
        }
        /*
         *  Plasma[Conj]Trans
         */
        else {
            for (k = 0; k < A.mt; k++) {
                tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                zbeta = k == 0 ? beta : zone;
                QUARK_CORE_csyr2k(
                    plasma->quark, &task_flags,
                    uplo, trans,
                    tempnn, tempkm, A.mb,
                    alpha, A(k, n), ldak,  /* lda * N */
                           B(k, n), ldbk,
                    zbeta, C(n, n), ldcn); /* ldc * N */
            }
            /*
             *  Plasma[Conj]Trans / PlasmaLower
             */
            if (uplo == PlasmaLower) {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    ldcm = BLKLDD(C, m);
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,  /* lda * M */
                                   B(k, n), ldbk,  /* lda * N */
                            zbeta, C(m, n), ldcm); /* ldc * N */

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, B(k, m), ldbk,  /* lda * M */
                                   A(k, n), ldak,  /* lda * N */
                            zone,  C(m, n), ldcm); /* ldc * N */
                    }
                }
            }
            /*
             *  Plasma[Conj]Trans / PlasmaUpper
             */
            else {
                for (m = n+1; m < C.mt; m++) {
                    tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm, A.mb,
                            alpha, A(k, n), ldak,  /* lda * K */
                                   B(k, m), ldbk,  /* lda * M */
                            zbeta, C(n, m), ldcn); /* ldc * M */

                        QUARK_CORE_cgemm(
                            plasma->quark, &task_flags,
                            trans, PlasmaNoTrans,
                            tempnn, tempmm, tempkm, A.mb,
                            alpha, B(k, n), ldbk,  /* lda * K */
                                   A(k, m), ldak,  /* lda * M */
                            zone,  C(n, m), ldcn); /* ldc * M */
                    }
                }
            }
        }
    }
}
예제 #27
0
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    PLASMA_Complex64_t *A, *A2 = NULL;
    real_Double_t       t;
    int                *ipiv, *ipiv2 = NULL;
    int i;
    int m     = iparam[TIMING_N];
    int n     = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda   = m;
    PLASMA_sequence *sequence = NULL;
    PLASMA_request request = PLASMA_REQUEST_INITIALIZER;

    /* Initialize Plasma */ 
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );

    PLASMA_Disable(PLASMA_AUTOTUNING);
    PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
    PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );

    /* Allocate Data */
    A  = (PLASMA_Complex64_t *)malloc(lda*n*sizeof(PLASMA_Complex64_t));

    /* Check if unable to allocate memory */
    if ( (! A) ) {
        printf("Out of Memory \n ");
        return -1;
    }

    /* Initialiaze Data */
    LAPACKE_zlarnv_work(1, ISEED, lda*n, A);

    /* Allocate Workspace */
    ipiv  = (int *)malloc( n*sizeof(int) );

    /* Save A in lapack layout for check */
    if ( check ) {
        A2 = (PLASMA_Complex64_t *)malloc(lda*n*sizeof(PLASMA_Complex64_t));
        ipiv2 = (int *)malloc( n*sizeof(int) );
        LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', m, n, A, lda, A2, lda);
    
        LAPACKE_zgetrf_work(LAPACK_COL_MAJOR, m, n, A2, lda, ipiv2 );
    }

    plasma = plasma_context_self();
    PLASMA_Sequence_Create(&sequence);
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
    QUARK_Task_Flag_Set(&task_flags, TASK_THREAD_COUNT, iparam[TIMING_THRDNBR] );

    plasma_dynamic_spawn();
    CORE_zgetrf_reclap_init();

    t = -cWtime();
    QUARK_CORE_zgetrf_reclap(plasma->quark, &task_flags,
                             m, n, n,
                             A, lda, ipiv,
                             sequence, &request,
                             0, 0,
                             iparam[TIMING_THRDNBR]);
    PLASMA_Sequence_Wait(sequence);
    t += cWtime();
    *t_ = t;
    
    PLASMA_Sequence_Destroy(sequence);

    /* Check the solution */
    if ( check )
    {
        double *work = (double *)malloc(max(m,n)*sizeof(double));

        /* Check ipiv */
        for(i=0; i<n; i++)
        {
            if( ipiv[i] != ipiv2[i] ) {
                fprintf(stderr, "\nPLASMA (ipiv[%d] = %d, A[%d] = %e) / LAPACK (ipiv[%d] = %d, A[%d] = [%e])\n",
                        i, ipiv[i],  i, creal(A[  i * lda + i ]), 
                        i, ipiv2[i], i, creal(A2[ i * lda + i ])); 
                break;
            }
        }

        dparam[TIMING_ANORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), 
                                                   m, n, A, lda, work);
        dparam[TIMING_XNORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), 
                                                   m, n, A2, lda, work);
        dparam[TIMING_BNORM] = 0.0;

        CORE_zaxpy( m, n, -1.0, A, lda, A2, lda);

        dparam[TIMING_RES] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), 
                                                 m, n, A2, lda, work);

        free( A2 );
        free( ipiv2 );
        free( work );
    }
    
    free( A  );
    free( ipiv );
    PLASMA_Finalize();

    return 0;
}
예제 #28
0
/***************************************************************************//**
 *
 **/
void plasma_pdlacpy_quark(PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int X, Y;
    int m, n;
    int ldam, ldbm;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    switch (uplo) {
    /*
     *  PlasmaUpper
     */
    case PlasmaUpper:
        for (m = 0; m < A.mt; m++) {
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            if (m < A.nt) {
                Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpper,
                    X, Y, A.mb,
                    A(m, m), ldam,
                    B(m, m), ldbm);
            }
            for (n = m+1; n < A.nt; n++) {
                Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    X, Y, A.mb,
                    A(m, n), ldam,
                    B(m, n), ldbm);
            }
        }
        break;
    /*
     *  PlasmaLower
     */
    case PlasmaLower:
        for (m = 0; m < A.mt; m++) {
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            if (m < A.nt) {
                Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaLower,
                    X, Y, A.mb,
                    A(m, m), ldam,
                    B(m, m), ldbm);
            }
            for (n = 0; n < min(m, A.nt); n++) {
                Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    X, Y, A.mb,
                    A(m, n), ldam,
                    B(m, n), ldbm);
            }
        }
        break;
    /*
     *  PlasmaUpperLower
     */
    case PlasmaUpperLower:
    default:
        for (m = 0; m < A.mt; m++) {
            X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            ldbm = BLKLDD(B, m);
            for (n = 0; n < A.nt; n++) {
                Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_dlacpy(
                    plasma->quark, &task_flags,
                    PlasmaUpperLower,
                    X, Y, A.mb,
                    A(m, n), ldam,
                    B(m, n), ldbm);
            }
        }
    }
}
예제 #29
0
/***************************************************************************//**
 *  Parallel tile matrix-matrix multiplication - dynamic scheduling
 **/
void plasma_pzgemm_quark(PLASMA_enum transA, PLASMA_enum transB,
                         PLASMA_Complex64_t alpha, PLASMA_desc A, PLASMA_desc B,
                         PLASMA_Complex64_t beta,  PLASMA_desc C,
                         PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n, k;
    int ldam, ldak, ldbn, ldbk, ldcm;
    int tempmm, tempnn, tempkn, tempkm;

    PLASMA_Complex64_t zbeta;
    PLASMA_Complex64_t zone = (PLASMA_Complex64_t)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < C.mt; m++) {
        tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
        ldcm = BLKLDD(C, m);
        for (n = 0; n < C.nt; n++) {
            tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
            /*
             *  A: PlasmaNoTrans / B: PlasmaNoTrans
             */
            if (transA == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                if (transB == PlasmaNoTrans) {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_zgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,  /* lda * Z */
                                   B(k, n), ldbk,  /* ldb * Y */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
                /*
                 *  A: PlasmaNoTrans / B: Plasma[Conj]Trans
                 */
                else {
                    ldbn = BLKLDD(B, n);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_zgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,  /* lda * Z */
                                   B(n, k), ldbn,  /* ldb * Z */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
            }
            /*
             *  A: Plasma[Conj]Trans / B: PlasmaNoTrans
             */
            else {
                if (transB == PlasmaNoTrans) {
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_zgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,  /* lda * X */
                                   B(k, n), ldbk,  /* ldb * Y */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
                /*
                 *  A: Plasma[Conj]Trans / B: Plasma[Conj]Trans
                 */
                else {
                    ldbn = BLKLDD(B, n);
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_zgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,  /* lda * X */
                                   B(n, k), ldbn,  /* ldb * Z */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
            }
        }
    }
}
예제 #30
0
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling
 **/
void plasma_psgetrf_incpiv_quark(PLASMA_desc A, PLASMA_desc L, int *IPIV,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int k, m, n;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (k = 0; k < min(A.mt, A.nt); k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);
        QUARK_CORE_sgetrf_incpiv(
            plasma->quark, &task_flags,
            tempkm, tempkn, ib, L.nb,
            A(k, k), ldak, IPIV(k, k),
            sequence, request,
            k == A.mt-1, A.nb*k);

        for (n = k+1; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_CORE_sgessm(
                plasma->quark, &task_flags,
                tempkm, tempnn, tempkm, ib, L.nb,
                IPIV(k, k),
                A(k, k), ldak,
                A(k, n), ldak);
        }
        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);
            QUARK_CORE_ststrf(
                plasma->quark, &task_flags,
                tempmm, tempkn, ib, L.nb,
                A(k, k), ldak,
                A(m, k), ldam,
                L(m, k), L.mb,
                IPIV(m, k),
                sequence, request,
                m == A.mt-1, A.nb*k);

            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_sssssm(
                    plasma->quark, &task_flags,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib, L.nb,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
            }
        }
    }
}