Exemplo n.º 1
0
int main(int argc, char *argv[]) {
  double t1, t2;
  Node root;
  StealStack *ss;

  /* initialize stealstacks and comm. layer */
  ss = ss_init(&argc, &argv);

  /* determine benchmark parameters */
  uts_parseParams(argc, argv);

  /* Initialize trace collection structures */
  ss_initStats(ss);
 
  /* show parameter settings */
  if (ss_get_thread_num() == 0) {
      uts_printParams();
  }
  
  fflush(NULL);

  // Workers will return 1 from ss_start(), all others (managers)
  // will return 0 here once the computation ends
  if (ss_start(sizeof(Node), chunkSize)) {

      /* initialize root node and push on thread 0 stack */
      if (ss_get_thread_num() == 0) {
          uts_initRoot(&root, type);
#ifdef TRACE
	  ss_markSteal(ss, 0); // first session is own "parent session"
#endif
          ss_put_work(ss, &root);
      }
  
      /* time parallel search */
      t1 = uts_wctime();
      parTreeSearch(ss);
      t2 = uts_wctime();
      ss->walltime = t2 - t1;
#ifdef TRACE
      ss->startTime = t1;
      ss->sessionRecords[SS_IDLE][ss->entries[SS_IDLE] - 1].endTime = t2;
#endif
  }

  ss_stop();

  /* display results */
  showStats();

  ss_finalize();

  return 0;
}
Exemplo n.º 2
0
/***************************************************************************//**
 *  Parallel tile LU factorization - static scheduling
 **/
void plasma_pzgetrf_incpiv(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int info;
    int tempkn, tempkm, tempmm, tempnn;
    int ib = PLASMA_IB;
    PLASMA_Complex64_t *work;

    plasma_unpack_args_5(A, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
    ss_init(A.mt, A.nt, -1);

    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt) {
        k++;
        n = n-A.nt+k;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            if (m == k) {
                ss_cond_wait(k, k, k-1);
                CORE_zgetrf_incpiv(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    IPIV(k, k), &info);
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(k, k, k);
            }
            else {
                ss_cond_wait(m, k, k-1);
                CORE_ztstrf(
                    tempmm, tempkn, ib, A.nb,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    L(m, k), L.mb,
                    IPIV(m, k),
                    work, L.nb, &info);
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(m, k, k);
            }
        }
        else {
            if (m == k) {
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_zgessm(
                    tempkm, tempnn, tempkm, ib,
                    IPIV(k, k),
                    A(k, k), ldak,
                    A(k, n), ldak);
            }
            else {
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_zssssm(
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}
Exemplo n.º 3
0
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - static scheduling
 **/
void plasma_pzunmlq(plasma_context_t *plasma)
{
    PLASMA_enum side;
    PLASMA_enum trans;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldbk, ldbm;
    int tempmm, tempnn, tempkm, tempkmin;
    int minMT, minM;
    int ib = PLASMA_IB;
    PLASMA_Complex64_t *work;

    plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    if (side != PlasmaLeft) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }
    if (trans != PlasmaConjTrans) {
        plasma_request_fail(sequence, request, PLASMA_ERR_NOT_SUPPORTED);
        return;
    }

    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    ss_init(B.mt, B.nt, min(A.mt, A.nt));

    if (A.m > A.n) {
        minM  = A.n;
        minMT = A.nt;
    } else {
        minM  = A.m;
        minMT = A.mt;
    }

    k = minMT-1;
    n = PLASMA_RANK;
    while (n >= B.nt) {
        k--;
        n = n-B.nt;
    }
    m = B.mt-1;

    while (k >= 0 && n < B.nt) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_m--;
        if (next_m == k-1) {
            next_n += PLASMA_SIZE;
            while (next_n >= B.nt && next_k >= 0) {
                next_k--;
                next_n = next_n-B.nt;
            }
            next_m = B.mt-1;
        }

        tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
        tempkm   = k == B.mt-1 ? B.m-k*B.mb : B.mb;
        tempnn   = n == B.nt-1 ? B.n-n*B.nb : B.nb;
        tempmm   = m == B.mt-1 ? B.m-m*B.mb : B.mb;

        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldbm = BLKLDD(B, m);

        if (m == k) {
            CORE_zunmlq(
                    side, trans,
                    tempkm, tempnn, tempkmin, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    B(k, n), ldbk,
                    work, T.nb);
            ss_cond_set(k, n, k);
        }
        else {
            ss_cond_wait(m, n, k+1);
            CORE_ztsmlq(
                    side, trans,
                    A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
                    B(k, n), ldbk,
                    B(m, n), ldbm,
                    A(k, m), ldak,
                    T(k, m), T.mb,
                    work, ib);
            ss_cond_set(m, n, k);
        }
        m = next_m;
        n = next_n;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}
Exemplo n.º 4
0
/***************************************************************************//**
 *  Parallel tile LQ factorization - static scheduling
 **/
void plasma_pcgelqf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;
    int ib = PLASMA_IB;
    PLASMA_Complex32_t *work, *tau;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    k = 0;
    m = PLASMA_RANK;
    while (m >= A.mt) {
        k++;
        m = m-A.mt+k;
    }
    n = k;

    while (k < min(A.mt, A.nt) && m < A.mt) {
        next_m = m;
        next_n = n;
        next_k = k;

        next_n++;
        if (next_n == A.nt) {
            next_m += PLASMA_SIZE;
            while (next_m >= A.mt && next_k < min(A.nt, A.mt)) {
                next_k++;
                next_m = next_m-A.mt+next_k;
            }
            next_n = next_k;
        }

        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (m == k) {
            if (n == k) {
                ss_cond_wait(k, k, k-1);
                CORE_cgelqt(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    tau, work);
                ss_cond_set(k, k, k);
            }
            else {
                ss_cond_wait(k, n, k-1);
                CORE_ctslqt(
                    tempkm, tempnn, ib,
                    A(k, k), ldak,
                    A(k, n), ldak,
                    T(k, n), T.mb,
                    tau, work);
                ss_cond_set(k, n, k);
            }
        }
        else {
            if (n == k) {
                ss_cond_wait(k, k, k);
                ss_cond_wait(m, k, k-1);
                CORE_cunmlq(
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, tempkn, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    A(m, k), ldam,
                    work, T.nb);
            }
            else {
                ss_cond_wait(k, n, k);
                ss_cond_wait(m, n, k-1);
                CORE_ctsmlq(
                    PlasmaRight, PlasmaConjTrans,
                    tempmm, A.nb, tempmm, tempnn, A.nb, ib,
                    A(m, k), ldam,
                    A(m, n), ldam,
                    A(k, n), ldak,
                    T(k, n), T.mb,
                    work, T.nb);
                ss_cond_set(m, n, k);
            }
        }
        m = next_m;
        n = next_n;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
Exemplo n.º 5
0
/***************************************************************************//**
 *  Parallel tile QR factorization - static scheduling
 **/
void plasma_pdgeqrf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam;
    int tempkm, tempkn, tempnn, tempmm;
    int ib = PLASMA_IB;
    double *work, *tau;

    plasma_unpack_args_4(A, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
    tau  = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
    ss_init(A.mt, A.nt, -1);

    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt) {
        k++;
        n = n-A.nt+k;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
        tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;

        ldak = BLKLDD(A, k);
        ldam = BLKLDD(A, m);

        if (n == k) {
            if (m == k) {
                ss_cond_wait(k, k, k-1);
                CORE_dgeqrt(
                    tempkm, tempkn, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    tau, work);
                ss_cond_set(k, k, k);
            }
            else {
                ss_cond_wait(m, k, k-1);
                CORE_dtsqrt(
                    tempmm, tempkn, ib,
                    A(k, k), ldak,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    tau, work);
                ss_cond_set(m, k, k);
            }
        }
        else {
            if (m == k) {
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_dormqr(
                    PlasmaLeft, PlasmaTrans,
                    tempkm, tempnn, tempkm, ib,
                    A(k, k), ldak,
                    T(k, k), T.mb,
                    A(k, n), ldak,
                    work, T.nb);
            }
            else {
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_dtsmqr(
                    PlasmaLeft, PlasmaTrans,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    A(m, k), ldam,
                    T(m, k), T.mb,
                    work, ib);
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    plasma_private_free(plasma, tau);
    ss_finalize();
}
Exemplo n.º 6
0
/***************************************************************************//**
 *  Parallel forward substitution for tile LU - static scheduling
 **/
void plasma_pztrsmpl(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldbk, ldam, ldbm;
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int ib;

    plasma_unpack_args_6(A, B, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(B.mt, B.nt, -1);

    ib = PLASMA_IB;
    k = 0;
    n = PLASMA_RANK;
    while (n >= B.nt) {
        k++;
        n = n-B.nt;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < B.nt) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            next_n += PLASMA_SIZE;
            while (next_n >= B.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-B.nt;
            }
            next_m = next_k;
        }

        tempkm   = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn   = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempkmin = k == min(A.mt, A.nt)-1 ? min(A.m, A.n)-k*A.mb : A.mb;
        tempnn   = n == B.nt-1 ? B.n-n*B.nb : B.nb;
        tempmm   = m == A.mt-1 ? A.m-m*A.mb : A.mb;

        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        if (m == k) {
            ss_cond_wait(k, n, k-1);
            CORE_zgessm(
                tempkm, tempnn, tempkmin, ib,
                IPIV(k, k),
                A(k, k), ldak,
                B(k, n), ldbk);
            ss_cond_set(k, n, k);
        }
        else {
            ss_cond_wait(m, n, k-1);
            CORE_zssssm(
                A.nb, tempnn, tempmm, tempnn, tempkn, ib,
                B(k, n), ldbk,
                B(m, n), ldbm,
                L(m, k), L.mb,
                A(m, k), ldam,
                IPIV(m, k));
            ss_cond_set(m, n, k);
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    ss_finalize();
}
Exemplo n.º 7
0
/***************************************************************************//**
 *  Parallel tile Cholesky factorization - static scheduling
 **/
void plasma_pspotrf(plasma_context_t *plasma)
{
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int ldak, ldam, ldan;
    int info;
    int tempkn, tempmn;

    float zone  = (float) 1.0;
    float mzone = (float)-1.0;

    plasma_unpack_args_4(uplo, A, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    ss_init(A.nt, A.nt, 0);

    k = 0;
    m = PLASMA_RANK;
    while (m >= A.nt) {
        k++;
        m = m-A.nt+k;
    }
    n = 0;

    while (k < A.nt && m < A.nt && !ss_aborted()) {
        next_n = n;
        next_m = m;
        next_k = k;

        next_n++;
        if (next_n > next_k) {
            next_m += PLASMA_SIZE;
            while (next_m >= A.nt && next_k < A.nt) {
                next_k++;
                next_m = next_m-A.nt+next_k;
            }
            next_n = 0;
        }

        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        tempmn = m == A.nt-1 ? A.n-m*A.nb : A.nb;

        ldak = BLKLDD(A, k);
        ldan = BLKLDD(A, n);
        ldam = BLKLDD(A, m);

        if (m == k) {
            if (n == k) {
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_spotrf(
                        PlasmaLower,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_spotrf(
                        PlasmaUpper,
                        tempkn,
                        A(k, k), ldak,
                        &info);
                }
                if (info != 0) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(k, k, 1);
            }
            else {
                ss_cond_wait(k, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_ssyrk(
                         PlasmaLower, PlasmaNoTrans,
                         tempkn, A.nb,
                         -1.0, A(k, n), ldak,
                          1.0, A(k, k), ldak);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_ssyrk(
                         PlasmaUpper, PlasmaTrans,
                         tempkn, A.nb,
                         -1.0, A(n, k), ldan,
                          1.0, A(k, k), ldak);
                }
            }
        }
        else {
            if (n == k) {
                ss_cond_wait(k, k, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_strsm(
                        PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                        tempmn, A.nb,
                        zone, A(k, k), ldak,
                              A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_strsm(
                        PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                        A.nb, tempmn,
                        zone, A(k, k), ldak,
                              A(k, m), ldak);
                }
                ss_cond_set(m, k, 1);
            }
            else {
                ss_cond_wait(k, n, 1);
                ss_cond_wait(m, n, 1);
                /*
                 *  PlasmaLower
                 */
                if (uplo == PlasmaLower) {
                    CORE_sgemm(
                        PlasmaNoTrans, PlasmaTrans,
                        tempmn, A.nb, A.nb,
                        mzone, A(m, n), ldam,
                               A(k, n), ldak,
                         zone, A(m, k), ldam);
                }
                /*
                 *  PlasmaUpper
                 */
                else {
                    CORE_sgemm(
                        PlasmaTrans, PlasmaNoTrans,
                        A.nb, tempmn, A.nb,
                        mzone, A(n, k), ldan,
                               A(n, m), ldan,
                         zone, A(k, m), ldak);
                }
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    ss_finalize();
}