Beispiel #1
// Starts the Qthreads runtime and checks that its shepherd/worker layout
// matches the requested thread_count.
//
// thread_count: number of workers the caller expects. Initialization counts as
// successful only if qthread_initialize() returns QTHREAD_SUCCESS, thread_count
// equals both (shepherds * workers-per-shepherd) and qthread_num_workers(), and
// every shepherd reports the same local worker count.
//
// On failure the cached Impl::s_number_* counters are reset to zero,
// qthread_finalize() is called if the runtime had been started, and
// Kokkos::Impl::throw_runtime_exception() is invoked with a diagnostic message.
//
// NOTE(review): block braces are not visible in this excerpt, `buffer` is never
// referenced, and the trailing comment has no statement after it -- lines
// appear to be missing; confirm against the upstream file.
void Qthread::initialize( int thread_count )
  // Environment variable: QTHREAD_NUM_SHEPHERDS
  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
  // Environment variable: QTHREAD_HWPAR

    // NOTE(review): unused in the visible lines.
    char buffer[256];

  // The runtime must start and report exactly thread_count workers, both as
  // shepherds * workers-per-shepherd and as the global worker count.
  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
                       ( thread_count    == qthread_num_workers() );

  bool ok_symmetry = true ;

  if ( ok_init ) {
    // Cache the layout reported by the runtime.
    Impl::s_number_shepherds            = qthread_num_shepherds();
    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;

    // Every shepherd must own the same number of workers; stop at the first mismatch.
    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );

  if ( ! ok_init || ! ok_symmetry ) {
    // Build a diagnostic describing what the runtime actually reported.
    std::ostringstream msg ;

    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
    msg << " : qthread_num_workers = " << qthread_num_workers();

    if ( ! ok_symmetry ) {
      // List the per-shepherd worker counts so the asymmetry is visible.
      msg << " : qthread_num_workers_local = {" ;
      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
        msg << " " << qthread_num_workers_local(i) ;
      msg << " }" ;

    // Reset the cached layout before reporting the failure.
    Impl::s_number_workers   = 0 ;
    Impl::s_number_shepherds = 0 ;
    Impl::s_number_workers_per_shepherd = 0 ;

    // Only shut the runtime down if it was started successfully.
    if ( ok_init ) { qthread_finalize(); }

    Kokkos::Impl::throw_runtime_exception( msg.str() );

  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );

  // Init the array used for arbitrarily sized atomics

// Test that writeFF waits for empty var to be filled, writes, and leaves full.
// Requires that only one worker is running. Basically does:
//     1: empty var
//     1: fork(writeFF)
//     1: yields
//     2: starts runnning
//     2: hits writeFF, and yields since var is empty
//     1: writeEF
//     1: hits readFF on forked task and yield
//     2: running again, finishes writeFF, task returns
//     1: readFF competes, finishes
static void testWriteFFWaits(void)
    aligned_t ret;
    assert(qthread_num_workers() == 1);

    iprintf("1: Forking writeFF wrapper\n");
    qthread_fork_to(writeFF_wrapper, NULL, &ret, qthread_shep());
    iprintf("1: Forked, now yielding to 2\n");
    iprintf("1: Back from yield\n");

    // verify that writeFF has not completed
    assert(qthread_feb_status(&concurrent_t) == 0);
    assert(concurrent_t != 55);

    iprintf("1: Writing EF\n");
    qthread_writeEF_const(&concurrent_t, 35);

    // wait for writeFF wrapper to complete
    qthread_readFF(NULL, &ret);

    // veify that writeFF completed and that FEB is full
    iprintf("1: concurrent_t=%d\n", concurrent_t);
    assert(qthread_feb_status(&concurrent_t) == 1);
    assert(concurrent_t == 55);
// Test driver: starts Qthreads with a single shepherd, reports the layout and
// runs the writeFF wait test.
int main(int argc,
         char *argv[])
{
    // Initialize outside assert(): with NDEBUG the whole assert expression is
    // removed, which would silently skip runtime start-up.
    const int init_status = qthread_init(1);

    assert(init_status == 0);
    (void)init_status;
    (void)argc;
    (void)argv;

    iprintf("%i shepherds...\n", qthread_num_shepherds());
    iprintf("  %i threads total\n", qthread_num_workers());

    // NOTE(review): the test invocations are not visible in this excerpt;
    // testWriteFFWaits() is the only test routine shown, so it is called here.
    testWriteFFWaits();

    return 0;
}

Beispiel #4
* Unbalanced Tree Search v2.1                        *
* Based on the implementation available at           *
*  *

# include "config.h" /* for _GNU_SOURCE */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h> /* for INT_MAX */
#include <math.h>   /* for floor, log, sin */
#include <qthread/qthread.h>
#include <qthread/qtimer.h>
#include "argparsing.h"

#define BRG_RNG // Select RNG
#include "../../utils/rng/rng.h"

#define PRINT_STATS 1


// Tree generation model; selected through the tree_type global.
// NOTE(review): only BIN is visible here, yet GEO, HYBRID and BALANCED are
// referenced elsewhere in this file -- enumerators appear to be missing from
// this excerpt.
typedef enum {
    BIN = 0,
} tree_t;
// Display names indexed by tree_t (read as type_names[tree_type]).
// NOTE(review): the initializer entries and closing brace are not visible.
static char *type_names[] = {

// Shape function selector; selected through the shape_fn global.
// NOTE(review): only LINEAR is visible, yet FIXED is referenced elsewhere in
// this file -- enumerators appear to be missing from this excerpt.
typedef enum {
    LINEAR = 0,
} shape_t;
// Display names indexed by shape_t (read as shape_names[shape_fn]).
// NOTE(review): the closing brace is not visible; verify that the number of
// entries matches the number of shape_t enumerators.
static char *shape_names[] = {
    "Linear decrease",
    "Exponential decrease",
    "Fixed branching factor"

// Argument record for one visit() task. A child record is handed to the
// runtime by value (sizeof(node_t) bytes) when the child task is forked, so
// the field layout must not change without reviewing that call.
typedef struct {
    int            height; // Depth of node in the tree
    struct state_t state;  // Local RNG state
    int            num_children; // Number of child tasks this node spawns
    aligned_t     *acc;    // Where this node writes its subtree node count
    aligned_t     *dc;     // Completion counter incremented when this node finishes
    aligned_t      expect; // Counter value that visit() compares *dc against
} node_t;

// Default values
static tree_t  tree_type     = GEO;         // Tree model; printed as "tree-type"
static double  bf_0          = 4.0;         // Root branching factor; printed as "root-bf"
static int     root_seed     = 0;           // Seed passed to rng_init() for the root node
static int     num_samples   = 1;           // rng_spawn() repetitions per child; printed as "compute-granularity"
static int     tree_depth    = 6;           // Printed as "gen_mx"
static shape_t shape_fn      = LINEAR;      // Shape function; printed as "shape-fn"
static int     non_leaf_bf   = 4;           // Children of a non-leaf node in the binomial rule ("m")
static double  non_leaf_prob = 15.0 / 64.0; // Probability that a node is a non-leaf ("q")
static double  shift_depth   = 0.5;         // Fraction of tree_depth reported as "root-to-depth"

// Tree metrics
// NOTE(review): no visible statement updates these two counters; they are only
// read when the results are printed.
static uint64_t tree_height = 0;
static uint64_t num_leaves  = 0;

// Maps a raw RNG draw onto [0, 1] by dividing by INT_MAX.
//
// n: value drawn from the RNG; expected to be non-negative.
// Returns n / INT_MAX as a double. A negative n is reported on stdout and
// mapped to 0.0.
static double normalize(int n)
{
    if (n < 0) {
        printf("*** toProb: rand n = %d out of range\n", n);
        return 0.0;
    }

    return (double)n / (double)INT_MAX;
}

// Binomial branching rule: draws one value from the parent's RNG state and
// returns non_leaf_bf children when the normalized draw falls below
// non_leaf_prob, otherwise 0 (the node is a leaf).
static int calc_num_children_bin(node_t *parent)
{
    const int    draw        = rng_rand(parent->state.state);
    const double probability = normalize(draw);

    return (probability < non_leaf_prob) ? non_leaf_bf : 0;
}

// Computes how many children a node spawns.
//
// The root (height 0) gets floor(bf_0) children, capped at ceil(bf_0). Every
// other node uses the binomial rule, capped at MAXNUMCHILDREN. A message is
// printed on stdout whenever the cap is applied.
static int calc_num_children(node_t *parent)
{
    const int is_root      = (parent->height == 0);
    int       num_children = is_root ? (int)floor(bf_0)
                                     : calc_num_children_bin(parent);
    const int limit        = is_root ? (int)ceil(bf_0) : MAXNUMCHILDREN;

    if (num_children > limit) {
        printf("*** Number of children truncated from %d to %d\n",
               num_children, limit);
        num_children = limit;
    }

    return num_children;
}

// Task body that visits one tree node: forks one visit() task per child,
// waits on a local completion counter, sums the children's subtree sizes,
// stores the total through parent->acc and increments parent->dc.
// Always returns 0.
//
// Notes:
// -    Each task receives distinct copy of parent
// -    Copy of child is shallow, be careful with `state` member
//
// NOTE(review): block braces are not visible in this excerpt and the final
// `if` has no visible body; statements appear to be missing.
static aligned_t visit(void *args_)
    node_t  *parent          = (node_t *)args_;
    int      parent_height   = parent->height;
    int      num_children    = parent->num_children;
    aligned_t expect         = parent->expect;
    // NOTE(review): variable-length array; its length is 0 for leaf nodes.
    aligned_t num_descendants[num_children];
    // Start at 1 to count this node itself.
    aligned_t sum_descendants = 1;

    if (num_children != 0) {
        node_t     child __attribute__((aligned(8)));
        // Completion counter shared with all children through child.dc.
        aligned_t  donec = 0;

        // Spawn children, if any
        child.height = parent_height + 1;
        child.dc     = &donec;
        child.expect = num_children;


        for (int i = 0; i < num_children; i++) {
            // Each child reports its subtree size into its own slot.
            child.acc    = &num_descendants[i];

            // Derive the child's RNG state; repeated num_samples times.
            for (int j = 0; j < num_samples; j++) {
                rng_spawn(parent->state.state, child.state.state, i);

            child.num_children = calc_num_children(&child);

            // The child record is passed by value (sizeof(node_t) bytes), so
            // reusing `child` for the next iteration is safe.
            qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL);

        // Wait for children to finish up, accumulate descendants counts
        // NOTE(review): `expect` here is this node's own parent->expect, while
        // the children were given child.expect = num_children -- confirm the
        // comparison uses the intended value.
        if (donec != expect) qthread_readFF(NULL, &donec);

        for (int i = 0; i < num_children; i++) {
            sum_descendants += num_descendants[i];

    // Publish this subtree's size, then signal completion to the parent.
    *parent->acc = sum_descendants;
    // NOTE(review): the body of this `if` is not visible in this excerpt.
    if (qthread_incr(parent->dc, 1) + 1 == expect) {

    return 0;

static void print_stats(void)
    printf("tree-type %d\ntree-type-name %s\n",
           tree_type, type_names[tree_type]);
    printf("root-bf %.1f\nroot-seed %d\n",
           bf_0, root_seed);

    if ((tree_type == GEO) || (tree_type == HYBRID)) {
        printf("gen_mx %d\nshape-fn %d\nshape-fn-name %s\n",
               tree_depth, shape_fn, shape_names[shape_fn]);

    if ((tree_type == BIN) || (tree_type == HYBRID)) {
        double q  = non_leaf_prob;
        int    m  = non_leaf_bf;
        double es = (1.0 / (1.0 - q * m));
        printf("q %f\nm %d\nE(n) %f\nE(s) %.2f\n",
               q, m, q * m, es);

    if (tree_type == HYBRID) {
        printf("root-to-depth %d\n",
               (int)ceil(shift_depth * tree_depth));

    if (tree_type == BALANCED) {
        printf("gen_mx %d\n", tree_depth);
        printf("expected-num-nodes %llu\nexpected-num-leaves %llu\n",
               (unsigned long long)((pow(bf_0, tree_depth + 1) - 1.0) / (bf_0 - 1.0)),
               (unsigned long long)pow(bf_0, tree_depth));

    printf("compute-granularity %d\n", num_samples);
    printf("num-sheps %d\n", qthread_num_shepherds());
    printf("num-workers %d\n", qthread_num_workers());



#else /* ifdef PRINT_STATS */
static void print_banner(void)
    printf("UTS - Unbalanced Tree Search 2.1 (C/Qthreads)\n");
    printf("Tree type:%3d (%s)\n", tree_type, type_names[tree_type]);
    printf("Tree shape parameters:\n");
    printf("  root branching factor b_0 = %.1f, root seed = %d\n",
           bf_0, root_seed);

    if ((tree_type == GEO) || (tree_type == HYBRID)) {
        printf("  GEO parameters: gen_mx = %d, shape function = %d (%s)\n",
               tree_depth, shape_fn, shape_names[shape_fn]);

    if ((tree_type == BIN) || (tree_type == HYBRID)) {
        double q  = non_leaf_prob;
        int    m  = non_leaf_bf;
        double es = (1.0 / (1.0 - q * m));
        printf("  BIN parameters: q = %f, m = %d, E(n) = %f, E(s) = %.2f\n",
               q, m, q * m, es);

    if (tree_type == HYBRID) {
        printf("  HYBRID: GEO from root to depth %d, then BIN\n",
               (int)ceil(shift_depth * tree_depth));

    if (tree_type == BALANCED) {
        printf("  BALANCED parameters: gen_mx = %d\n", tree_depth);
        printf("    Expected size: %llu nodes, %llu leaves\n",
               (unsigned long long)((pow(bf_0, tree_depth + 1) - 1.0) / (bf_0 - 1.0)),
               (unsigned long long)pow(bf_0, tree_depth));

    printf("Random number generator: ");
    printf("SHA-1 (state size = %ldB)\n", sizeof(struct state_t));
    printf("Compute granularity: %d\n", num_samples);
    printf("Execution strategy:\n");
    printf("  Shepherds: %d\n", qthread_num_shepherds());
    printf("  Workers:   %d\n", qthread_num_workers());


Beispiel #5
int main(int   argc,
         char *argv[])
    assert(qthread_initialize() == QTHREAD_SUCCESS);
    NUMARG(numincrs, "NUM_INCRS");
    // future_init(128);
    iprintf("%i shepherds\n", qthread_num_shepherds());
    iprintf("%i threads\n", qthread_num_workers());

    qt_loop_balance_sinc(0, numincrs, sum, NULL);

    if (threads != numincrs) {
        iprintf("threads == %lu, not %lu\n", (unsigned long)threads, (unsigned long)numincrs);
    assert(threads == numincrs);

    return 0;
Beispiel #6
static void print_stats(void)
    printf("tree-type %d\ntree-type-name %s\n",
           tree_type, type_names[tree_type]);
    printf("root-bf %.1f\nroot-seed %d\n",
           bf_0, root_seed);

    if ((tree_type == GEO) || (tree_type == HYBRID)) {
        printf("gen_mx %d\nshape-fn %d\nshape-fn-name %s\n",
               tree_depth, shape_fn, shape_names[shape_fn]);

    if ((tree_type == BIN) || (tree_type == HYBRID)) {
        double q  = non_leaf_prob;
        int    m  = non_leaf_bf;
        double es = (1.0 / (1.0 - q * m));
        printf("q %f\nm %d\nE(n) %f\nE(s) %.2f\n",
               q, m, q * m, es);

    if (tree_type == HYBRID) {
        printf("root-to-depth %d\n",
               (int)ceil(shift_depth * tree_depth));

    if (tree_type == BALANCED) {
        printf("gen_mx %d\n", tree_depth);
        printf("expected-num-nodes %llu\nexpected-num-leaves %llu\n",
               (unsigned long long)((pow(bf_0, tree_depth + 1) - 1.0) / (bf_0 - 1.0)),
               (unsigned long long)pow(bf_0, tree_depth));

    printf("compute-granularity %d\n", num_samples);
    printf("num-sheps %d\n", qthread_num_shepherds());
    printf("num-workers %d\n", qthread_num_workers());


Beispiel #7
// Test driver for qthread queues. Phase one forks a single `tobequeued` task,
// waits until the queue length is 1, then waits for the task's return value.
// Phase two repeats this with THREADS_ENQUEUED tasks. The globals the_queue,
// threads_in and awoke are checked along the way. Returns EXIT_SUCCESS.
//
// NOTE(review): block braces are not visible in this excerpt, and no statement
// follows the "Releasing ..." messages; the calls that release queued tasks
// appear to be missing. As shown, the readFF calls would wait on tasks that
// were never released.
int main(int   argc,
         char *argv[])
    aligned_t return_value = 0;
    int status, ret;

    CHECK_VERBOSE(); // part of the testing harness; toggles iprintf() output

    status = qthread_initialize();
    assert(status == QTHREAD_SUCCESS);

    iprintf("%i shepherds...\n", qthread_num_shepherds());
    iprintf("  %i threads total\n", qthread_num_workers());

    iprintf("Creating the queue...\n");
    the_queue = qthread_queue_create(QTHREAD_QUEUE_MULTI_JOIN_LENGTH, 0);

    iprintf("\tSINGLE THREAD TEST\n\n");

    iprintf("1/4 Spawning thread to be queued...\n");
    status = qthread_fork(tobequeued, NULL, &return_value);
    assert(status == QTHREAD_SUCCESS);

    iprintf("2/4 Waiting for thread to queue itself...\n");
    // Yield until the forked task has placed itself on the queue.
    while(qthread_queue_length(the_queue) != 1) qthread_yield();
    assert(qthread_readstate(NODE_BUSYNESS) == 1);

    iprintf("3/4 Releasing the queue...\n");
    // NOTE(review): no release call is visible here.

    // Block until the forked task has written its return value.
    ret = qthread_readFF(NULL, &return_value);
    assert(ret == QTHREAD_SUCCESS);

    assert(threads_in == 1);
    assert(awoke == 1);
    assert(qthread_queue_length(the_queue) == 0);
    assert(qthread_readstate(NODE_BUSYNESS) == 1);
    iprintf("4/4 Test passed!\n");

    iprintf("\tMULTI THREAD TEST\n\n");

    // Reset the shared counters for the second phase.
    threads_in = 0;
    awoke = 0;
    // One return-value slot per forked task.
    // NOTE(review): the allocation is not checked and no free() is visible.
    aligned_t *retvals = malloc(sizeof(aligned_t) * THREADS_ENQUEUED);
    iprintf("1/6 Spawning %u threads to be queued...\n", THREADS_ENQUEUED);
    for (int i=0; i<THREADS_ENQUEUED; i++) {
        status = qthread_fork(tobequeued, NULL, retvals + i);
        assert(status == QTHREAD_SUCCESS);

    iprintf("2/6 Waiting for %u threads to queue themselves...\n", THREADS_ENQUEUED);
    // Yield until every forked task has placed itself on the queue.
    while(qthread_queue_length(the_queue) != THREADS_ENQUEUED) qthread_yield();
    assert(threads_in == THREADS_ENQUEUED);
    assert(qthread_readstate(NODE_BUSYNESS) == 1);

    iprintf("3/6 Releasing a single thread...\n");
    // NOTE(review): no release call is visible here.

    iprintf("4/6 Waiting for that thread to exit\n");
    while (awoke == 0) qthread_yield();

    assert(qthread_queue_length(the_queue) == (THREADS_ENQUEUED - 1));
    assert(qthread_readstate(NODE_BUSYNESS) == 1);

    iprintf("5/6 Releasing the rest of the threads...\n");
    // NOTE(review): no release call is visible here.

    // Wait for every task's return value.
    for (int i=0; i<THREADS_ENQUEUED; i++) {
        ret = qthread_readFF(NULL, retvals + i);
        assert(ret == QTHREAD_SUCCESS);

    assert(qthread_queue_length(the_queue) == 0);
    assert(qthread_readstate(NODE_BUSYNESS) == 1);

    iprintf("6/6 Test passed!\n");

    return EXIT_SUCCESS;
Beispiel #8
// Returns the value of qthread_num_workers() converted to uint32_t.
uint32_t chpl_task_getNumThreads(void)
{
    return (uint32_t)qthread_num_workers();
}
Beispiel #9
int main(int argc, char *argv[])
    aligned_t *ui_array, *ui_array2;
    double *d_array, *d_array2;
    size_t len = 1000000;
    qtimer_t timer = qtimer_create();
    double cumulative_time_qutil = 0.0;
    double cumulative_time_libc = 0.0;
    int using_doubles = 0;
    unsigned long iterations = 10;


    printf("%i threads\n", (int)qthread_num_workers());
    NUMARG(len, "TEST_LEN");
    NUMARG(iterations, "TEST_ITERATIONS");
    NUMARG(using_doubles, "TEST_USING_DOUBLES");
    printf("using %s\n", using_doubles ? "doubles" : "aligned_ts");

    if (using_doubles) {
        d_array = calloc(len, sizeof(double));
	printf("array is %s\n", human_readable(len * sizeof(double)));
        // madvise(d_array,len*sizeof(double), MADV_SEQUENTIAL);
        for (unsigned int i = 0; i < len; i++) {
            d_array[i] = ((double)random()) / ((double)RAND_MAX) + random();
        d_array2 = calloc(len, sizeof(double));
        // madvise(d_array2,len*sizeof(double), MADV_RANDOM);
        iprintf("double array generated...\n");
        for (unsigned int i = 0; i < iterations; i++) {
            memcpy(d_array2, d_array, len * sizeof(double));
            qutil_qsort(d_array2, len);
            cumulative_time_qutil += qtimer_secs(timer);
            iprintf("\t%u: sorting %lu doubles with qutil took: %f seconds\n",
                    i, (unsigned long)len, qtimer_secs(timer));
        cumulative_time_qutil /= (double)iterations;
        printf("sorting %lu doubles with qutil took: %f seconds (avg)\n",
               (unsigned long)len, cumulative_time_qutil);
        for (unsigned int i = 0; i < iterations; i++) {
            memcpy(d_array2, d_array, len * sizeof(double));
            qsort(d_array2, len, sizeof(double), dcmp);
            cumulative_time_libc += qtimer_secs(timer);
            iprintf("\t%u: sorting %lu doubles with libc took: %f seconds\n",
                    i, (unsigned long)len, qtimer_secs(timer));
	cumulative_time_libc /= (double)iterations;
        printf("sorting %lu doubles with libc took: %f seconds\n",
               (unsigned long)len, cumulative_time_libc);
    } else {
        ui_array = calloc(len, sizeof(aligned_t));
	printf("array is %s\n", human_readable(len * sizeof(aligned_t)));
        for (unsigned int i = 0; i < len; i++) {
            ui_array[i] = random();
        ui_array2 = calloc(len, sizeof(aligned_t));
        iprintf("ui_array generated...\n");
        for (int i = 0; i < iterations; i++) {
            memcpy(ui_array2, ui_array, len * sizeof(aligned_t));
            qutil_aligned_qsort(ui_array2, len);
            cumulative_time_qutil += qtimer_secs(timer);
	cumulative_time_qutil /= (double)iterations;
        printf("sorting %lu aligned_ts with qutil took: %f seconds\n",
               (unsigned long)len, cumulative_time_qutil);
        for (int i = 0; i < iterations; i++) {
            memcpy(ui_array2, ui_array, len * sizeof(aligned_t));
            qsort(ui_array2, len, sizeof(double), acmp);
            cumulative_time_libc += qtimer_secs(timer);
	cumulative_time_libc /= (double)iterations;
        printf("sorting %lu aligned_ts with libc took: %f seconds (avg)\n",
               (unsigned long)len, cumulative_time_libc);
    if (cumulative_time_qutil < cumulative_time_libc) {
	printf("qutil with %lu threads provides a %0.2fx speedup.\n", (unsigned long)qthread_num_shepherds(), cumulative_time_libc/cumulative_time_qutil);
    } else {
	printf("qutil with %lu threads provides a %0.2fx slowdown.\n", (unsigned long)qthread_num_shepherds(), cumulative_time_libc/cumulative_time_qutil);


    return 0;
Beispiel #10
// Scans the hazard-pointer slots and processes the entries on hfl's free list:
// an entry whose pointer is found among the collected hazard pointers is
// copied to a temporary list, every other entry is released through its
// freefunc. The temporary list is then copied back into hfl.
//
// hfl: the free list to process; asserted to hold exactly freelist_max entries.
//
// NOTE(review): this excerpt has lost lines. Block braces are not visible, the
// memcpy/memset calls in stage 1 show only two arguments, and no visible
// statement increments tmpfreelist.count. Confirm against the upstream file
// before relying on any detail below.
static void hazardous_scan(hazard_freelist_t *hfl)
    // Total number of per-worker hazard-pointer slots.
    const size_t num_hps = qthread_num_workers() * HAZARD_PTRS_PER_SHEP;
    // Scratch array sized for the per-worker slots plus hzptr_list_len extra entries.
    void            **plist = MALLOC(sizeof(void *) * (num_hps + hzptr_list_len));
    hazard_freelist_t tmpfreelist;

    // NOTE(review): the allocation result is not checked in the visible lines.
    tmpfreelist.freelist = calloc(freelist_max, sizeof(hazard_freelist_entry_t));
    do {
        /* Stage 1: Collect hazardpointers */
            qthread_shepherd_id_t i;
            for (i = 0; i < qthread_num_shepherds(); ++i) {
                for (qthread_worker_id_t j = 0; j < qlib->nworkerspershep; ++j) {
                    // Slots of every worker other than the owner of hfl are
                    // copied; the owner's slots take the memset branch.
                    if (&(qlib->shepherds[i].workers[j].hazard_free_list) != hfl) {
                        // NOTE(review): source argument not visible.
                        memcpy(plist + (i * qlib->nworkerspershep * HAZARD_PTRS_PER_SHEP) + (j * HAZARD_PTRS_PER_SHEP),
                               sizeof(void *) * HAZARD_PTRS_PER_SHEP);
                    } else {
                        // NOTE(review): fill-value argument not visible.
                        memset(plist + (i * qlib->nworkerspershep * HAZARD_PTRS_PER_SHEP) + (j * HAZARD_PTRS_PER_SHEP),
                               sizeof(void *) * HAZARD_PTRS_PER_SHEP);
            // Walk the chained hzptr_list blocks; the element at index
            // HAZARD_PTRS_PER_SHEP of each block is used as the next pointer.
            uintptr_t *hzptr_tmp = QTHREAD_CASLOCK_READ(hzptr_list);
            while (hzptr_tmp != NULL) {
                // NOTE(review): source argument not visible, and no visible
                // statement advances `i` inside this loop.
                memcpy(plist + (i * qlib->nworkerspershep * HAZARD_PTRS_PER_SHEP),
                       sizeof(uintptr_t) * HAZARD_PTRS_PER_SHEP);
                hzptr_tmp = (uintptr_t *)hzptr_tmp[HAZARD_PTRS_PER_SHEP];

        /* Stage 2: free pointers that are not in the set of hazardous pointers */
        tmpfreelist.count = 0;
        // Sort so that membership can be tested with binary_search below.
        qsort(plist, num_hps, sizeof(void *), void_cmp);
        assert(hfl->count == freelist_max);
        for (size_t i = 0; i < freelist_max; ++i) {
            const uintptr_t ptr = (uintptr_t)hfl->freelist[i].ptr;
            // A null pointer ends the scan of the free list.
            if (ptr == 0) { break; }
            /* look for this ptr in the plist */
            if (binary_search((uintptr_t *)plist, ptr, num_hps)) {
                /* if found, cannot free it */
                tmpfreelist.freelist[tmpfreelist.count] = hfl->freelist[i];
            } else {
                /* not found, therefore, we can free it */
                hfl->freelist[i].freefunc((void *)ptr);
        if (tmpfreelist.count == freelist_max) {
            /* This will ONLY happen under *extremely* heavy contention. */
    // Repeat the scan while nothing could be freed.
    } while (tmpfreelist.count == freelist_max);
    assert(tmpfreelist.count < freelist_max);
    // Replace hfl's contents with the entries that could not be freed.
    memcpy(hfl->freelist, tmpfreelist.freelist, tmpfreelist.count * sizeof(hazard_freelist_entry_t));
    hfl->count = tmpfreelist.count;
    FREE(tmpfreelist.freelist, sizeof(hazard_freelist_entry_t));
    FREE(plist, sizeof(void *) * (num_hps + hzptr_list_len));
Beispiel #11
int main(int   argc,
         char *argv[])
    size_t     threads, i;
    aligned_t *rets;
    qtimer_t   t;
    unsigned int iter, iterations = 10;
    double tot = 0.0;

    assert(qthread_initialize() == 0);
    t = qtimer_create();

    NUMARG(iterations, "ITERATIONS");

    threads = qthread_num_workers();
    iprintf("%i shepherds...\n", qthread_num_shepherds());
    iprintf("%i threads...\n", (int)threads);

    initme = calloc(threads, sizeof(aligned_t));

    rets = malloc(threads * sizeof(aligned_t));

    iprintf("Creating a barrier to block %i threads\n", threads);
    wait_on_me = qt_barrier_create(threads, REGION_BARRIER, 0);     // all my spawnees plus me

    for (iter = 0; iter < iterations; iter++) {
        iprintf("%i: forking the threads\n", iter);
        for (i = 1; i < threads; i++) {
            void *arg[2] = {wait_on_me, (void*)(intptr_t)i};
            qthread_spawn(barrier_thread, arg, sizeof(void*)*2, rets + i, 0, NULL, i, 0);
        iprintf("%i: done forking the threads, entering the barrier\n", iter);
        qt_barrier_enter(wait_on_me, 0);
        iprintf("%i: main thread exited barrier in %f seconds\n", iter, qtimer_secs(t));
        tot += qtimer_secs(t);

        // reset
        initme_idx = 1;

        // check retvals
        for (i = 1; i < threads; i++) {
            qthread_readFF(NULL, rets + i);
            if (initme[i] != iter + 1) {
                iprintf("initme[%i] = %i (should be %i)\n", (int)i,
                        (int)initme[i], iter + 1);
            assert(initme[i] == iter + 1);

    iprintf("Average barrier time = %f\n", tot / iterations);

    iprintf("Destroying the barrier...\n");


    return 0;
Beispiel #12
// Configures and starts the Qthreads runtime for the Chapel tasking layer.
// In order: chooses the worker unit, derives the hardware-parallelism value
// (hwpar) from the Chapel thread-count setting, QTHREAD_HWPAR and the comm
// layer limit, exports the result through QT_* environment variables, sets the
// stack size, launches the initializer thread and waits for it, then installs
// optional reporting hooks.
//
// NOTE(review): block braces are not visible in this excerpt and the function
// ends on an `if` with no visible body; lines appear to be missing.
void chpl_task_init(void)
    chpl_bool we_set_worker_unit = false;
    int32_t   numThreadsPerLocale;
    int32_t   commMaxThreads;
    int32_t   hwpar;
    size_t    callStackSize;
    pthread_t initer;
    char      newenv_stack[100] = { 0 };
    char *noWorkSteal;

    // Set up available hardware parallelism.

    // Experience has shown that we hardly ever win by using more than
    // one PU per core, so default to that.  If this was explicitly
    // set by the user we won't override it, however.
    if (getenv("QTHREAD_WORKER_UNIT") == NULL) {
        // Remember whether QT_WORKER_UNIT was absent, so that it can be unset
        // again later; setenv with overwrite=0 leaves an existing value alone.
        we_set_worker_unit = (getenv("QT_WORKER_UNIT") == NULL);
        (void) setenv("QT_WORKER_UNIT", "core", 0);

    // Determine the thread count.  CHPL_RT_NUM_THREADS_PER_LOCALE has
    // the highest precedence but we limit it to the number of PUs.
    // QTHREAD_HWPAR has the next precedence.  We don't impose the
    // same limit on it, so it can be used to overload the hardware.
    // In either case the number of threads can be no greater than any
    // maximum imposed by the comm layer.  This limit is imposed
    // silently.
    numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale();
    commMaxThreads = chpl_comm_getMaxThreads();
    hwpar = 0;
    if (numThreadsPerLocale != 0) {
        int32_t numPUsPerLocale;

        hwpar = numThreadsPerLocale;

        numPUsPerLocale = chpl_numCoresOnThisLocale();
        if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) {
            if (2 == verbosity) {
                printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d "
                       "to prevent oversubscription of the system.\n",
                       hwpar, numPUsPerLocale);

            // Do not oversubscribe the system, use all available resources.
            hwpar = numPUsPerLocale;

        // Clamp to the comm layer limit when one is set.
        if (0 < commMaxThreads && commMaxThreads < hwpar) {
            hwpar = commMaxThreads;
    } else {
        // No Chapel setting: HWPAR is consulted only when a comm layer limit
        // exists, and is clamped to that limit.
        if (0 < commMaxThreads) {
            hwpar = qt_internal_get_env_num("HWPAR", 0, 0);
            if (commMaxThreads < hwpar) {
                hwpar = commMaxThreads;

    if (hwpar > 0) {
        char newenv[100];
        char *sched;

        // Unset relevant Qthreads environment variables.  Currently
        // QTHREAD_HWPAR has precedence over the QTHREAD_NUM_* ones,
        // but that isn't documented and may not be true forever, so
        // we unset them all.
        // The current check for scheduler and setting HWPAR or
        // NUM_SHEPHERDS/WORKERS_PER_SHEPHERD is just to experiment with
        // the performance of different schedulers. This is not production code
        // and if it's around after July 2014, yell at Elliot.
        // NOTE(review): no unsetenv call for the variables mentioned above is
        // visible in this excerpt.
        sched = getenv("CHPL_QTHREAD_SCHEDULER");
        if (sched != NULL && strncmp(sched, "nemesis", 7) == 0) {
            // Set environment variable for Qthreads
            snprintf(newenv, sizeof(newenv), "%i", (int)hwpar);
            setenv("QT_NUM_SHEPHERDS", newenv, 1);
            setenv("QT_NUM_WORKERS_PER_SHEPHERD", "1", 1);
            // Unset QT_WORKER_UNIT iff we set it.
            if (we_set_worker_unit) {
              (void) unsetenv("QT_WORKER_UNIT");
        } else {
            // Set environment variable for Qthreads
            snprintf(newenv, sizeof(newenv), "%i", (int)hwpar);
            setenv("QT_HWPAR", newenv, 1);

    // Precedence (high-to-low):
    // 1) Chapel minimum
    // In practice we never get to #2, because the Chapel minimum is
    // always > 0, but we cover that case as a backstop.
    // NOTE(review): item 2 of the list above is not visible in this excerpt.
    callStackSize = chpl_task_getMinCallStackSize();
    // callStackSize is a size_t, so this test is true only for 0.
    if (callStackSize <= 0)
        callStackSize = 1024 * 1024 * sizeof(size_t);
    snprintf(newenv_stack, 99, "%zu", callStackSize);
    setenv("QT_STACK_SIZE", newenv_stack, 1);

    // Turn on informative Qthreads setting messages with Chapel's verbose flag
    if (verbosity == 2) {
        setenv("QT_INFO", "1", 1);

    // Start the runtime on a separate pthread and spin until it signals readiness.
    pthread_create(&initer, NULL, initializer, NULL);
    while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY();

    // Now that Qthreads is up and running, make sure that the number
    // of workers is less than any comm layer limit.  This is mainly
    // checking that the default thread count without QTHREAD_HWPAR
    // being set is within any comm layer limit, because we can't
    // determine that default ahead of time.  Secondarily, it's a
    // sanity check on the thread count versus comm limit logic
    // above.
    // NOTE(review): the clamping above allows hwpar == commMaxThreads, while
    // this assertion requires strictly fewer workers -- confirm whether `<=`
    // is intended.
    assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads);

    // Install the SIGINT handler only when a report was requested.
    if (blockreport || taskreport) {
        if (signal(SIGINT, SIGINT_handler) == SIG_ERR) {
            perror("Could not register SIGINT handler");

    // Turn off work stealing if it was configured to be off
    noWorkSteal = getenv("CHPL_QTHREAD_NO_WORK_STEALING");
    if (noWorkSteal != NULL && strncmp(noWorkSteal, "yes", 3) == 0) {
Beispiel #13
// UTS benchmark driver: reads the tree parameters through the NUMARG/DBLARG
// argument helpers, starts Qthreads, forks visit() on the root node, waits for
// it to finish, and prints the tree size and timing results.
//
// NOTE(review): this excerpt has lost lines. Block braces are not visible,
// only the closing #endif of a PRINT_STATS conditional is present, and two
// printf calls pass fewer arguments than their format strings consume.
// Confirm against the upstream file.
int main(int   argc,
         char *argv[])
    uint64_t total_num_nodes = 0;
    qtimer_t timer;
    double   total_time = 0.0;


        // Read enum-typed options through an unsigned temporary and
        // range-check it before converting back to the enum type.
        unsigned int tmp = (unsigned int)tree_type;
        NUMARG(tmp, "UTS_TREE_TYPE");
        if (tmp <= BALANCED) {
            tree_type = (tree_t)tmp;
        } else {
            fprintf(stderr, "invalid tree type\n");
            return EXIT_FAILURE;
        tmp = (unsigned int)shape_fn;
        NUMARG(tmp, "UTS_SHAPE_FN");
        if (tmp <= FIXED) {
            shape_fn = (shape_t)tmp;
        } else {
            fprintf(stderr, "invalid shape function\n");
            return EXIT_FAILURE;
    DBLARG(bf_0, "UTS_BF_0");
    NUMARG(root_seed, "UTS_ROOT_SEED");
    NUMARG(tree_depth, "UTS_TREE_DEPTH");
    DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB");
    NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM");
    // NOTE(review): shift_depth is a double but is read with NUMARG while the
    // other doubles use DBLARG -- confirm which helper is intended.
    NUMARG(shift_depth, "UTS_SHIFT_DEPTH");
    NUMARG(num_samples, "UTS_NUM_SAMPLES");

    // If the operator did not attempt to set a stack size, force
    // a reasonable lower bound
    if (!getenv("QT_STACK_SIZE") && !getenv("QTHREAD_STACK_SIZE"))
        setenv("QT_STACK_SIZE", "32768", 0);

    // NOTE(review): the initialization call sits inside assert() and is
    // removed entirely when NDEBUG is defined.
    assert(qthread_initialize() == 0);


    timer = qtimer_create();

    // Build the root node: height 0, RNG seeded from root_seed, with its own
    // completion counter and accumulator.
    node_t root;
    root.height = 0;
    rng_init(root.state.state, root_seed);
    root.num_children = calc_num_children(&root);
    aligned_t donecount = 0;
    root.dc = &donecount;
    aligned_t tot = 0;
    root.acc = &tot;
    root.expect = 1;

    // Run the traversal and wait on the root's completion counter.
    // NOTE(review): no timer start/stop call is visible around the traversal.
    qthread_fork_syncvar(visit, &root, NULL);
    qthread_readFF(NULL, root.dc);
    total_num_nodes = tot;


    total_time = qtimer_secs(timer);


    // NOTE(review): the format has four conversions but only three arguments
    // are visible; the argument for "tree-depth %d" appears to be missing.
    printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n",
           (unsigned long)total_num_nodes,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    // NOTE(review): three conversions, two visible arguments; the argument for
    // "exec-time %.3f" appears to be missing.
    printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n",
           total_num_nodes / total_time,
           total_num_nodes / total_time / qthread_num_workers());
    // NOTE(review): same missing "tree depth" argument as above.
    printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n",
           (unsigned long)total_num_nodes,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    // NOTE(review): same missing wallclock-time argument as above.
    printf("Wallclock time = %.3f sec, performance = %.0f "
           "nodes/sec (%.0f nodes/sec per PE)\n\n",
           total_num_nodes / total_time,
           total_num_nodes / total_time / qthread_num_workers());
#endif /* ifdef PRINT_STATS */

    return 0;
Beispiel #14
GLT_func_prefix void glt_subthread_get_num(GLT_subthread *num) {
    *num = qthread_num_workers();