// Test that writeFF waits for an empty var to be filled, then writes and
// leaves the var full.
// Requires that only one worker is running. Basically does:
//     1: empty var
//     1: fork(writeFF)
//     1: yields
//     2: starts running
//     2: hits writeFF, and yields since var is empty
//     1: writeEF
//     1: hits readFF on forked task and yields
//     2: running again, finishes writeFF, task returns
//     1: readFF completes, finishes
static void testWriteFFWaits(void)
{
    aligned_t ret;

    // Start from a known value in an empty FEB so the forked writeFF must block.
    concurrent_t=45;
    qthread_empty(&concurrent_t);
    // The interleaving described above only holds with a single worker.
    assert(qthread_num_workers() == 1);

    iprintf("1: Forking writeFF wrapper\n");
    qthread_fork_to(writeFF_wrapper, NULL, &ret, qthread_shep());
    iprintf("1: Forked, now yielding to 2\n");
    qthread_yield();
    iprintf("1: Back from yield\n");

    // verify that writeFF has not completed
    // (55 is presumably the value writeFF_wrapper stores; the wrapper is not
    // shown here -- confirm against its definition)
    assert(qthread_feb_status(&concurrent_t) == 0);
    assert(concurrent_t != 55);

    iprintf("1: Writing EF\n");
    qthread_writeEF_const(&concurrent_t, 35);

    // wait for writeFF wrapper to complete
    qthread_readFF(NULL, &ret);

    // verify that writeFF completed and that FEB is full
    // The value is converted explicitly so the argument matches the %d
    // specifier whatever the width of the FEB word is.
    iprintf("1: concurrent_t=%d\n", (int)concurrent_t);
    assert(qthread_feb_status(&concurrent_t) == 1);
    assert(concurrent_t == 55);
}
Beispiel #2
0
/*
 * Ask node `rank` to run the registered action `f` with a copy of `arg`.
 *
 * f        action to run; it must be present in both ptr_to_uid_hash and
 *          uid_to_ptr_hash, otherwise the process aborts.
 * arg      argument bytes to copy into the message (may be NULL only when
 *          arg_len is 0).
 * ret      optional return word; emptied here when non-NULL and its address
 *          is carried in the message.
 * rank     destination node.
 * arg_len  number of bytes at `arg`; must fit in the short message payload,
 *          otherwise the process aborts ("long remote fork unsupported").
 *
 * Returns whatever qthread_internal_net_driver_send() returns.
 */
int qthread_fork_remote(qthread_f   f,
                        const void *arg,
                        aligned_t  *ret,
                        int         rank,
                        size_t      arg_len)
{
    struct fork_msg_t msg;

    /* Arguments are cast so that each one matches its conversion specifier. */
    qthread_debug(MULTINODE_CALLS, "[%d] begin qthread_fork_remote(0x%lx, 0x%lx, 0x%lx, %d, %ld)\n",
                  my_rank, (unsigned long)f, (unsigned long)arg,
                  (unsigned long)ret, rank, (long)arg_len);

    if (NULL != ret) {
        qthread_empty(ret);
    }

    if (arg_len <= sizeof(msg.args)) {
        /* The whole struct is sent, so clear it first: otherwise the unused
         * tail of the payload (and any padding) would go out uninitialized. */
        memset(&msg, 0, sizeof(msg));

        msg.uid = (uint64_t)qt_hash_get(ptr_to_uid_hash, f);
        if (qt_hash_get(uid_to_ptr_hash, (qt_key_t)(uintptr_t)msg.uid) != f) {
            fprintf(stderr, "action not registered at source\n");
            abort();
        }
        msg.return_addr = (uint64_t)(uintptr_t)ret;
        msg.origin_node = my_rank;
        msg.arg_len     = arg_len;
        /* memcpy with a NULL source is undefined even for a zero length. */
        if (arg_len > 0) {
            memcpy(msg.args, arg, arg_len);
        }
        qthread_debug(MULTINODE_DETAILS, "[%d] remote fork %d %d 0x%lx %d\n",
                      my_rank, rank, (int)msg.uid,
                      (unsigned long)msg.return_addr, (int)msg.arg_len);
        return qthread_internal_net_driver_send(rank, SHORT_MSG_TAG, &msg, sizeof(msg));
    }

    fprintf(stderr, "long remote fork unsupported\n");
    abort();
}
Beispiel #3
0
static aligned_t update(void *arg)
{
    stencil_t *points = ((update_args_t *)arg)->points;
    size_t i = ((update_args_t *)arg)->i;
    size_t j = ((update_args_t *)arg)->j;
    size_t this_stage = ((update_args_t *)arg)->stage;
    size_t step = ((update_args_t *)arg)->step;

    size_t next_stage_id = next_stage(this_stage);

    // Perform local work
    perform_local_work();
    aligned_t **prev = points->stage[prev_stage(this_stage)];
    aligned_t sum = *(NORTH(prev, i, j)) 
                  + *(WEST(prev, i, j)) 
                  + *(HERE(prev, i, j)) 
                  + *(EAST(prev, i, j)) 
                  + *(SOUTH(prev, i, j));

    // Empty the next stage for this index
    qthread_empty(&points->stage[next_stage_id][i][j]);

    // Update this point
    qthread_writeEF_const(&points->stage[this_stage][i][j], sum/NUM_NEIGHBORS);
    
    if (step < num_timesteps) {
        // Spawn next stage
        update_args_t args = {points, i, j, next_stage_id, step+1};
#ifdef BOUNDARY_SYNC 
        qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, NUM_NEIGHBORS, NEIGHBORS(points->stage[this_stage],i,j));
#else
        if (i == 1) {                   // North edge
            if (j == 1)                     // West edge: EAST & SOUTH
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
            else if (j == points->M-2)      // East edge: WEST & SOUTH
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
            else                            // Interior: WEST & EAST & SOUTH
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, WEST(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
        } else if (i == points->N-2) {  // South edge
            if (j == 1)                     // West edge: NORTH & EAST
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j));
            else if (j == points->M-2)      // East edge: NORTH & WEST
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j));
            else                            // Interior: NORTH & WEST & EAST
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j));
        } else {                        // Interior
            if (j == 1)                     // West edge: NORTH & EAST & SOUTH
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3 , NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
            else if (j == points->M-2)      // East edge: NORTH & WEST & SOUTH
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
            else                            // Interior: ALL
                qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 4, NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j));
        }
#endif
    }
    else
        qt_feb_barrier_enter(points->barrier);

    return 0;
}
Beispiel #4
0
/*
 * Ask node `rank` to run the registered action `f` with a copy of `arg`.
 *
 * f        action to run; it must be present in both ptr_to_uid_hash and
 *          uid_to_ptr_hash, otherwise the process aborts.
 * arg      argument bytes to copy into the message (may be NULL only when
 *          arg_len is 0).
 * ret      optional return word; emptied here when non-NULL and its address
 *          is carried in the message.
 * rank     destination node.
 * arg_len  number of bytes at `arg`. Up to FORK_MSG_PAYLOAD bytes travel in a
 *          fixed-size short message; anything larger is sent as a
 *          heap-allocated long message that is freed before returning.
 *
 * Returns whatever qthread_internal_net_driver_send() returns. Aborts if the
 * long message cannot be allocated.
 */
int qthread_fork_remote(qthread_f   f,
                        const void *arg,
                        aligned_t  *ret,
                        int         rank,
                        size_t      arg_len)
{
    /* Arguments are cast so that each one matches its conversion specifier. */
    qthread_debug(MULTINODE_CALLS, "[%d] begin f=0x%lx, arg=0x%lx, ret=0x%lx, rank=%d, arg_len=%ld)\n",
                  my_rank, (unsigned long)f, (unsigned long)arg,
                  (unsigned long)ret, rank, (long)arg_len);

    uint64_t const uid = (uint64_t)qt_hash_get(ptr_to_uid_hash, f);
    if (qt_hash_get(uid_to_ptr_hash, (qt_key_t)(uintptr_t)uid) != f) {
        fprintf(stderr, "action not registered at source\n");
        abort();
    }

    if (NULL != ret) {
        qthread_empty(ret);
    }

    if (arg_len <= FORK_MSG_PAYLOAD) {
        struct fork_msg_t msg;

        /* The whole struct is sent, so clear it first: otherwise the unused
         * tail of the payload (and any padding) would go out uninitialized. */
        memset(&msg, 0, sizeof(msg));

        msg.uid         = uid;
        msg.return_addr = (uint64_t)(uintptr_t)ret;
        msg.origin_node = my_rank;
        msg.arg_len     = arg_len;
        /* memcpy with a NULL source is undefined even for a zero length. */
        if (arg_len > 0) {
            memcpy(msg.args, arg, arg_len);
        }
        qthread_debug(MULTINODE_DETAILS, "[%d] remote fork %d %d 0x%lx %d\n",
                      my_rank, rank, (int)msg.uid,
                      (unsigned long)msg.return_addr, (int)msg.arg_len);
        return qthread_internal_net_driver_send(rank, SHORT_MSG_TAG, &msg, sizeof(msg));
    } else {
        struct fork_long_msg_t *long_msg;

        /* Guard the header + payload size computation against wrap-around. */
        if (arg_len > SIZE_MAX - sizeof(struct fork_long_msg_t)) {
            fprintf(stderr, "remote fork argument too large\n");
            abort();
        }
        size_t const long_msg_size = sizeof(struct fork_long_msg_t) + arg_len;

        /* Checked explicitly rather than with assert(), which vanishes under
         * NDEBUG and would let a NULL pointer be dereferenced below. */
        long_msg = malloc(long_msg_size);
        if (NULL == long_msg) {
            fprintf(stderr, "out of memory allocating long remote fork message\n");
            abort();
        }

        long_msg->uid         = uid;
        long_msg->return_addr = (uint64_t)(uintptr_t)ret;
        long_msg->origin_node = my_rank;
        long_msg->arg_len     = arg_len;
        memcpy(&long_msg->args, arg, arg_len);

        qthread_debug(MULTINODE_DETAILS, "[%d] remote long fork rank=%d uid=%d return_addr=0x%lx arg_len=%d\n",
                      my_rank, rank, (int)long_msg->uid,
                      (unsigned long)long_msg->return_addr, (int)long_msg->arg_len);
        int const rc = qthread_internal_net_driver_send(rank, LONG_MSG_TAG, long_msg, long_msg_size);

        free(long_msg);

        return rc;
    }
}
Beispiel #5
0
// Reset `target` for the active threading backend:
//  - on __MTA__ builds, hand the word to purge();
//  - otherwise store 0, first marking the word empty through
//    qthread_empty() when USING_QTHREADS is enabled.
void mt_purge(T& target)
{
#if defined(__MTA__)
  purge(&target);
#else
#  if USING_QTHREADS
  qthread_empty(&target);
#  endif
  target = 0;
#endif
}
Beispiel #6
0
// Notes:
// -    Each task receives distinct copy of parent
// -    Copy of child is shallow, be careful with `state` member
//
// Visits one tree node: forks a task per child, waits for all of them, and
// reports the size of the subtree rooted at this node.
//
// args_ points to the node_t describing this node. On completion the subtree
// size (this node plus all descendants) is stored through parent->acc and the
// shared counter parent->dc is incremented; the task that brings the counter
// to parent->expect fills it, which releases whoever waits on it.
//
// Always returns 0.
static aligned_t visit(void *args_)
{
    node_t   *parent          = (node_t *)args_;
    int       parent_height   = parent->height;
    int       num_children    = parent->num_children;
    aligned_t expect          = parent->expect;
    aligned_t sum_descendants = 1;

    if (num_children > 0) {
        // Sized only once we know there is at least one child: a variable
        // length array with zero elements is undefined behavior.
        aligned_t  num_descendants[num_children];
        node_t     child __attribute__((aligned(8)));
        aligned_t  donec = 0;

        // Spawn children, if any
        child.height = parent_height + 1;
        child.dc     = &donec;
        child.expect = num_children;

        // Children count into donec; the last one to finish fills it.
        qthread_empty(&donec);

        for (int i = 0; i < num_children; i++) {
            child.acc    = &num_descendants[i];

            for (int j = 0; j < num_samples; j++) {
                rng_spawn(parent->state.state, child.state.state, i);
            }

            child.num_children = calc_num_children(&child);

            qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL);
        }

        // Wait for children to finish up, accumulate descendants counts.
        // The wait is unconditional: children write into this frame
        // (num_descendants, donec), so it must not be left before the last
        // child has filled donec. Comparing donec with this node's own
        // `expect` (a count that belongs to the parent's counter) could skip
        // the wait while children were still running.
        qthread_readFF(NULL, &donec);

        for (int i = 0; i < num_children; i++) {
            sum_descendants += num_descendants[i];
        }
    }

    *parent->acc = sum_descendants;
    if (qthread_incr(parent->dc, 1) + 1 == expect) {
        qthread_fill(parent->dc);
    }

    return 0;
}
Beispiel #7
0
//
// Blocking remote fork: launch a thread on `locale` that runs function `fid`,
// passing it `arg`, where the size of `arg` is given by `arg_size`. The call
// does not return until the completion word handed to spawn() has been filled.
// notes:
//   multiple forks to the same locale should be handled concurrently
//
void chpl_comm_fork(int locale, chpl_fn_int_t fid,
                    void *arg, int32_t arg_size, int32_t arg_tid)
{
    aligned_t done;   // completion word passed to spawn() and waited on below

    PROFILE_INCR(profile_comm_fork,1);
    PROFILE_BIN_INCR(profile_comm_fork_size,arg_size);

    qthread_debug(CHAPEL_CALLS, "[%d] begin locale=%d, fid=%d, arg_size=%d\n", chpl_localeID, locale, fid, arg_size);

    qthread_debug(CHAPEL_BEHAVIOR, "[%d] (blocking) forking fn %d with arg-size %d\n", chpl_localeID, fid, arg_size);

    // Mark the word empty before handing it out so the read below blocks
    // until it is filled.
    qthread_empty(&done);
    spawn(locale, fid, arg, arg_size, arg_tid, &done);
    qthread_readFF(NULL, &done);

    qthread_debug(CHAPEL_CALLS, "[%d] end locale=%d, fid=%d, arg_size=%d\n", chpl_localeID, locale, fid, arg_size);
}
Beispiel #8
0
/*
 * Set up multinode support: create the action lookup tables, register one
 * handler per message tag, bring up the network driver and record this
 * node's rank and the world size.
 *
 * Returns QTHREAD_SUCCESS, or the non-zero code reported by
 * qthread_internal_net_driver_initialize() when the driver fails to start.
 */
int qthread_multinode_initialize(void)
{
    qthread_debug(MULTINODE_CALLS, "begin qthread_multinode_initialize\n");

    /* initialize structures; rank and size stay at -1 until the driver is up */
    initialized     = 1;
    my_rank         = world_size = -1;
    uid_to_ptr_hash = qt_hash_create(0);
    ptr_to_uid_hash = qt_hash_create(0);

    /* one handler per message tag */
    qthread_internal_net_driver_register(SHORT_MSG_TAG, fork_msg_handler);
    qthread_internal_net_driver_register(LONG_MSG_TAG, fork_long_msg_handler);
    qthread_internal_net_driver_register(RETURN_MSG_TAG, return_msg_handler);
    qthread_internal_net_driver_register(RETURN_LONG_MSG_TAG, return_long_msg_handler);
    qthread_internal_net_driver_register(DIE_MSG_TAG, die_msg_handler);

    /* initialize the network driver and provide barrier */
    const int rc = qthread_internal_net_driver_initialize();
    if (rc != 0) {
        qthread_debug(MULTINODE_FUNCTIONS, "qthread_internal_net_driver_init failed: %d\n", rc);
        return rc;
    }

    my_rank    = qthread_internal_net_driver_get_rank();
    world_size = qthread_internal_net_driver_get_size();

    /* every node except rank 0 starts with time_to_die emptied */
    if (my_rank != 0) {
        qthread_empty(&time_to_die);
    }

    /* make sure we can clean up */
    qthread_internal_cleanup_early(net_cleanup);

    qthread_debug(MULTINODE_CALLS, "[%d] end qthread_multinode_initialize\n", my_rank);

    return QTHREAD_SUCCESS;
}
// Construct the task base object.
//
// Stores the deallocation and apply callbacks, remembers the address of the
// caller's active-task counter, and points m_dep at the dependence array that
// lives in the same allocation, padded_sizeof_derived(arg_sizeof_derived)
// bytes past `this`. The task starts in TASK_STATE_CONSTRUCTING with no
// dependences and a zero reference count.
Task::TaskMember( const function_dealloc_type  arg_dealloc ,
                  const function_single_type   arg_apply_single ,
                  const function_team_type     arg_apply_team ,
                  volatile int &               arg_active_count ,
                  const unsigned               arg_sizeof_derived ,
                  const unsigned               arg_dependence_capacity )
  : m_dealloc( arg_dealloc ) ,
    m_verify( & Task::verify_type<void> ) ,
    m_apply_single( arg_apply_single ) ,
    m_apply_team( arg_apply_team ) ,
    m_active_count( & arg_active_count ) ,
    m_qfeb(0) ,
    m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) ,
    m_dep_capacity( arg_dependence_capacity ) ,
    m_dep_size( 0 ) ,
    m_ref_count( 0 ) ,
    m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
  // Set to full when complete
  qthread_empty( & m_qfeb );

  // Null out every dependence slot.
  unsigned slot = 0 ;
  while ( slot < arg_dependence_capacity ) {
    m_dep[ slot ] = 0 ;
    ++slot ;
  }
}
Beispiel #10
0
int main(int argc, char *argv[])
{
    int n = 10;
    int m = 10;
    num_timesteps = 10;
    workload = 0;
    workload_per = 0;
    workload_var = 0;
    int print_final = 0;
    int alltime = 0;

    CHECK_VERBOSE();
    NUMARG(n, "N");
    NUMARG(m, "M");
    NUMARG(num_timesteps, "TIMESTEPS");
    NUMARG(workload, "WORKLOAD");
    NUMARG(workload_per, "WORKLOAD_PER");
    NUMARG(workload_var, "WORKLOAD_VAR");
    NUMARG(print_final, "PRINT_FINAL");
    NUMARG(alltime, "ALL_TIME");

    assert (n > 0 && m > 0);

    // Initialize Qthreads
    assert(qthread_initialize() == 0);

    qtimer_t alloc_timer = qtimer_create();
    qtimer_t init_timer = qtimer_create();
    qtimer_t exec_timer = qtimer_create();

    // Allocate memory for 3-stage stencil (with boundary padding)
    qtimer_start(alloc_timer);
    stencil_t points;
    points.N = n + 2;
    points.M = m + 2;

    for (int s = 0; s < NUM_STAGES; s++) {
        points.stage[s] = malloc(points.N*sizeof(aligned_t *));
        assert(NULL != points.stage[s]);
        for (int i = 0; i < points.N; i++) {
            points.stage[s][i] = calloc(points.M, sizeof(aligned_t));
            assert(NULL != points.stage[s][i]);
        }
    }
    qtimer_stop(alloc_timer);

    // Initialize first stage and set boundary conditions
    qtimer_start(init_timer);
    for (int i = 1; i < points.N-1; i++) {
        for (int j = 1; j < points.M-1; j++) {
            qthread_writeF_const(&points.stage[0][i][j], 0);
            for (int s = 1; s < NUM_STAGES; s++)
                qthread_empty(&points.stage[s][i][j]);
        }
    }
    for (int i = 0; i < points.N; i++) {
        for (int s = 0; s < NUM_STAGES; s++) {
#ifdef BOUNDARY_SYNC
            qthread_writeF_const(&points.stage[s][i][0], BOUNDARY);
            qthread_writeF_const(&points.stage[s][i][points.M-1], BOUNDARY);
#else
            points.stage[s][i][0] = BOUNDARY;
            points.stage[s][i][points.M-1] = BOUNDARY;
#endif
        }
    }
    for (int j = 0; j < points.M; j++) {
        for (int s = 0; s < NUM_STAGES; s++) {
#ifdef BOUNDARY_SYNC
            qthread_writeF_const(&points.stage[s][0][j], BOUNDARY);
            qthread_writeF_const(&points.stage[s][points.N-1][j], BOUNDARY);
#else
            points.stage[s][0][j] = BOUNDARY;
            points.stage[s][points.N-1][j] = BOUNDARY;
#endif
        }
    }
    qtimer_stop(init_timer);

    // Create barrier to synchronize on completion of calculations
    qtimer_start(exec_timer);
    points.barrier = qt_feb_barrier_create(n*m+1);

    // Spawn tasks to start calculating updates at each point
    update_args_t args = {&points, -1, -1, 1, 1};
    for (int i = 1; i < points.N-1; i++) {
        for (int j = 1; j < points.M-1; j++) {
            args.i = i;
            args.j = j;
            qthread_fork_syncvar_copyargs(update, &args, sizeof(update_args_t), NULL);
        }
    }

    // Wait for calculations to finish
    qt_feb_barrier_enter(points.barrier);
    qtimer_stop(exec_timer);

    // Print timing info
    if (alltime) {
        fprintf(stderr, "Allocation time: %f\n", qtimer_secs(alloc_timer));
        fprintf(stderr, "Initialization time: %f\n", qtimer_secs(init_timer));
        fprintf(stderr, "Execution time: %f\n", qtimer_secs(exec_timer));
    } else {
        fprintf(stdout, "%f\n", qtimer_secs(exec_timer));
    }

    // Print stencils
    if (print_final) {
        size_t final = (num_timesteps % NUM_STAGES);
        iprintf("Stage %lu:\n", prev_stage(prev_stage(final)));
        print_stage(&points, prev_stage(prev_stage(final)));
        iprintf("\nStage %lu:\n", prev_stage(final));
        print_stage(&points, prev_stage(final));
        iprintf("\nStage %lu:\n", final);
        print_stage(&points, final);
    }

    qt_feb_barrier_destroy(points.barrier);
    qtimer_destroy(alloc_timer);
    qtimer_destroy(init_timer);
    qtimer_destroy(exec_timer);

    // Free allocated memory
    for (int i = 0; i < points.N; i++) {
        free(points.stage[0][i]);
        free(points.stage[1][i]);
        free(points.stage[2][i]);
    }
    free(points.stage[0]);
    free(points.stage[1]);
    free(points.stage[2]);

    return 0;
}
Beispiel #11
0
inline int qthread_empty(const T *const dest)
{
    QTHREAD_CHECKSIZE(T);
    return qthread_empty((aligned_t *)dest);
}
Beispiel #12
0
/*
 * UTS (tree search) benchmark driver.
 *
 * Reads the tree parameters through NUMARG/DBLARG, initializes Qthreads,
 * forks a visit task on the root node, waits for the root's completion word
 * to be filled and prints the tree statistics and timing.
 *
 * Returns 0 on success, or EXIT_FAILURE for an invalid tree type or shape
 * function or when Qthreads cannot be initialized.
 */
int main(int   argc,
         char *argv[])
{
    uint64_t total_num_nodes = 0;
    qtimer_t timer;
    double   total_time = 0.0;

    CHECK_VERBOSE();

    {
        // Enum-valued settings are read through an unsigned temporary and
        // range-checked before being converted back.
        unsigned int tmp = (unsigned int)tree_type;
        NUMARG(tmp, "UTS_TREE_TYPE");
        if (tmp <= BALANCED) {
            tree_type = (tree_t)tmp;
        } else {
            fprintf(stderr, "invalid tree type\n");
            return EXIT_FAILURE;
        }
        tmp = (unsigned int)shape_fn;
        NUMARG(tmp, "UTS_SHAPE_FN");
        if (tmp <= FIXED) {
            shape_fn = (shape_t)tmp;
        } else {
            fprintf(stderr, "invalid shape function\n");
            return EXIT_FAILURE;
        }
    }
    DBLARG(bf_0, "UTS_BF_0");
    NUMARG(root_seed, "UTS_ROOT_SEED");
    NUMARG(tree_depth, "UTS_TREE_DEPTH");
    DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB");
    NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM");
    NUMARG(shift_depth, "UTS_SHIFT_DEPTH");
    NUMARG(num_samples, "UTS_NUM_SAMPLES");

    // If the operator did not attempt to set a stack size, force
    // a reasonable lower bound
    if (!getenv("QT_STACK_SIZE") && !getenv("QTHREAD_STACK_SIZE"))
        setenv("QT_STACK_SIZE", "32768", 0);

    // The call must not live inside assert(): with NDEBUG defined the whole
    // expression, including the call, is removed and Qthreads would never be
    // initialized.
    if (qthread_initialize() != 0) {
        fprintf(stderr, "qthread_initialize failed\n");
        return EXIT_FAILURE;
    }

#ifdef PRINT_STATS
    print_stats();
#else
    print_banner();
#endif

    timer = qtimer_create();
    qtimer_start(timer);

    // Build the root node. Its completion counter starts empty and expects a
    // single increment, from the root's own visit task.
    node_t root;
    root.height = 0;
    rng_init(root.state.state, root_seed);
    root.num_children = calc_num_children(&root);
    aligned_t donecount = 0;
    root.dc = &donecount;
    qthread_empty(&donecount);
    aligned_t tot = 0;
    root.acc = &tot;
    root.expect = 1;

    // Run the traversal and block until the root reports completion.
    qthread_fork_syncvar(visit, &root, NULL);
    qthread_readFF(NULL, root.dc);
    total_num_nodes = tot;

    qtimer_stop(timer);

    total_time = qtimer_secs(timer);

    qtimer_destroy(timer);

#ifdef PRINT_STATS
    printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n",
           (unsigned long)total_num_nodes,
           (int)tree_height,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n",
           total_time,
           total_num_nodes / total_time,
           total_num_nodes / total_time / qthread_num_workers());
#else
    printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n",
           (unsigned long)total_num_nodes,
           (int)tree_height,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    printf("Wallclock time = %.3f sec, performance = %.0f "
           "nodes/sec (%.0f nodes/sec per PE)\n\n",
           total_time,
           total_num_nodes / total_time,
           total_num_nodes / total_time / qthread_num_workers());
#endif /* ifdef PRINT_STATS */

    return 0;
}