// Test that writeFF waits for empty var to be filled, writes, and leaves full. // Requires that only one worker is running. Basically does: // 1: empty var // 1: fork(writeFF) // 1: yields // 2: starts runnning // 2: hits writeFF, and yields since var is empty // 1: writeEF // 1: hits readFF on forked task and yield // 2: running again, finishes writeFF, task returns // 1: readFF competes, finishes static void testWriteFFWaits(void) { aligned_t ret; concurrent_t=45; qthread_empty(&concurrent_t); assert(qthread_num_workers() == 1); iprintf("1: Forking writeFF wrapper\n"); qthread_fork_to(writeFF_wrapper, NULL, &ret, qthread_shep()); iprintf("1: Forked, now yielding to 2\n"); qthread_yield(); iprintf("1: Back from yield\n"); // verify that writeFF has not completed assert(qthread_feb_status(&concurrent_t) == 0); assert(concurrent_t != 55); iprintf("1: Writing EF\n"); qthread_writeEF_const(&concurrent_t, 35); // wait for writeFF wrapper to complete qthread_readFF(NULL, &ret); // veify that writeFF completed and that FEB is full iprintf("1: concurrent_t=%d\n", concurrent_t); assert(qthread_feb_status(&concurrent_t) == 1); assert(concurrent_t == 55); }
/*
 * Ask node `rank` to run the registered action `f`.
 *
 *   f        action to run; it must have a uid in ptr_to_uid_hash that maps
 *            back to `f` in uid_to_ptr_hash, otherwise the process aborts
 *   arg      argument bytes copied into the message (may be NULL when
 *            arg_len is 0)
 *   ret      optional return-value word; emptied here before sending
 *   rank     destination node
 *   arg_len  number of bytes at `arg`; must fit in the short message payload,
 *            otherwise the process aborts ("long remote fork unsupported")
 *
 * Returns whatever qthread_internal_net_driver_send() returns.
 */
int qthread_fork_remote(qthread_f   f,
                        const void *arg,
                        aligned_t  *ret,
                        int         rank,
                        size_t      arg_len)
{
    struct fork_msg_t msg;

    qthread_debug(MULTINODE_CALLS,
                  "[%d] begin qthread_fork_remote(0x%lx, 0x%lx, 0x%lx, %d, %lu)\n",
                  my_rank,
                  (unsigned long)f,
                  (unsigned long)arg,
                  (unsigned long)ret,
                  rank,
                  (unsigned long)arg_len);

    if (NULL != ret) {
        qthread_empty(ret);
    }

    if (arg_len > sizeof(msg.args)) {
        fprintf(stderr, "long remote fork unsupported\n");
        abort();
    }

    /* The whole struct goes over the wire, so clear it first: otherwise the
     * unused tail of msg.args and any padding would carry indeterminate
     * bytes to the remote node. */
    memset(&msg, 0, sizeof(msg));

    msg.uid = (uint64_t)qt_hash_get(ptr_to_uid_hash, f);
    if (qt_hash_get(uid_to_ptr_hash, (qt_key_t)(uintptr_t)msg.uid) != f) {
        fprintf(stderr, "action not registered at source\n");
        abort();
    }

    msg.return_addr = (uint64_t)(uintptr_t)ret;
    msg.origin_node = my_rank;
    msg.arg_len     = arg_len;
    if (arg_len > 0) {
        /* memcpy with a NULL source is undefined even for zero bytes */
        memcpy(msg.args, arg, arg_len);
    }

    /* Casts keep every variadic argument in step with its conversion. */
    qthread_debug(MULTINODE_DETAILS,
                  "[%d] remote fork %d %lu 0x%lx %lu\n",
                  my_rank,
                  rank,
                  (unsigned long)msg.uid,
                  (unsigned long)msg.return_addr,
                  (unsigned long)msg.arg_len);

    return qthread_internal_net_driver_send(rank, SHORT_MSG_TAG, &msg, sizeof(msg));
}
static aligned_t update(void *arg) { stencil_t *points = ((update_args_t *)arg)->points; size_t i = ((update_args_t *)arg)->i; size_t j = ((update_args_t *)arg)->j; size_t this_stage = ((update_args_t *)arg)->stage; size_t step = ((update_args_t *)arg)->step; size_t next_stage_id = next_stage(this_stage); // Perform local work perform_local_work(); aligned_t **prev = points->stage[prev_stage(this_stage)]; aligned_t sum = *(NORTH(prev, i, j)) + *(WEST(prev, i, j)) + *(HERE(prev, i, j)) + *(EAST(prev, i, j)) + *(SOUTH(prev, i, j)); // Empty the next stage for this index qthread_empty(&points->stage[next_stage_id][i][j]); // Update this point qthread_writeEF_const(&points->stage[this_stage][i][j], sum/NUM_NEIGHBORS); if (step < num_timesteps) { // Spawn next stage update_args_t args = {points, i, j, next_stage_id, step+1}; #ifdef BOUNDARY_SYNC qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, NUM_NEIGHBORS, NEIGHBORS(points->stage[this_stage],i,j)); #else if (i == 1) { // North edge if (j == 1) // West edge: EAST & SOUTH qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); else if (j == points->M-2) // East edge: WEST & SOUTH qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); else // Interior: WEST & EAST & SOUTH qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, WEST(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); } else if (i == points->N-2) { // South edge if (j == 1) // West edge: NORTH & EAST qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 2, NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j)); else if (j == points->M-2) // East edge: NORTH & WEST qthread_fork_copyargs_precond(update, &args, 
sizeof(update_args_t), NULL, 2, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j)); else // Interior: NORTH & WEST & EAST qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j)); } else { // Interior if (j == 1) // West edge: NORTH & EAST & SOUTH qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3 , NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); else if (j == points->M-2) // East edge: NORTH & WEST & SOUTH qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 3, NORTH(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); else // Interior: ALL qthread_fork_copyargs_precond(update, &args, sizeof(update_args_t), NULL, 4, NORTH(points->stage[this_stage],i,j), EAST(points->stage[this_stage],i,j), WEST(points->stage[this_stage],i,j), SOUTH(points->stage[this_stage],i,j)); } #endif } else qt_feb_barrier_enter(points->barrier); return 0; }
/*
 * Ask node `rank` to run the registered action `f`.
 *
 *   f        action to run; it must have a uid in ptr_to_uid_hash that maps
 *            back to `f` in uid_to_ptr_hash, otherwise the process aborts
 *   arg      argument bytes copied into the message (may be NULL when
 *            arg_len is 0)
 *   ret      optional return-value word; emptied here before sending
 *   rank     destination node
 *   arg_len  number of bytes at `arg`
 *
 * Arguments of at most FORK_MSG_PAYLOAD bytes travel in a fixed-size
 * fork_msg_t tagged SHORT_MSG_TAG. Larger arguments are packed into a
 * heap-allocated fork_long_msg_t tagged LONG_MSG_TAG, which is freed as soon
 * as the send call returns.
 *
 * Returns whatever qthread_internal_net_driver_send() returns. Aborts if the
 * action is not registered or the long message cannot be allocated.
 */
int qthread_fork_remote(qthread_f   f,
                        const void *arg,
                        aligned_t  *ret,
                        int         rank,
                        size_t      arg_len)
{
    qthread_debug(MULTINODE_CALLS,
                  "[%d] begin f=0x%lx, arg=0x%lx, ret=0x%lx, rank=%d, arg_len=%lu)\n",
                  my_rank,
                  (unsigned long)f,
                  (unsigned long)arg,
                  (unsigned long)ret,
                  rank,
                  (unsigned long)arg_len);

    uint64_t const uid = (uint64_t)qt_hash_get(ptr_to_uid_hash, f);
    if (qt_hash_get(uid_to_ptr_hash, (qt_key_t)(uintptr_t)uid) != f) {
        fprintf(stderr, "action not registered at source\n");
        abort();
    }

    if (NULL != ret) {
        qthread_empty(ret);
    }

    if (arg_len <= FORK_MSG_PAYLOAD) {
        struct fork_msg_t msg;

        /* The whole struct is transmitted; clear it so the unused tail of
         * msg.args and any padding do not carry indeterminate bytes. */
        memset(&msg, 0, sizeof(msg));

        msg.uid         = uid;
        msg.return_addr = (uint64_t)(uintptr_t)ret;
        msg.origin_node = my_rank;
        msg.arg_len     = arg_len;
        if (arg_len > 0) {
            /* memcpy with a NULL source is undefined even for zero bytes */
            memcpy(msg.args, arg, arg_len);
        }

        qthread_debug(MULTINODE_DETAILS,
                      "[%d] remote fork %d %lu 0x%lx %lu\n",
                      my_rank,
                      rank,
                      (unsigned long)msg.uid,
                      (unsigned long)msg.return_addr,
                      (unsigned long)msg.arg_len);

        return qthread_internal_net_driver_send(rank, SHORT_MSG_TAG, &msg, sizeof(msg));
    }

    /* Long message: header followed by arg_len bytes of payload. */
    if (arg_len > SIZE_MAX - sizeof(struct fork_long_msg_t)) {
        /* the size computation below would wrap and under-allocate */
        fprintf(stderr, "remote fork argument too large\n");
        abort();
    }
    size_t const long_msg_size = sizeof(struct fork_long_msg_t) + arg_len;

    /* An explicit check, not assert(): it must survive NDEBUG builds. */
    struct fork_long_msg_t *long_msg = malloc(long_msg_size);
    if (NULL == long_msg) {
        fprintf(stderr, "out of memory allocating long remote fork message\n");
        abort();
    }

    long_msg->uid         = uid;
    long_msg->return_addr = (uint64_t)(uintptr_t)ret;
    long_msg->origin_node = my_rank;
    long_msg->arg_len     = arg_len;
    memcpy(&long_msg->args, arg, arg_len);

    qthread_debug(MULTINODE_DETAILS,
                  "[%d] remote long fork rank=%d uid=%lu return_addr=0x%lx arg_len=%lu\n",
                  my_rank,
                  rank,
                  (unsigned long)long_msg->uid,
                  (unsigned long)long_msg->return_addr,
                  (unsigned long)long_msg->arg_len);

    int const rc = qthread_internal_net_driver_send(rank, LONG_MSG_TAG, long_msg, long_msg_size);

    /* NOTE(review): freeing here assumes the driver no longer needs the
     * buffer once send returns -- confirm against the net driver. */
    free(long_msg);
    return rc;
}
// Reset `target` to zero, using the platform's empty/purge primitive when
// one is configured:
//   - __MTA__ defined:         purge(&target) only
//   - USING_QTHREADS non-zero: qthread_empty(&target), then store 0
//   - otherwise:               plain store of 0
void mt_purge(T& target)
{
#ifdef __MTA__
  purge(&target);
#else
#  if USING_QTHREADS
  qthread_empty(&target);
#  endif
  target = 0;
#endif
}
// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static aligned_t visit(void *args_) { node_t *parent = (node_t *)args_; int parent_height = parent->height; int num_children = parent->num_children; aligned_t expect = parent->expect; aligned_t num_descendants[num_children]; aligned_t sum_descendants = 1; if (num_children != 0) { node_t child __attribute__((aligned(8))); aligned_t donec = 0; // Spawn children, if any child.height = parent_height + 1; child.dc = &donec; child.expect = num_children; qthread_empty(&donec); for (int i = 0; i < num_children; i++) { child.acc = &num_descendants[i]; for (int j = 0; j < num_samples; j++) { rng_spawn(parent->state.state, child.state.state, i); } child.num_children = calc_num_children(&child); qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL); } // Wait for children to finish up, accumulate descendants counts if (donec != expect) qthread_readFF(NULL, &donec); for (int i = 0; i < num_children; i++) { sum_descendants += num_descendants[i]; } } *parent->acc = sum_descendants; if (qthread_incr(parent->dc, 1) + 1 == expect) { qthread_fill(parent->dc); } return 0; }
// // remote fork should launch a thread on locale that runs function f // passing it arg where the size of arg is stored in arg_size // notes: // multiple forks to the same locale should be handled concurrently // void chpl_comm_fork(int locale, chpl_fn_int_t fid, void *arg, int32_t arg_size, int32_t arg_tid) { aligned_t ret; PROFILE_INCR(profile_comm_fork,1); PROFILE_BIN_INCR(profile_comm_fork_size,arg_size); qthread_debug(CHAPEL_CALLS, "[%d] begin locale=%d, fid=%d, arg_size=%d\n", chpl_localeID, locale, fid, arg_size); qthread_debug(CHAPEL_BEHAVIOR, "[%d] (blocking) forking fn %d with arg-size %d\n", chpl_localeID, fid, arg_size); qthread_empty(&ret); spawn(locale, fid, arg, arg_size, arg_tid, &ret); qthread_readFF(NULL, &ret); qthread_debug(CHAPEL_CALLS, "[%d] end locale=%d, fid=%d, arg_size=%d\n", chpl_localeID, locale, fid, arg_size); }
int qthread_multinode_initialize(void) { int ret; qthread_debug(MULTINODE_CALLS, "begin qthread_multinode_initialize\n"); /* initialize structures */ initialized = 1; my_rank = world_size = -1; uid_to_ptr_hash = qt_hash_create(0); ptr_to_uid_hash = qt_hash_create(0); qthread_internal_net_driver_register(SHORT_MSG_TAG, fork_msg_handler); qthread_internal_net_driver_register(LONG_MSG_TAG, fork_long_msg_handler); qthread_internal_net_driver_register(RETURN_MSG_TAG, return_msg_handler); qthread_internal_net_driver_register(RETURN_LONG_MSG_TAG, return_long_msg_handler); qthread_internal_net_driver_register(DIE_MSG_TAG, die_msg_handler); /* initialize the network driver and provie barrier */ ret = qthread_internal_net_driver_initialize(); if (0 != ret) { qthread_debug(MULTINODE_FUNCTIONS, "qthread_internal_net_driver_init failed: %d\n", ret); return ret; } my_rank = qthread_internal_net_driver_get_rank(); world_size = qthread_internal_net_driver_get_size(); if (0 != my_rank) { qthread_empty(&time_to_die); } /* make sure we can clean up */ qthread_internal_cleanup_early(net_cleanup); qthread_debug(MULTINODE_CALLS, "[%d] end qthread_multinode_initialize\n", my_rank); return QTHREAD_SUCCESS; }
// Construct a task in the CONSTRUCTING state.
//
// Stores the deallocation and apply callbacks and a pointer to the caller's
// active-task counter. The dependence array is not a separate allocation:
// m_dep is pointed at the bytes that follow this object, at an offset of
// padded_sizeof_derived( arg_sizeof_derived ). The array has room for
// arg_dependence_capacity entries, all cleared here.
Task::TaskMember( const function_dealloc_type  arg_dealloc
                , const function_single_type   arg_apply_single
                , const function_team_type     arg_apply_team
                , volatile int &               arg_active_count
                , const unsigned               arg_sizeof_derived
                , const unsigned               arg_dependence_capacity
                )
  : m_dealloc( arg_dealloc )
  , m_verify( & Task::verify_type<void> )
  , m_apply_single( arg_apply_single )
  , m_apply_team( arg_apply_team )
  , m_active_count( & arg_active_count )
  , m_qfeb(0)
  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
  , m_dep_capacity( arg_dependence_capacity )
  , m_dep_size( 0 )
  , m_ref_count( 0 )
  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
  // The FEB starts out empty; it is set to full when the task completes.
  qthread_empty( & m_qfeb );

  // Null out every dependence slot.
  Task ** const dep_end = m_dep + arg_dependence_capacity ;
  for ( Task ** slot = m_dep ; slot != dep_end ; ++slot ) {
    *slot = 0 ;
  }
}
int main(int argc, char *argv[]) { int n = 10; int m = 10; num_timesteps = 10; workload = 0; workload_per = 0; workload_var = 0; int print_final = 0; int alltime = 0; CHECK_VERBOSE(); NUMARG(n, "N"); NUMARG(m, "M"); NUMARG(num_timesteps, "TIMESTEPS"); NUMARG(workload, "WORKLOAD"); NUMARG(workload_per, "WORKLOAD_PER"); NUMARG(workload_var, "WORKLOAD_VAR"); NUMARG(print_final, "PRINT_FINAL"); NUMARG(alltime, "ALL_TIME"); assert (n > 0 && m > 0); // Initialize Qthreads assert(qthread_initialize() == 0); qtimer_t alloc_timer = qtimer_create(); qtimer_t init_timer = qtimer_create(); qtimer_t exec_timer = qtimer_create(); // Allocate memory for 3-stage stencil (with boundary padding) qtimer_start(alloc_timer); stencil_t points; points.N = n + 2; points.M = m + 2; for (int s = 0; s < NUM_STAGES; s++) { points.stage[s] = malloc(points.N*sizeof(aligned_t *)); assert(NULL != points.stage[s]); for (int i = 0; i < points.N; i++) { points.stage[s][i] = calloc(points.M, sizeof(aligned_t)); assert(NULL != points.stage[s][i]); } } qtimer_stop(alloc_timer); // Initialize first stage and set boundary conditions qtimer_start(init_timer); for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { qthread_writeF_const(&points.stage[0][i][j], 0); for (int s = 1; s < NUM_STAGES; s++) qthread_empty(&points.stage[s][i][j]); } } for (int i = 0; i < points.N; i++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][i][0], BOUNDARY); qthread_writeF_const(&points.stage[s][i][points.M-1], BOUNDARY); #else points.stage[s][i][0] = BOUNDARY; points.stage[s][i][points.M-1] = BOUNDARY; #endif } } for (int j = 0; j < points.M; j++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][0][j], BOUNDARY); qthread_writeF_const(&points.stage[s][points.N-1][j], BOUNDARY); #else points.stage[s][0][j] = BOUNDARY; points.stage[s][points.N-1][j] = BOUNDARY; #endif } } qtimer_stop(init_timer); // 
Create barrier to synchronize on completion of calculations qtimer_start(exec_timer); points.barrier = qt_feb_barrier_create(n*m+1); // Spawn tasks to start calculating updates at each point update_args_t args = {&points, -1, -1, 1, 1}; for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { args.i = i; args.j = j; qthread_fork_syncvar_copyargs(update, &args, sizeof(update_args_t), NULL); } } // Wait for calculations to finish qt_feb_barrier_enter(points.barrier); qtimer_stop(exec_timer); // Print timing info if (alltime) { fprintf(stderr, "Allocation time: %f\n", qtimer_secs(alloc_timer)); fprintf(stderr, "Initialization time: %f\n", qtimer_secs(init_timer)); fprintf(stderr, "Execution time: %f\n", qtimer_secs(exec_timer)); } else { fprintf(stdout, "%f\n", qtimer_secs(exec_timer)); } // Print stencils if (print_final) { size_t final = (num_timesteps % NUM_STAGES); iprintf("Stage %lu:\n", prev_stage(prev_stage(final))); print_stage(&points, prev_stage(prev_stage(final))); iprintf("\nStage %lu:\n", prev_stage(final)); print_stage(&points, prev_stage(final)); iprintf("\nStage %lu:\n", final); print_stage(&points, final); } qt_feb_barrier_destroy(points.barrier); qtimer_destroy(alloc_timer); qtimer_destroy(init_timer); qtimer_destroy(exec_timer); // Free allocated memory for (int i = 0; i < points.N; i++) { free(points.stage[0][i]); free(points.stage[1][i]); free(points.stage[2][i]); } free(points.stage[0]); free(points.stage[1]); free(points.stage[2]); return 0; }
inline int qthread_empty(const T *const dest) { QTHREAD_CHECKSIZE(T); return qthread_empty((aligned_t *)dest); }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); // If the operator did not attempt to set a stack size, force // a reasonable lower bound if (!getenv("QT_STACK_SIZE") && !getenv("QTHREAD_STACK_SIZE")) setenv("QT_STACK_SIZE", "32768", 0); assert(qthread_initialize() == 0); #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); aligned_t donecount = 0; root.dc = &donecount; qthread_empty(&donecount); aligned_t tot = 0; root.acc = &tot; root.expect = 1; qthread_fork_syncvar(visit, &root, NULL); qthread_readFF(NULL, root.dc); total_num_nodes = tot; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned 
long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #endif /* ifdef PRINT_STATS */ return 0; }