static aligned_t visit(void *arg_) { v_args_t *arg = (v_args_t *)arg_; if (arg->depth > 2) { /* I'm an internal node. */ v_args_t args = { arg->depth - 1, arg->sinc }; qt_sinc_expect(arg->sinc, 2); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qt_sinc_submit(arg->sinc, NULL); } else if (arg->depth == 2) { /* I'm going to spawn leaf nodes. */ v_args_t args = { arg->depth - 1, arg->sinc }; qt_sinc_expect(arg->sinc, 2); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qt_sinc_submit(arg->sinc, NULL); } else { /* I'm a leaf node. */ qt_sinc_submit(arg->sinc, NULL); } return 0; }
/*
 * Fork a qthread that runs fp(arg) through chapel_wrapper.
 *
 * fp           - function the new task runs
 * arg          - argument handed to fp
 * subLoc       - requested sublocale; must not be c_sublocid_curr
 *                (currently overridden to c_sublocid_any, see below)
 * id           - must be chpl_nullTaskID
 * serial_state - serial state stored in the new task's private data
 *
 * The wrapper arguments start from a copy of the calling task's private
 * data, with serial_state replaced by the value passed in.
 */
void chpl_task_startMovedTask(chpl_fn_p     fp,
                              void         *arg,
                              c_sublocid_t  subLoc,
                              chpl_taskID_t id,
                              chpl_bool     serial_state)
{
    assert(subLoc != c_sublocid_curr);
    assert(id == chpl_nullTaskID);

    chapel_wrapper_args_t task_args =
        {fp, arg, NULL, 0, *chpl_task_getPrivateData()};
    task_args.chpl_data.serial_state = serial_state;

    PROFILE_INCR(profile_task_startMovedTask,1);

#if 1
    // We are timing out when the subLoc is passed as 0 (zero).  Can
    // we not time share tasks on a single shepherd?  Perhaps we can
    // only time share as many tasks on a shepherd as that shepherd
    // has workers?  For now, force the subLoc to be "any".
    subLoc = c_sublocid_any;
#endif

    if (subLoc != c_sublocid_any) {
        /* Caller named a specific sublocale: fork onto that shepherd. */
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &task_args,
                                         sizeof(chapel_wrapper_args_t), NULL,
                                         (qthread_shepherd_id_t) subLoc);
        return;
    }

    /* No placement preference. */
    qthread_fork_syncvar_copyargs(chapel_wrapper, &task_args,
                                  sizeof(chapel_wrapper_args_t), NULL);
}
/*
 * Fork a qthread that runs chpl_ftable[fid](arg) through chapel_wrapper.
 *
 * fid              - index into chpl_ftable selecting the task function
 * arg              - argument handed to the task function
 * subLoc           - c_sublocid_any, c_sublocid_curr, or a sublocale id
 * task_list        - not used by this implementation
 * task_list_locale - not used by this implementation
 * is_begin_stmt    - not used by this implementation
 * lineno, filename - source position stored in the wrapper arguments
 *
 * If the calling task's serial_state is set, the new task is forked onto
 * the caller's shepherd and this function waits for its return value
 * before returning.  Otherwise the task is forked without waiting, either
 * unplaced (c_sublocid_any) or onto the requested shepherd.
 */
void chpl_task_addToTaskList(chpl_fn_int_t     fid,
                             void             *arg,
                             c_sublocid_t      subLoc,
                             chpl_task_list_p *task_list,
                             int32_t           task_list_locale,
                             chpl_bool         is_begin_stmt,
                             int               lineno,
                             chpl_string       filename)
{
    qthread_shepherd_id_t const cur_shep    = qthread_shep();
    chpl_task_private_data_t   *parent_data = chpl_task_getPrivateData();
    chpl_bool                   run_serial  = parent_data->serial_state;
    chapel_wrapper_args_t       task_args   =
        {chpl_ftable[fid], arg, filename, lineno, *parent_data};

    PROFILE_INCR(profile_task_addToTaskList,1);

    if (run_serial) {
        /* Fork on the current shepherd, then wait for the task's result. */
        syncvar_t done = SYNCVAR_STATIC_EMPTY_INITIALIZER;

        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &task_args,
                                         sizeof(chapel_wrapper_args_t), &done,
                                         cur_shep);
        qthread_syncvar_readFF(NULL, &done);
        return;
    }

    if (subLoc == c_sublocid_any) {
        /* No placement preference. */
        qthread_fork_syncvar_copyargs(chapel_wrapper, &task_args,
                                      sizeof(chapel_wrapper_args_t), NULL);
        return;
    }

    /* Explicit placement: "current" resolves to the caller's shepherd. */
    if (subLoc == c_sublocid_curr) {
        subLoc = (c_sublocid_t) cur_shep;
    }
    qthread_fork_syncvar_copyargs_to(chapel_wrapper, &task_args,
                                     sizeof(chapel_wrapper_args_t), NULL,
                                     (qthread_shepherd_id_t) subLoc);
}
// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static aligned_t visit(void *args_) { node_t *parent = (node_t *)args_; int parent_height = parent->height; int num_children = parent->num_children; aligned_t expect = parent->expect; aligned_t num_descendants[num_children]; aligned_t sum_descendants = 1; if (num_children != 0) { node_t child __attribute__((aligned(8))); aligned_t donec = 0; // Spawn children, if any child.height = parent_height + 1; child.dc = &donec; child.expect = num_children; qthread_empty(&donec); for (int i = 0; i < num_children; i++) { child.acc = &num_descendants[i]; for (int j = 0; j < num_samples; j++) { rng_spawn(parent->state.state, child.state.state, i); } child.num_children = calc_num_children(&child); qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL); } // Wait for children to finish up, accumulate descendants counts if (donec != expect) qthread_readFF(NULL, &donec); for (int i = 0; i < num_children; i++) { sum_descendants += num_descendants[i]; } } *parent->acc = sum_descendants; if (qthread_incr(parent->dc, 1) + 1 == expect) { qthread_fill(parent->dc); } return 0; }
int main(int argc, char *argv[]) { int n = 10; int m = 10; num_timesteps = 10; workload = 0; workload_per = 0; workload_var = 0; int print_final = 0; int alltime = 0; CHECK_VERBOSE(); NUMARG(n, "N"); NUMARG(m, "M"); NUMARG(num_timesteps, "TIMESTEPS"); NUMARG(workload, "WORKLOAD"); NUMARG(workload_per, "WORKLOAD_PER"); NUMARG(workload_var, "WORKLOAD_VAR"); NUMARG(print_final, "PRINT_FINAL"); NUMARG(alltime, "ALL_TIME"); assert (n > 0 && m > 0); // Initialize Qthreads assert(qthread_initialize() == 0); qtimer_t alloc_timer = qtimer_create(); qtimer_t init_timer = qtimer_create(); qtimer_t exec_timer = qtimer_create(); // Allocate memory for 3-stage stencil (with boundary padding) qtimer_start(alloc_timer); stencil_t points; points.N = n + 2; points.M = m + 2; for (int s = 0; s < NUM_STAGES; s++) { points.stage[s] = malloc(points.N*sizeof(aligned_t *)); assert(NULL != points.stage[s]); for (int i = 0; i < points.N; i++) { points.stage[s][i] = calloc(points.M, sizeof(aligned_t)); assert(NULL != points.stage[s][i]); } } qtimer_stop(alloc_timer); // Initialize first stage and set boundary conditions qtimer_start(init_timer); for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { qthread_writeF_const(&points.stage[0][i][j], 0); for (int s = 1; s < NUM_STAGES; s++) qthread_empty(&points.stage[s][i][j]); } } for (int i = 0; i < points.N; i++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][i][0], BOUNDARY); qthread_writeF_const(&points.stage[s][i][points.M-1], BOUNDARY); #else points.stage[s][i][0] = BOUNDARY; points.stage[s][i][points.M-1] = BOUNDARY; #endif } } for (int j = 0; j < points.M; j++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][0][j], BOUNDARY); qthread_writeF_const(&points.stage[s][points.N-1][j], BOUNDARY); #else points.stage[s][0][j] = BOUNDARY; points.stage[s][points.N-1][j] = BOUNDARY; #endif } } qtimer_stop(init_timer); // 
Create barrier to synchronize on completion of calculations qtimer_start(exec_timer); points.barrier = qt_feb_barrier_create(n*m+1); // Spawn tasks to start calculating updates at each point update_args_t args = {&points, -1, -1, 1, 1}; for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { args.i = i; args.j = j; qthread_fork_syncvar_copyargs(update, &args, sizeof(update_args_t), NULL); } } // Wait for calculations to finish qt_feb_barrier_enter(points.barrier); qtimer_stop(exec_timer); // Print timing info if (alltime) { fprintf(stderr, "Allocation time: %f\n", qtimer_secs(alloc_timer)); fprintf(stderr, "Initialization time: %f\n", qtimer_secs(init_timer)); fprintf(stderr, "Execution time: %f\n", qtimer_secs(exec_timer)); } else { fprintf(stdout, "%f\n", qtimer_secs(exec_timer)); } // Print stencils if (print_final) { size_t final = (num_timesteps % NUM_STAGES); iprintf("Stage %lu:\n", prev_stage(prev_stage(final))); print_stage(&points, prev_stage(prev_stage(final))); iprintf("\nStage %lu:\n", prev_stage(final)); print_stage(&points, prev_stage(final)); iprintf("\nStage %lu:\n", final); print_stage(&points, final); } qt_feb_barrier_destroy(points.barrier); qtimer_destroy(alloc_timer); qtimer_destroy(init_timer); qtimer_destroy(exec_timer); // Free allocated memory for (int i = 0; i < points.N; i++) { free(points.stage[0][i]); free(points.stage[1][i]); free(points.stage[2][i]); } free(points.stage[0]); free(points.stage[1]); free(points.stage[2]); return 0; }
// ////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { size_t depth = 3; assert(qthread_initialize() == 0); CHECK_VERBOSE(); NUMARG(depth, "TEST_DEPTH"); // Test creating an empty sinc { qt_sinc_t zero_sinc; qt_sinc_init(&zero_sinc, 0, NULL, NULL, 0); qt_sinc_wait(&zero_sinc, NULL); qt_sinc_fini(&zero_sinc); qt_sinc_t *three_sinc = qt_sinc_create(0, NULL, NULL, 0); qt_sinc_expect(three_sinc, 3); qthread_fork(submit_to_sinc, three_sinc, NULL); qthread_fork(submit_to_sinc, three_sinc, NULL); qthread_fork(submit_to_sinc, three_sinc, NULL); qt_sinc_wait(three_sinc, NULL); qt_sinc_destroy(three_sinc); } qt_sinc_t *sinc = qt_sinc_create(0, NULL, NULL, 2); // Spawn additional waits aligned_t rets[3]; { qthread_fork(wait_on_sinc, sinc, &rets[0]); qthread_fork(wait_on_sinc, sinc, &rets[1]); qthread_fork(wait_on_sinc, sinc, &rets[2]); } { v_args_t args = { depth, sinc }; // These two spawns covered by qt_sinc_create(...,2) qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); } qt_sinc_wait(sinc, NULL); for (int i = 0; i < 3; i++) qthread_readFF(NULL, &rets[i]); // Reset the sinc qt_sinc_reset(sinc, 2); // Second use { v_args_t args = { depth, sinc }; // These two spawns covered by qt_sinc_reset(...,2) qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); qthread_fork_syncvar_copyargs(visit, &args, sizeof(v_args_t), NULL); } qt_sinc_wait(sinc, NULL); qt_sinc_destroy(sinc); return 0; }