// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static long visit(node_t parent) { node_t child; uint64_t *child_descendants = calloc(sizeof(long), parent.num_children); CILK_C_REDUCER_OPADD(num_descendants, ulong, 0); uint64_t tmp; // Spawn children, if any for (int i = 0; i < parent.num_children; i++) { child.height = parent.height + 1; for (int j = 0; j < num_samples; j++) { rng_spawn(parent.state.state, child.state.state, i); } child.num_children = calc_num_children(&child); child_descendants[i] = _Cilk_spawn visit(child); } _Cilk_sync; CILK_C_REGISTER_REDUCER(num_descendants); _Cilk_for(int i = 0; i < parent.num_children; i++) { REDUCER_VIEW(num_descendants) += child_descendants[i]; } tmp = 1 + REDUCER_VIEW(num_descendants); CILK_C_UNREGISTER_REDUCER(num_descendants); return tmp; }
// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static long visit(node_t *parent, int num_children) { uint64_t num_descendants = 1; #ifdef BIG_STACKS uint64_t child_descendants[num_children]; node_t child_nodes[num_children]; #else uint64_t *child_descendants; node_t *child_nodes; if (num_children > 0) { child_descendants = calloc(sizeof(uint64_t), num_children); child_nodes = malloc(sizeof(node_t) * num_children); } #endif // Spawn children, if any for (int i = 0; i < num_children; i++) { node_t *child = &child_nodes[i]; child->height = parent->height + 1; for (int j = 0; j < num_samples; j++) { rng_spawn(parent->state.state, child->state.state, i); } child->num_children = calc_num_children(child); #pragma omp task untied firstprivate(i, child) shared(child_descendants) child_descendants[i] = visit(child, child->num_children); } #pragma omp taskwait // #pragma omp parallel for reduction(+:num_descendants) for (int i = 0; i < num_children; i++) { num_descendants += child_descendants[i]; } #ifndef BIG_STACKS if (num_children > 0) { free(child_descendants); free(child_nodes); } #endif return num_descendants; }
// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static aligned_t visit(void *args_) { node_t *parent = (node_t *)args_; int parent_height = parent->height; int num_children = parent->num_children; aligned_t expect = parent->expect; aligned_t num_descendants[num_children]; aligned_t sum_descendants = 1; if (num_children != 0) { node_t child __attribute__((aligned(8))); aligned_t donec = 0; // Spawn children, if any child.height = parent_height + 1; child.dc = &donec; child.expect = num_children; qthread_empty(&donec); for (int i = 0; i < num_children; i++) { child.acc = &num_descendants[i]; for (int j = 0; j < num_samples; j++) { rng_spawn(parent->state.state, child.state.state, i); } child.num_children = calc_num_children(&child); qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL); } // Wait for children to finish up, accumulate descendants counts if (donec != expect) qthread_readFF(NULL, &donec); for (int i = 0; i < num_children; i++) { sum_descendants += num_descendants[i]; } } *parent->acc = sum_descendants; if (qthread_incr(parent->dc, 1) + 1 == expect) { qthread_fill(parent->dc); } return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); #pragma omp parallel #pragma omp single #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); nodecount = 1; long retval; #pragma omp parallel #pragma omp single nowait #pragma omp task untied retval = visit(&root, root.num_children); total_num_nodes = retval; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / omp_get_num_threads()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / omp_get_num_threads()); #endif /* ifdef PRINT_STATS */ return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned long tmp = 0; NUMARG(tmp, "UTS_TREE_TYPE"); tree_type = (tree_t)tmp; } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); { unsigned long tmp = 0; NUMARG(tmp, "UTS_SHAPE_FN"); shape_fn = (shape_t)tmp; } NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); nodecount = 1; long retval; { retval = _Cilk_spawn visit(root); _Cilk_sync; } total_num_nodes = retval; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS LOG_UTS_RESULTS_YAML(total_num_nodes, total_time) LOG_ENV_CILK_YAML() #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / __cilkrts_get_nworkers()); #endif /* ifdef PRINT_STATS */ return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); // If the operator did not attempt to set a stack size, force // a reasonable lower bound if (!getenv("QT_STACK_SIZE") && !getenv("QTHREAD_STACK_SIZE")) setenv("QT_STACK_SIZE", "32768", 0); assert(qthread_initialize() == 0); #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); aligned_t donecount = 0; root.dc = &donecount; qthread_empty(&donecount); aligned_t tot = 0; root.acc = &tot; root.expect = 1; qthread_fork_syncvar(visit, &root, NULL); qthread_readFF(NULL, root.dc); total_num_nodes = tot; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #endif /* ifdef PRINT_STATS */ return 0; }