static inline void perform_local_work(void) { # ifdef TIME_WORKLOAD qtimer_t work_timer = qtimer_create(); qtimer_start(work_timer); # endif // TIME_WORKLOAD volatile unsigned long work = workload; long rand_per = (long)qtimer_fastrand(); long rand_var = (long)qtimer_fastrand(); rand_per = (rand_per<0) ? (-rand_per)%100 : rand_per%100; if (rand_per < workload_per) { rand_var = (rand_var<0) ? (-rand_var)%100 : rand_var%100; work += (workload * (workload_var * 0.01)) * (rand_var * 0.01); } for (int i = 0; i < work; i++) { work = work % 1000000000; } work++; # ifdef TIME_WORKLOAD qtimer_stop(work_timer); fprintf(stdout, "Worked for %f\n", qtimer_secs(work_timer)); qtimer_destroy(work_timer); # endif // TIME_WORKLOAD }
/*
 * Repeatedly yield the calling qthread until the reading of a locally created
 * qtimer is no longer below 1.  The timer is started once and stopped after
 * every yield to take a new reading.
 *
 * The parameter `i` is not used by the body.
 */
void test_print_qthread(size_t i)
{
    qtimer_t elapsed = qtimer_create();

    qtimer_start(elapsed);
    for (;;) {
        qthread_yield();
        qtimer_stop(elapsed);
        if (!(qtimer_secs(elapsed) < 1)) {
            break;
        }
    }
    qtimer_destroy(elapsed);
}
/*
 * OpenMP task-spawn timing driver.
 *
 * Reads MT_COUNT (number of null tasks, must be non-zero) and MT_PAR_FORK
 * through NUMARG, spawns `count` untied tasks that each call null_task(NULL),
 * and prints "<threads> <count> <seconds>" on stdout.
 *
 * With MT_PAR_FORK set, the tasks are created from a nested `parallel for`;
 * otherwise a single untied task runs the loop that creates them.
 */
int main(int argc, char *argv[])
{
    double        total_time = 0.0;
    unsigned long threads    = 1;
    int           par_fork   = 0;
    uint64_t      count      = 1048576;
    qtimer_t      timer;

    CHECK_VERBOSE();

    NUMARG(count, "MT_COUNT");
    NUMARG(par_fork, "MT_PAR_FORK");
    assert(0 != count);

#pragma omp parallel
#pragma omp single
    {
        timer   = qtimer_create();
        threads = omp_get_num_threads();

        /* The timer starts immediately before either spawning strategy. */
        qtimer_start(timer);
        if (!par_fork) {
            /* One task runs the loop that generates all the null tasks. */
#pragma omp task untied
            for (uint64_t n = 0; n < count; n++) {
#pragma omp task untied
                null_task(NULL);
            }
        } else {
            /* The generating loop itself is split across a nested team. */
#pragma omp parallel for
            for (uint64_t n = 0; n < count; n++) {
#pragma omp task untied
                null_task(NULL);
            }
        }

        /*
         * NOTE(review): taskwait waits only for child tasks of the current
         * task; in the non-par_fork path the null tasks are children of the
         * spawning task, so the timer may stop before they finish -- confirm
         * whether that is intended.
         */
#pragma omp taskwait
        qtimer_stop(timer);
    }

    total_time = qtimer_secs(timer);
    qtimer_destroy(timer);

    printf("%lu %lu %f\n", threads, (unsigned long)count, total_time);

    return 0;
}
/*
 * Exercises the qtimer API and then runs the fastrand checks.
 *
 * A timer is created, started and stopped twice; the reported durations are
 * checked for being non-negative and are printed through iprintf.  Afterwards
 * ks_test(), runs() and autocorrelation() are called (the "fastrand" tests,
 * per the comment below) and the runtime is finalized.
 *
 * Returns 0 on completion; failed checks abort via assert().
 */
int main(int argc, char *argv[])
{
    qtimer_t t;

    /*
     * Initialize outside of assert(): an expression inside assert() is not
     * evaluated at all when NDEBUG is defined, which would skip the
     * initialization entirely.
     */
    const int init_status = qthread_initialize();
    assert(init_status == QTHREAD_SUCCESS);
    (void)init_status; /* unused when assertions are compiled out */

    CHECK_VERBOSE();

    t = qtimer_create();
    assert(t);

    /* First measurement: report suspicious (zero or negative) readings. */
    qtimer_start(t);
    qtimer_stop(t);
    if (qtimer_secs(t) == 0) {
        fprintf(stderr, "qtimer_secs(t) reported zero length time.\n");
    } else if (qtimer_secs(t) < 0) {
        fprintf(stderr, "qtimer_secs(t) thinks time went backwards (%g).\n",
                qtimer_secs(t));
    }
    iprintf("time to find self and assert it: %g secs\n", qtimer_secs(t));

    /* Second measurement: back-to-back start/stop. */
    qtimer_start(t);
    qtimer_stop(t);
    assert(qtimer_secs(t) >= 0.0);
    if (qtimer_secs(t) == 0.0) {
        iprintf("inlining reduces calltime to zero (apparently)\n");
    } else {
        iprintf("smallest measurable time: %g secs\n", qtimer_secs(t));
    }

    qtimer_destroy(t);

    // Now to test fastrand
    ks_test();
    runs();
    autocorrelation();

    qthread_finalize();

    return 0;
}
int main(int argc, char *argv[]) { int n = 10; int m = 10; num_timesteps = 10; workload = 0; workload_per = 0; workload_var = 0; int print_final = 0; int alltime = 0; CHECK_VERBOSE(); NUMARG(n, "N"); NUMARG(m, "M"); NUMARG(num_timesteps, "TIMESTEPS"); NUMARG(workload, "WORKLOAD"); NUMARG(workload_per, "WORKLOAD_PER"); NUMARG(workload_var, "WORKLOAD_VAR"); NUMARG(print_final, "PRINT_FINAL"); NUMARG(alltime, "ALL_TIME"); assert (n > 0 && m > 0); // Initialize Qthreads assert(qthread_initialize() == 0); qtimer_t alloc_timer = qtimer_create(); qtimer_t init_timer = qtimer_create(); qtimer_t exec_timer = qtimer_create(); // Allocate memory for 3-stage stencil (with boundary padding) qtimer_start(alloc_timer); stencil_t points; points.N = n + 2; points.M = m + 2; for (int s = 0; s < NUM_STAGES; s++) { points.stage[s] = malloc(points.N*sizeof(aligned_t *)); assert(NULL != points.stage[s]); for (int i = 0; i < points.N; i++) { points.stage[s][i] = calloc(points.M, sizeof(aligned_t)); assert(NULL != points.stage[s][i]); } } qtimer_stop(alloc_timer); // Initialize first stage and set boundary conditions qtimer_start(init_timer); for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { qthread_writeF_const(&points.stage[0][i][j], 0); for (int s = 1; s < NUM_STAGES; s++) qthread_empty(&points.stage[s][i][j]); } } for (int i = 0; i < points.N; i++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][i][0], BOUNDARY); qthread_writeF_const(&points.stage[s][i][points.M-1], BOUNDARY); #else points.stage[s][i][0] = BOUNDARY; points.stage[s][i][points.M-1] = BOUNDARY; #endif } } for (int j = 0; j < points.M; j++) { for (int s = 0; s < NUM_STAGES; s++) { #ifdef BOUNDARY_SYNC qthread_writeF_const(&points.stage[s][0][j], BOUNDARY); qthread_writeF_const(&points.stage[s][points.N-1][j], BOUNDARY); #else points.stage[s][0][j] = BOUNDARY; points.stage[s][points.N-1][j] = BOUNDARY; #endif } } qtimer_stop(init_timer); // 
Create barrier to synchronize on completion of calculations qtimer_start(exec_timer); points.barrier = qt_feb_barrier_create(n*m+1); // Spawn tasks to start calculating updates at each point update_args_t args = {&points, -1, -1, 1, 1}; for (int i = 1; i < points.N-1; i++) { for (int j = 1; j < points.M-1; j++) { args.i = i; args.j = j; qthread_fork_syncvar_copyargs(update, &args, sizeof(update_args_t), NULL); } } // Wait for calculations to finish qt_feb_barrier_enter(points.barrier); qtimer_stop(exec_timer); // Print timing info if (alltime) { fprintf(stderr, "Allocation time: %f\n", qtimer_secs(alloc_timer)); fprintf(stderr, "Initialization time: %f\n", qtimer_secs(init_timer)); fprintf(stderr, "Execution time: %f\n", qtimer_secs(exec_timer)); } else { fprintf(stdout, "%f\n", qtimer_secs(exec_timer)); } // Print stencils if (print_final) { size_t final = (num_timesteps % NUM_STAGES); iprintf("Stage %lu:\n", prev_stage(prev_stage(final))); print_stage(&points, prev_stage(prev_stage(final))); iprintf("\nStage %lu:\n", prev_stage(final)); print_stage(&points, prev_stage(final)); iprintf("\nStage %lu:\n", final); print_stage(&points, final); } qt_feb_barrier_destroy(points.barrier); qtimer_destroy(alloc_timer); qtimer_destroy(init_timer); qtimer_destroy(exec_timer); // Free allocated memory for (int i = 0; i < points.N; i++) { free(points.stage[0][i]); free(points.stage[1][i]); free(points.stage[2][i]); } free(points.stage[0]); free(points.stage[1]); free(points.stage[2]); return 0; }
int main(int argc, char *argv[]) { aligned_t *ui_array, *ui_array2; double *d_array, *d_array2; size_t len = 1000000; qtimer_t timer = qtimer_create(); double cumulative_time_qutil = 0.0; double cumulative_time_libc = 0.0; int using_doubles = 0; unsigned long iterations = 10; qthread_initialize(); CHECK_VERBOSE(); printf("%i threads\n", (int)qthread_num_workers()); NUMARG(len, "TEST_LEN"); NUMARG(iterations, "TEST_ITERATIONS"); NUMARG(using_doubles, "TEST_USING_DOUBLES"); printf("using %s\n", using_doubles ? "doubles" : "aligned_ts"); if (using_doubles) { d_array = calloc(len, sizeof(double)); printf("array is %s\n", human_readable(len * sizeof(double))); assert(d_array); // madvise(d_array,len*sizeof(double), MADV_SEQUENTIAL); for (unsigned int i = 0; i < len; i++) { d_array[i] = ((double)random()) / ((double)RAND_MAX) + random(); } d_array2 = calloc(len, sizeof(double)); assert(d_array2); // madvise(d_array2,len*sizeof(double), MADV_RANDOM); iprintf("double array generated...\n"); for (unsigned int i = 0; i < iterations; i++) { memcpy(d_array2, d_array, len * sizeof(double)); qtimer_start(timer); qutil_qsort(d_array2, len); qtimer_stop(timer); cumulative_time_qutil += qtimer_secs(timer); iprintf("\t%u: sorting %lu doubles with qutil took: %f seconds\n", i, (unsigned long)len, qtimer_secs(timer)); } cumulative_time_qutil /= (double)iterations; printf("sorting %lu doubles with qutil took: %f seconds (avg)\n", (unsigned long)len, cumulative_time_qutil); for (unsigned int i = 0; i < iterations; i++) { memcpy(d_array2, d_array, len * sizeof(double)); qtimer_start(timer); qsort(d_array2, len, sizeof(double), dcmp); qtimer_stop(timer); cumulative_time_libc += qtimer_secs(timer); iprintf("\t%u: sorting %lu doubles with libc took: %f seconds\n", i, (unsigned long)len, qtimer_secs(timer)); } cumulative_time_libc /= (double)iterations; printf("sorting %lu doubles with libc took: %f seconds\n", (unsigned long)len, cumulative_time_libc); free(d_array); free(d_array2); } else 
{ ui_array = calloc(len, sizeof(aligned_t)); printf("array is %s\n", human_readable(len * sizeof(aligned_t))); for (unsigned int i = 0; i < len; i++) { ui_array[i] = random(); } ui_array2 = calloc(len, sizeof(aligned_t)); iprintf("ui_array generated...\n"); for (int i = 0; i < iterations; i++) { memcpy(ui_array2, ui_array, len * sizeof(aligned_t)); qtimer_start(timer); qutil_aligned_qsort(ui_array2, len); qtimer_stop(timer); cumulative_time_qutil += qtimer_secs(timer); } cumulative_time_qutil /= (double)iterations; printf("sorting %lu aligned_ts with qutil took: %f seconds\n", (unsigned long)len, cumulative_time_qutil); for (int i = 0; i < iterations; i++) { memcpy(ui_array2, ui_array, len * sizeof(aligned_t)); qtimer_start(timer); qsort(ui_array2, len, sizeof(double), acmp); qtimer_stop(timer); cumulative_time_libc += qtimer_secs(timer); } cumulative_time_libc /= (double)iterations; printf("sorting %lu aligned_ts with libc took: %f seconds (avg)\n", (unsigned long)len, cumulative_time_libc); free(ui_array); free(ui_array2); } if (cumulative_time_qutil < cumulative_time_libc) { printf("qutil with %lu threads provides a %0.2fx speedup.\n", (unsigned long)qthread_num_shepherds(), cumulative_time_libc/cumulative_time_qutil); } else { printf("qutil with %lu threads provides a %0.2fx slowdown.\n", (unsigned long)qthread_num_shepherds(), cumulative_time_libc/cumulative_time_qutil); } qtimer_destroy(timer); return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); #pragma omp parallel #pragma omp single #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); nodecount = 1; long retval; #pragma omp parallel #pragma omp single nowait #pragma omp task untied retval = visit(&root, root.num_children); total_num_nodes = retval; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / omp_get_num_threads()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, 
total_num_nodes / total_time / omp_get_num_threads()); #endif /* ifdef PRINT_STATS */ return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned long tmp = 0; NUMARG(tmp, "UTS_TREE_TYPE"); tree_type = (tree_t)tmp; } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); { unsigned long tmp = 0; NUMARG(tmp, "UTS_SHAPE_FN"); shape_fn = (shape_t)tmp; } NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); nodecount = 1; long retval; { retval = _Cilk_spawn visit(root); _Cilk_sync; } total_num_nodes = retval; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS LOG_UTS_RESULTS_YAML(total_num_nodes, total_time) LOG_ENV_CILK_YAML() #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / __cilkrts_get_nworkers()); #endif /* ifdef PRINT_STATS */ return 0; }
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); // If the operator did not attempt to set a stack size, force // a reasonable lower bound if (!getenv("QT_STACK_SIZE") && !getenv("QTHREAD_STACK_SIZE")) setenv("QT_STACK_SIZE", "32768", 0); assert(qthread_initialize() == 0); #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); aligned_t donecount = 0; root.dc = &donecount; qthread_empty(&donecount); aligned_t tot = 0; root.acc = &tot; root.expect = 1; qthread_fork_syncvar(visit, &root, NULL); qthread_readFF(NULL, root.dc); total_num_nodes = tot; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned 
long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / qthread_num_workers()); #endif /* ifdef PRINT_STATS */ return 0; }