void rt1_launcher(void *arg) { int idx = (int)(intptr_t)arg; ABT_thread cur_thread; ABT_pool cur_pool; ABT_sched_config config; ABT_sched sched; size_t size; double t_start, t_end; ABT_sched_config_var cv_event_freq = { .idx = 0, .type = ABT_SCHED_CONFIG_INT }; ABT_sched_config_var cv_idx = { .idx = 1, .type = ABT_SCHED_CONFIG_INT }; ABT_sched_def sched_def = { .type = ABT_SCHED_TYPE_ULT, .init = sched_init, .run = sched_run, .free = sched_free, .get_migr_pool = NULL }; /* Create a scheduler */ ABT_sched_config_create(&config, cv_event_freq, 10, cv_idx, idx, ABT_sched_config_var_end); ABT_sched_create(&sched_def, 1, &rt1_data->pool, config, &sched); /* Push the scheduler to the current pool */ ABT_thread_self(&cur_thread); ABT_thread_get_last_pool(cur_thread, &cur_pool); ABT_pool_add_sched(cur_pool, sched); /* Free */ ABT_sched_config_free(&config); t_start = ABT_get_wtime(); while (1) { rt1_app(idx); ABT_pool_get_total_size(cur_pool, &size); if (size == 0) { ABT_sched_free(&sched); int rank; ABT_xstream_self_rank(&rank); printf("ES%d: finished\n", rank); ABT_mutex_lock(rt1_data->mutex); rt1_data->xstreams[rank] = ABT_XSTREAM_NULL; rt1_data->num_xstreams--; ABT_mutex_unlock(rt1_data->mutex); break; } t_end = ABT_get_wtime(); if ((t_end - t_start) > g_timeout) { ABT_sched_finish(sched); } } } static void rt1_app(int eid) { int i, num_comps; size_t size; ABT_thread cur_thread; ABT_pool cur_pool; ABT_thread_self(&cur_thread); ABT_thread_get_last_pool(cur_thread, &cur_pool); if (eid == 0) ABT_event_prof_start(); num_comps = rt1_data->num_comps; for (i = 0; i < num_comps * 2; i += 2) { ABT_thread_create(rt1_data->pool, rt1_app_compute, (void *)(intptr_t)(eid * num_comps * 2 + i), ABT_THREAD_ATTR_NULL, NULL); ABT_task_create(rt1_data->pool, rt1_app_compute, (void *)(intptr_t)(eid * num_comps * 2 + i + 1), NULL); } do { ABT_thread_yield(); /* If the size of cur_pool is zero, it means the stacked scheduler has * been terminated because of the shrinking event. */ ABT_pool_get_total_size(cur_pool, &size); if (size == 0) break; ABT_pool_get_total_size(rt1_data->pool, &size); } while (size > 0); if (eid == 0) { ABT_event_prof_stop(); int cnt = __atomic_exchange_n(&rt1_data->cnt, 0, __ATOMIC_SEQ_CST); double local_work = (double)(cnt * rt1_data->num_iters); ABT_event_prof_publish("ops", local_work, local_work); } } static void rt1_app_compute(void *arg) { int pos = (int)(intptr_t)arg; int i; rt1_data->app_data[pos] = 0; for (i = 0; i < rt1_data->num_iters; i++) { rt1_data->app_data[pos] += sin((double)pos); } __atomic_fetch_add(&rt1_data->cnt, 1, __ATOMIC_SEQ_CST); }
int main(int argc, char *argv[]) { int i, j, r; int num_xstreams; char *str, *endptr; ABT_xstream *xstreams; ABT_thread *threads; vector_scal_task_args_t *args; int inner_xstreams; double *time, avg_time = 0.0; num_xstreams = (argc > 1) ? atoi(argv[1]) : NUM_XSTREAMS; inner_xstreams = (argc > 2) ? atoi(argv[2]) : NUM_XSTREAMS; int rep = (argc > 3) ? atoi(argv[3]) : NUM_REPS; time = (double *)malloc(sizeof(double) * rep); init(); g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_xstreams); args = (vector_scal_task_args_t *)malloc(sizeof(vector_scal_task_args_t) * num_xstreams); /* initialization */ ABT_init(argc, argv); for (i = 0; i < num_xstreams; i++) { ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, &g_pools[i]); } /* ES creation */ ABT_xstream_self(&xstreams[0]); ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, 1, &g_pools[0]); for (i = 1; i < num_xstreams; i++) { ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], ABT_SCHED_CONFIG_NULL, &xstreams[i]); ABT_xstream_start(xstreams[i]); } /* Each task is created on the xstream which is going to execute it */ for (r = 0; r < rep; r++) { time[r] = ABT_get_wtime(); int bloc = NUM / (num_xstreams); int rest = NUM % (num_xstreams); int start = 0; int end = 0; for (j = 0; j < num_xstreams; j++) { start = end; int inc = (j < rest) ? 1 : 0; end += bloc + inc; args[j].start = start; args[j].end = end; args[j].it = NUM; args[j].nxstreams = inner_xstreams; if (j > 0) { ABT_thread_create(g_pools[j], vector_scal_launch, (void *)&args[j], ABT_THREAD_ATTR_NULL, &threads[j]); } } vector_scal_launch((void *)&args[0]); for (j = 1; j < num_xstreams; j++) { ABT_thread_free(&threads[j]); } time[r] = ABT_get_wtime() - time[r]; avg_time += time[r]; } avg_time /= rep; printf("%d %d %f\n", num_xstreams, inner_xstreams, avg_time); check(); for (i = 1; i < num_xstreams; i++) { ABT_xstream_join(xstreams[i]); ABT_xstream_free(&xstreams[i]); } ABT_finalize(); free(g_pools); free(xstreams); free(threads); free(args); free(time); return EXIT_SUCCESS; }
/* Create a work-stealing scheduler and push it to the pool */ static void thread_add_sched(void *arg) { int idx = (int)(intptr_t)arg; int i; ABT_thread cur_thread; ABT_pool cur_pool; ABT_pool *my_pools; ABT_sched_config config; ABT_sched sched; size_t size; double t_start, t_end; ABT_sched_config_var cv_event_freq = { .idx = 0, .type = ABT_SCHED_CONFIG_INT }; ABT_sched_config_var cv_idx = { .idx = 1, .type = ABT_SCHED_CONFIG_INT }; ABT_sched_def sched_def = { .type = ABT_SCHED_TYPE_ULT, .init = sched_init, .run = sched_run, .free = sched_free, .get_migr_pool = NULL }; /* Create a scheduler */ ABT_sched_config_create(&config, cv_event_freq, 10, cv_idx, idx, ABT_sched_config_var_end); my_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * max_xstreams); for (i = 0; i < max_xstreams; i++) { my_pools[i] = g_pools[(idx + i) % max_xstreams]; } ABT_sched_create(&sched_def, max_xstreams, my_pools, config, &sched); /* Create a ULT for the new scheduler */ ABT_thread_create(my_pools[0], thread_work, arg, ABT_THREAD_ATTR_NULL, NULL); /* Push the scheduler to the current pool */ ABT_thread_self(&cur_thread); ABT_thread_get_last_pool(cur_thread, &cur_pool); ABT_pool_add_sched(cur_pool, sched); /* Free */ ABT_thread_release(cur_thread); ABT_sched_config_free(&config); free(my_pools); t_start = ABT_get_wtime(); while (1) { ABT_thread_yield(); ABT_pool_get_total_size(cur_pool, &size); if (size == 0) { ABT_sched_free(&sched); break; } t_end = ABT_get_wtime(); if ((t_end - t_start) > g_timeout) { ABT_sched_finish(sched); } } } static void thread_work(void *arg) { int idx = (int)(intptr_t)arg; int i; ABT_thread cur_thread; ABT_pool cur_pool; ABT_thread *threads; int num_threads; double t_start, t_end; ABT_thread_self(&cur_thread); ABT_thread_get_last_pool(cur_thread, &cur_pool); ABT_thread_release(cur_thread); t_start = ABT_get_wtime(); while (1) { num_threads = 2; threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_threads); for (i = 0; i < num_threads; i++) { ABT_thread_create(cur_pool, thread_hello, NULL, ABT_THREAD_ATTR_NULL, &threads[i]); } for (i = 0; i < num_threads; i++) { ABT_thread_free(&threads[i]); } free(threads); if (g_signal[idx]) { ABT_xstream xstream; ABT_xstream_self(&xstream); ABT_xstream_cancel(xstream); g_signal[idx] = 0; break; } t_end = ABT_get_wtime(); if ((t_end - t_start) > g_timeout) { break; } } } static void test_printf(const char *format, ...) { #if 0 va_start(list, format); vprintf(format, list); va_end(list); fflush(stdout); #endif }
int main(int argc, char *argv[]) { ABT_pool (*all_pools)[2]; ABT_sched *scheds; ABT_thread *top_threads; size_t i, t; uint64_t t_start; /* initialize */ ABT_test_init(argc, argv); for (i = 0; i < T_LAST; i++) { t_times[i] = 0; } /* read command-line arguments */ num_xstreams = ABT_test_get_arg_val(ABT_TEST_ARG_N_ES); num_threads = ABT_test_get_arg_val(ABT_TEST_ARG_N_ULT); iter = ABT_test_get_arg_val(ABT_TEST_ARG_N_ITER); g_xstreams = (ABT_xstream *)malloc(num_xstreams * sizeof(ABT_xstream)); g_pools = (ABT_pool *)malloc(num_xstreams * sizeof(ABT_pool)); g_threads = (ABT_thread **)malloc(num_xstreams * sizeof(ABT_thread *)); for (i = 0; i < num_xstreams; i++) { g_threads[i] = (ABT_thread *)malloc(num_threads * sizeof(ABT_thread)); } all_pools = (ABT_pool (*)[2])malloc(num_xstreams * sizeof(ABT_pool) * 2); scheds = (ABT_sched *)malloc(num_xstreams * sizeof(ABT_sched)); top_threads = (ABT_thread *)malloc(num_xstreams * sizeof(ABT_thread)); /* create pools and schedulers */ for (i = 0; i < num_xstreams; i++) { ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPSC, ABT_TRUE, &all_pools[i][0]); ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_PRIV, ABT_TRUE, &all_pools[i][1]); g_pools[i] = all_pools[i][1]; ABT_sched_create_basic(ABT_SCHED_DEFAULT, 2, all_pools[i], ABT_SCHED_CONFIG_NULL, &scheds[i]); } /* create ESs */ ABT_xstream_self(&g_xstreams[0]); ABT_xstream_set_main_sched(g_xstreams[0], scheds[0]); for (i = 1; i < num_xstreams; i++) { ABT_xstream_create(scheds[i], &g_xstreams[i]); } /* benchmarking */ for (t = 0; t < T_LAST; t++) { void (*test_fn)(void *); if (t == T_YIELD) { if (t_times[T_YIELD_ALL] > t_times[T_YIELD_OVERHEAD]) { t_times[t] = t_times[T_YIELD_ALL] - t_times[T_YIELD_OVERHEAD]; } else { t_times[t] = 0; } continue; } else if (t == T_YIELD_TO) { if (t_times[T_YIELD_TO_ALL] > t_times[T_YIELD_TO_OVERHEAD]) { t_times[t] = t_times[T_YIELD_TO_ALL] - t_times[T_YIELD_TO_OVERHEAD]; } else { t_times[t] = 0; } continue; } switch (t) { case T_CREATE_JOIN: test_fn = test_create_join; break; case T_CREATE_UNNAMED: test_fn = test_create_unnamed; break; case T_YIELD_OVERHEAD: test_fn = test_yield_overhead; break; case T_YIELD_ALL: test_fn = test_yield; break; case T_YIELD_TO_OVERHEAD: test_fn = test_yield_to_overhead; break; case T_YIELD_TO_ALL: test_fn = test_yield_to; break; #ifdef TEST_MIGRATE_TO case T_MIGRATE_TO_XSTREAM: test_fn = test_migrate_to_xstream; break; #endif default: assert(0); } /* warm-up */ for (i = 0; i < num_xstreams; i++) { ABT_thread_create(all_pools[i][0], test_fn, (void *)i, ABT_THREAD_ATTR_NULL, &top_threads[i]); } for (i = 0; i < num_xstreams; i++) { ABT_thread_free(&top_threads[i]); } /* measurement */ #ifdef USE_TIME t_start = ABT_get_wtime(); #else t_start = ABT_test_get_cycles(); #endif for (i = 0; i < num_xstreams; i++) { ABT_thread_create(all_pools[i][0], test_fn, (void *)i, ABT_THREAD_ATTR_NULL, &top_threads[i]); } for (i = 0; i < num_xstreams; i++) { ABT_thread_free(&top_threads[i]); } #ifdef USE_TIME t_times[t] = ABT_get_wtime() - t_start; #else t_times[t] = ABT_test_get_cycles() - t_start; #endif } /* join and free */ for (i = 1; i < num_xstreams; i++) { ABT_xstream_join(g_xstreams[i]); ABT_xstream_free(&g_xstreams[i]); } /* finalize */ ABT_test_finalize(0); /* compute the execution time for one iteration */ for (i = 0; i < T_LAST; i++) { t_times[i] = t_times[i] / iter / num_threads; } /* output */ int line_size = 56; ABT_test_print_line(stdout, '-', line_size); printf("%s\n", "Argobots"); ABT_test_print_line(stdout, '-', line_size); printf("# of ESs : %d\n", num_xstreams); printf("# of ULTs per ES: %d\n", num_threads); ABT_test_print_line(stdout, '-', line_size); printf("Avg. execution time (in seconds, %d times)\n", iter); ABT_test_print_line(stdout, '-', line_size); printf("%-20s %-s\n", "operation", "time"); ABT_test_print_line(stdout, '-', line_size); for (i = 0; i < T_LAST; i++) { #ifdef USE_TIME printf("%-19s %.9lf\n", t_names[i], t_times[i]); #else printf("%-19s %11" PRIu64 "\n", t_names[i], t_times[i]); #endif } ABT_test_print_line(stdout, '-', line_size); free(g_xstreams); free(g_pools); for (i = 0; i < num_xstreams; i++) { free(g_threads[i]); } free(g_threads); free(all_pools); free(scheds); free(top_threads); return EXIT_SUCCESS; }