/*
 * Signal the workers of a pool that look idle.
 *
 * A worker thread can go to sleep after the first signal was sent to it,
 * so the pool re-sends a signal to any worker without recent activity.
 * The sweep runs at most once per `minimum_interval` seconds, measured
 * against pool->last_signal_check.
 */
void thorium_worker_pool_wake_up_workers(struct thorium_worker_pool *pool)
{
    struct thorium_worker *candidate;
    time_t now;
    float epoch_load;
    int seconds_since_report;
    int minimum_interval;
    int index;

    minimum_interval = 1;
    now = time(NULL);

    /* The previous sweep is too recent: nothing to do yet. */
    if (now - pool->last_signal_check < minimum_interval) {
        return;
    }

    for (index = 0; index < pool->worker_count; ++index) {
        candidate = thorium_worker_pool_get_worker(pool, index);
        epoch_load = thorium_worker_get_epoch_load(candidate);
        seconds_since_report = now - thorium_worker_get_last_report_time(candidate);

        /*
         * Wake up the worker (for instance, worker/8) so that it pulls
         * something: either its epoch load is low or it has not reported
         * for at least one second.
         */
        if (epoch_load < 0.1 || seconds_since_report >= 1) {
            thorium_worker_signal(candidate);
        }
    }

    pool->last_signal_check = now;
}
/*
 * Compute the arithmetic mean of the epoch loads of all the workers
 * in the pool.
 *
 * Returns 0 when the pool reports no worker, instead of dividing
 * zero by zero.
 */
float thorium_worker_pool_get_current_load(struct thorium_worker_pool *pool)
{
    float load;
    int workers;
    int i;

    workers = thorium_worker_pool_worker_count(pool);

    /* An empty pool has no load; avoid the 0/0 division below. */
    if (workers <= 0) {
        return 0;
    }

    load = 0;

    for (i = 0; i < workers; ++i) {
        load += thorium_worker_get_epoch_load(thorium_worker_pool_get_worker(pool, i));
    }

    load /= workers;

    return load;
}
void thorium_balancer_balance(struct thorium_balancer *self) { /* * The 95th percentile is useful: * \see http://en.wikipedia.org/wiki/Burstable_billing * \see http://www.init7.net/en/backbone/95-percent-rule */ int load_percentile_50; struct core_timer timer; int i; struct core_vector loads; struct core_vector loads_unsorted; struct core_vector burdened_workers; struct core_vector stalled_workers; struct thorium_worker *worker; struct thorium_node *node; /*struct core_set *set;*/ struct core_pair pair; struct core_vector_iterator vector_iterator; int old_worker; int actor_name; int messages; int maximum; int with_maximum; struct core_map *set; struct core_map_iterator set_iterator; int stalled_index; int stalled_count; int new_worker_index; struct core_vector migrations; struct thorium_migration migration; struct thorium_migration *migration_to_do; struct thorium_actor *actor; int candidates; int load_value; int remaining_load; int projected_load; struct core_vector actors_to_migrate; int total; int with_messages; int stalled_percentile; int burdened_percentile; int old_total; int old_load; int new_load; int predicted_new_load; struct core_pair *pair_pointer; struct thorium_worker *new_worker; /*int new_total;*/ int actor_load; int test_stalled_index; int tests; int found_match; int spawned_actors; int killed_actors; int perfect; #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING struct core_map symmetric_actor_scripts; int script; #endif node = thorium_worker_pool_get_node(self->pool); spawned_actors = thorium_node_get_counter(node, CORE_COUNTER_SPAWNED_ACTORS); /* There is nothing to balance... */ if (spawned_actors == 0) { return; } killed_actors = thorium_node_get_counter(node, CORE_COUNTER_KILLED_ACTORS); /* * The system can probably not be balanced to get in * a better shape anyway. 
*/ if (spawned_actors == self->last_spawned_actors && killed_actors == self->last_killed_actors && self->last_migrations == 0) { printf("SCHEDULER: balance can not be improved because nothing changed.\n"); return; } /* Check if we have perfection */ perfect = 1; for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = thorium_worker_get_epoch_load(worker) * 100; if (load_value != 100) { perfect = 0; break; } } if (perfect) { printf("SCHEDULER: perfect balance can not be improved.\n"); return; } /* update counters */ self->last_spawned_actors = spawned_actors; self->last_killed_actors = killed_actors; /* Otherwise, try to balance things */ core_timer_init(&timer); core_timer_start(&timer); #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING core_map_init(&symmetric_actor_scripts, sizeof(int), sizeof(int)); thorium_balancer_detect_symmetric_scripts(self, &symmetric_actor_scripts); #endif #ifdef THORIUM_WORKER_ENABLE_LOCK /* Lock all workers first */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); thorium_worker_lock(worker); } #endif core_vector_init(&migrations, sizeof(struct thorium_migration)); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("BALANCING\n"); #endif core_vector_init(&loads, sizeof(int)); core_vector_init(&loads_unsorted, sizeof(int)); core_vector_init(&burdened_workers, sizeof(struct core_pair)); core_vector_init(&stalled_workers, sizeof(struct core_pair)); core_vector_init(&actors_to_migrate, sizeof(struct core_pair)); for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; #if 0 printf("DEBUG LOAD %d %d\n", i, load_value); #endif core_vector_push_back(&loads, &load_value); core_vector_push_back(&loads_unsorted, &load_value); } 
core_vector_sort_int(&loads); stalled_percentile = core_statistics_get_percentile_int(&loads, SCHEDULER_WINDOW); /*load_percentile_25 = core_statistics_get_percentile_int(&loads, 25);*/ load_percentile_50 = core_statistics_get_percentile_int(&loads, 50); /*load_percentile_75 = core_statistics_get_percentile_int(&loads, 75);*/ burdened_percentile = core_statistics_get_percentile_int(&loads, 100 - SCHEDULER_WINDOW); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("Percentiles for epoch loads: "); core_statistics_print_percentiles_int(&loads); #endif for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = core_vector_at_as_int(&loads_unsorted, i); set = thorium_worker_get_actors(worker); if (stalled_percentile == burdened_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_NORMAL_STRING); #endif } else if (load_value <= stalled_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_STALLED_STRING); #endif core_pair_init(&pair, load_value, i); core_vector_push_back(&stalled_workers, &pair); } else if (load_value >= burdened_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_BURDENED_STRING); #endif core_pair_init(&pair, load_value, i); core_vector_push_back(&burdened_workers, &pair); } else { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_NORMAL_STRING); #endif } #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY thorium_worker_print_actors(worker, self); #endif } core_vector_sort_int_reverse(&burdened_workers); core_vector_sort_int(&stalled_workers); stalled_count = core_vector_size(&stalled_workers); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("MIGRATIONS (stalled: %d, burdened: %d)\n", (int)core_vector_size(&stalled_workers), (int)core_vector_size(&burdened_workers)); #endif stalled_index = 0; 
core_vector_iterator_init(&vector_iterator, &burdened_workers); while (stalled_count > 0 && core_vector_iterator_get_next_value(&vector_iterator, &pair)) { old_worker = core_pair_get_second(&pair); worker = thorium_worker_pool_get_worker(self->pool, old_worker); set = thorium_worker_get_actors(worker); /* thorium_worker_print_actors(worker); printf("\n"); */ /* * Lock the worker and try to select actors for migration */ core_map_iterator_init(&set_iterator, set); maximum = -1; with_maximum = 0; total = 0; with_messages = 0; while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); messages = thorium_balancer_get_actor_production(self, actor); if (maximum == -1 || messages > maximum) { maximum = messages; with_maximum = 1; } else if (messages == maximum) { with_maximum++; } if (messages > 0) { ++with_messages; } total += messages; } core_map_iterator_destroy(&set_iterator); core_map_iterator_init(&set_iterator, set); --with_maximum; candidates = 0; load_value = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; remaining_load = load_value; #if 0 printf("maximum %d with_maximum %d\n", maximum, with_maximum); #endif while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); if (actor == NULL) { continue; } messages = thorium_balancer_get_actor_production(self, actor); #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING script = thorium_actor_script(actor); /* symmetric actors are migrated elsewhere. 
*/ if (core_map_get_value(&symmetric_actor_scripts, &script, NULL)) { continue; } #endif /* Simulate the remaining load */ projected_load = remaining_load; projected_load -= ((0.0 + messages) / total) * load_value; #ifdef THORIUM_SCHEDULER_DEBUG printf(" TESTING actor %d, production was %d, projected_load is %d (- %d * (1 - %d/%d)\n", actor_name, messages, projected_load, load_value, messages, total); #endif /* An actor without any queued messages should not be migrated */ if (messages > 0 && ((with_maximum > 0 && messages == maximum) || messages < maximum) /* * Avoid removing too many actors because * generating a stalled one is not desired */ && (projected_load >= load_percentile_50 /* * The previous rule does not apply when there * are 2 actors. */ || with_messages == 2) ) { remaining_load = projected_load; candidates++; if (messages == maximum) { --with_maximum; } core_pair_init(&pair, messages, actor_name); core_vector_push_back(&actors_to_migrate, &pair); #ifdef THORIUM_SCHEDULER_DEBUG printf("early CANDIDATE for migration: actor %d, worker %d\n", actor_name, old_worker); #endif } } core_map_iterator_destroy(&set_iterator); } core_vector_iterator_destroy(&vector_iterator); /* Sort the candidates */ /* core_vector_sort_int(&actors_to_migrate); printf("Percentiles for production: "); core_statistics_print_percentiles_int(&actors_to_migrate); */ /* Sort them in reverse order. 
*/ core_vector_sort_int_reverse(&actors_to_migrate); core_vector_iterator_init(&vector_iterator, &actors_to_migrate); /* For each highly active actor, * try to match it with a stalled worker */ while (core_vector_iterator_get_next_value(&vector_iterator, &pair)) { actor_name = core_pair_get_second(&pair); actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); if (actor == NULL) { continue; } messages = thorium_balancer_get_actor_production(self, actor); old_worker = thorium_actor_assigned_worker(actor); worker = thorium_worker_pool_get_worker(self->pool, old_worker); /* old_total can not be 0 because otherwise the would not * be burdened. */ old_total = thorium_worker_get_production(worker, self); with_messages = thorium_worker_get_producer_count(worker, self); old_load = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; actor_load = ((0.0 + messages) / old_total) * old_load; /* Try to find a stalled worker that can take it. */ test_stalled_index = stalled_index; tests = 0; predicted_new_load = 0; found_match = 0; while (tests < stalled_count) { core_vector_get_value(&stalled_workers, test_stalled_index, &pair); new_worker_index = core_pair_get_second(&pair); new_worker = thorium_worker_pool_get_worker(self->pool, new_worker_index); new_load = thorium_worker_get_scheduling_epoch_load(new_worker) * SCHEDULER_PRECISION; /*new_total = thorium_worker_get_production(new_worker);*/ predicted_new_load = new_load + actor_load; if (predicted_new_load > SCHEDULER_PRECISION /* && with_messages != 2 */) { #ifdef THORIUM_SCHEDULER_DEBUG printf("Scheduler: skipping actor %d, predicted load is %d >= 100\n", actor_name, predicted_new_load); #endif ++tests; ++test_stalled_index; if (test_stalled_index == stalled_count) { test_stalled_index = 0; } continue; } /* Otherwise, this stalled worker is fine... 
*/ stalled_index = test_stalled_index; found_match = 1; break; } /* This actor can not be migrated to any stalled worker. */ if (!found_match) { continue; } /* Otherwise, update the load of the stalled one and go forward with the change. */ pair_pointer = (struct core_pair *)core_vector_at(&stalled_workers, stalled_index); core_pair_set_first(pair_pointer, predicted_new_load); ++stalled_index; if (stalled_index == stalled_count) { stalled_index = 0; } #if 0 new_worker = thorium_worker_pool_get_worker(pool, new_worker_index); printf(" CANDIDATE: actor %d old worker %d (%d - %d = %d) new worker %d (%d + %d = %d)\n", actor_name, old_worker, value, messages, 2new_score, new_worker_index, new_worker_old_score, messages, new_worker_new_score); #endif thorium_migration_init(&migration, actor_name, old_worker, new_worker_index); core_vector_push_back(&migrations, &migration); thorium_migration_destroy(&migration); } core_vector_iterator_destroy(&vector_iterator); core_vector_destroy(&stalled_workers); core_vector_destroy(&burdened_workers); core_vector_destroy(&loads); core_vector_destroy(&loads_unsorted); core_vector_destroy(&actors_to_migrate); /* Update the last values */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); set = thorium_worker_get_actors(worker); core_map_iterator_init(&set_iterator, set); while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); thorium_balancer_update_actor_production(self, actor); } core_map_iterator_destroy(&set_iterator); thorium_worker_reset_scheduling_epoch(worker); } #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING /* Generate migrations for symmetric actors. 
*/ thorium_balancer_generate_symmetric_migrations(self, &symmetric_actor_scripts, &migrations); #endif /* Actually do the migrations */ core_vector_iterator_init(&vector_iterator, &migrations); while (core_vector_iterator_next(&vector_iterator, (void **)&migration_to_do)) { thorium_balancer_migrate(self, migration_to_do); } core_vector_iterator_destroy(&vector_iterator); self->last_migrations = core_vector_size(&migrations); core_vector_destroy(&migrations); #ifdef THORIUM_WORKER_ENABLE_LOCK /* Unlock all workers */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); thorium_worker_unlock(worker); } #endif #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING core_map_destroy(&symmetric_actor_scripts); #endif core_timer_stop(&timer); printf("SCHEDULER: elapsed time for balancing: %d us, %d migrations performed\n", (int)(core_timer_get_elapsed_nanoseconds(&timer) / 1000), self->last_migrations); }
void thorium_worker_pool_print_load(struct thorium_worker_pool *self, int type) { int count; int i; float epoch_load; struct thorium_worker *worker; float loop_load; uint64_t epoch_wake_up_count; uint64_t loop_wake_up_count; /* int scheduling_score; */ int node_name; char *buffer; char *buffer_for_wake_up_events; char *buffer_for_future_timeline; int allocated; int offset; int offset_for_wake_up; int offset_for_future; int extra; time_t current_time; int elapsed; float selected_load; uint64_t selected_wake_up_count; float sum; char loop[] = "COMPUTATION"; char epoch[] = "EPOCH"; char *description; float load; description = NULL; if (type == THORIUM_WORKER_POOL_LOAD_LOOP) { description = loop; } else if (type == THORIUM_WORKER_POOL_LOAD_EPOCH) { description = epoch; } else { return; } current_time = time(NULL); elapsed = current_time - self->starting_time; extra = 100; count = thorium_worker_pool_worker_count(self); allocated = count * 20 + 20 + extra; buffer = core_memory_allocate(allocated, MEMORY_WORKER_POOL_KEY); buffer_for_wake_up_events = core_memory_allocate(allocated, MEMORY_WORKER_POOL_KEY); buffer_for_future_timeline = core_memory_allocate(allocated, MEMORY_WORKER_POOL_KEY); node_name = thorium_node_name(self->node); offset = 0; offset_for_wake_up = 0; offset_for_future = 0; i = 0; sum = 0; while (i < count && offset + extra < allocated) { worker = thorium_worker_pool_get_worker(self, i); epoch_load = thorium_worker_get_epoch_load(worker); loop_load = thorium_worker_get_loop_load(worker); epoch_wake_up_count = thorium_worker_get_epoch_wake_up_count(worker); loop_wake_up_count = thorium_worker_get_loop_wake_up_count(worker); selected_load = epoch_load; selected_wake_up_count = epoch_wake_up_count; if (type == THORIUM_WORKER_POOL_LOAD_EPOCH) { selected_load = epoch_load; selected_wake_up_count = epoch_wake_up_count; } else if (type == THORIUM_WORKER_POOL_LOAD_LOOP) { selected_load = loop_load; selected_wake_up_count = loop_wake_up_count; } /* offset += 
sprintf(buffer + offset, " [%d %d %.2f]", i, scheduling_score, selected_load); */ offset += sprintf(buffer + offset, " %.2f", selected_load); offset_for_wake_up += sprintf(buffer_for_wake_up_events + offset_for_wake_up, " %" PRIu64 "", selected_wake_up_count); offset_for_future += sprintf(buffer_for_future_timeline + offset_for_future, " %d", thorium_worker_get_scheduled_actor_count(worker)); sum += selected_load; ++i; } load = sum / count; printf("thorium_worker_pool: node/%d %s LOAD %d s %.2f/%d (%.2f)%s\n", node_name, description, elapsed, sum, count, load, buffer); printf("thorium_worker_pool: node/%d %s FUTURE_TIMELINE %d s %s\n", node_name, description, elapsed, buffer_for_future_timeline); printf("thorium_worker_pool: node/%d %s WAKE_UP_COUNT %d s %s\n", node_name, description, elapsed, buffer_for_wake_up_events); core_memory_free(buffer, MEMORY_WORKER_POOL_KEY); core_memory_free(buffer_for_wake_up_events, MEMORY_WORKER_POOL_KEY); core_memory_free(buffer_for_future_timeline, MEMORY_WORKER_POOL_KEY); }
int thorium_balancer_select_worker_least_busy( struct thorium_balancer *self, int *worker_score) { int to_check; int score; int best_score; struct thorium_worker *worker; struct thorium_worker *best_worker; int selected_worker; #if 0 int last_worker_score; #endif #ifdef THORIUM_WORKER_DEBUG int tag; int destination; struct thorium_message *message; #endif best_worker = NULL; best_score = 99; to_check = THORIUM_SCHEDULER_WORK_SCHEDULING_WINDOW; while (to_check--) { /* * get the worker to test for this iteration. */ worker = thorium_worker_pool_get_worker(self->pool, self->worker_for_work); score = thorium_worker_get_epoch_load(worker); #ifdef THORIUM_WORKER_POOL_DEBUG_ISSUE_334 if (score >= THORIUM_WORKER_WARNING_THRESHOLD && (self->last_scheduling_warning == 0 || score >= self->last_scheduling_warning + THORIUM_WORKER_WARNING_THRESHOLD_STRIDE)) { printf("Warning: node %d worker %d has a scheduling score of %d\n", thorium_node_name(thorium_worker_pool_get_node(self->pool)), self->worker_for_work, score); self->last_scheduling_warning = score; } #endif /* if the worker is not busy and it has no work to do, * select it right away... 
*/ if (score == 0) { best_worker = worker; best_score = 0; break; } /* Otherwise, test the worker */ if (best_worker == NULL || score < best_score) { best_worker = worker; best_score = score; } /* * assign the next worker */ self->worker_for_work = thorium_worker_pool_next_worker(self->pool, self->worker_for_work); } #ifdef THORIUM_WORKER_POOL_DEBUG message = biosal_work_message(work); tag = thorium_message_action(message); destination = thorium_message_destination(message); if (tag == ACTION_ASK_TO_STOP) { printf("DEBUG dispatching ACTION_ASK_TO_STOP for actor %d to worker %d\n", destination, *start); } #endif selected_worker = self->worker_for_work; /* * assign the next worker */ self->worker_for_work = thorium_worker_pool_next_worker(self->pool, self->worker_for_work); *worker_score = best_score; /* This is a best effort algorithm */ return selected_worker; }