void thorium_worker_run(struct thorium_worker *worker) { struct thorium_actor *actor; struct thorium_message other_message; #ifdef THORIUM_NODE_INJECT_CLEAN_WORKER_BUFFERS void *buffer; #endif #ifdef THORIUM_NODE_ENABLE_INSTRUMENTATION time_t current_time; int elapsed; int period; uint64_t current_nanoseconds; uint64_t elapsed_nanoseconds; #endif #ifdef THORIUM_WORKER_DEBUG int tag; int destination; struct thorium_message *message; #endif #ifdef THORIUM_WORKER_ENABLE_LOCK thorium_worker_lock(worker); #endif #ifdef THORIUM_NODE_ENABLE_INSTRUMENTATION period = THORIUM_NODE_LOAD_PERIOD; current_time = time(NULL); elapsed = current_time - worker->last_report; if (elapsed >= period) { current_nanoseconds = core_timer_get_nanoseconds(&worker->timer); #ifdef THORIUM_WORKER_DEBUG_LOAD printf("DEBUG Updating load report\n"); #endif elapsed_nanoseconds = current_nanoseconds - worker->epoch_start_in_nanoseconds; if (elapsed_nanoseconds > 0) { worker->epoch_load = (0.0 + worker->epoch_used_nanoseconds) / elapsed_nanoseconds; worker->epoch_used_nanoseconds = 0; worker->last_wake_up_count = core_thread_get_wake_up_count(&worker->thread); /* \see http://stackoverflow.com/questions/9657993/negative-zero-in-c */ if (worker->epoch_load == 0) { worker->epoch_load = 0; } worker->epoch_start_in_nanoseconds = current_nanoseconds; worker->last_report = current_time; } #ifdef THORIUM_WORKER_PRINT_SCHEDULING_QUEUE /* if (thorium_node_name(worker->node) == 0 && worker->name == 0) { */ thorium_scheduler_print(&worker->scheduler, thorium_node_name(worker->node), worker->name); /* } */ #endif if (core_bitmap_get_bit_uint32_t(&worker->flags, FLAG_DEBUG_ACTORS)) { thorium_worker_print_actors(worker, NULL); } } #endif #ifdef THORIUM_WORKER_DEBUG if (core_bitmap_get_bit_uint32_t(&worker->flags, FLAG_DEBUG)) { printf("DEBUG worker/%d thorium_worker_run\n", thorium_worker_name(worker)); } #endif /* check for messages in inbound FIFO */ if (thorium_worker_dequeue_actor(worker, &actor)) { #ifdef THORIUM_WORKER_DEBUG message = biosal_work_message(&work); tag = thorium_message_action(message); destination = thorium_message_destination(message); if (tag == ACTION_ASK_TO_STOP) { printf("DEBUG pulled ACTION_ASK_TO_STOP for %d\n", destination); } #endif /* * Update the priority of the actor * before starting the timer because this is part of the * runtime system (RTS). */ #ifdef THORIUM_UPDATE_SCHEDULING_PRIORITIES thorium_priority_assigner_update(&worker->scheduler, actor); #endif #ifdef THORIUM_NODE_ENABLE_INSTRUMENTATION core_timer_start(&worker->timer); #endif core_bitmap_set_bit_uint32_t(&worker->flags, FLAG_BUSY); /* * Dispatch message to a worker */ thorium_worker_work(worker, actor); core_bitmap_clear_bit_uint32_t(&worker->flags, FLAG_BUSY); #ifdef THORIUM_NODE_ENABLE_INSTRUMENTATION core_timer_stop(&worker->timer); elapsed_nanoseconds = core_timer_get_elapsed_nanoseconds(&worker->timer); if (elapsed_nanoseconds >= THORIUM_GRANULARITY_WARNING_THRESHOLD) { } worker->epoch_used_nanoseconds += elapsed_nanoseconds; worker->loop_used_nanoseconds += elapsed_nanoseconds; worker->scheduling_epoch_used_nanoseconds += elapsed_nanoseconds; worker->last_elapsed_nanoseconds = elapsed_nanoseconds; #endif } /* queue buffered message */ if (core_fast_queue_dequeue(&worker->outbound_message_queue_buffer, &other_message)) { if (!core_fast_ring_push_from_producer(&worker->outbound_message_queue, &other_message)) { #ifdef SHOW_FULL_RING_WARNINGS printf("thorium_worker: Warning: ring is full => outbound_message_queue\n"); #endif core_fast_queue_enqueue(&worker->outbound_message_queue_buffer, &other_message); } } #ifdef THORIUM_NODE_INJECT_CLEAN_WORKER_BUFFERS /* * Free outbound buffers, if any */ if (thorium_worker_fetch_clean_outbound_buffer(worker, &buffer)) { core_memory_pool_free(&worker->outbound_message_memory_pool, buffer); #ifdef THORIUM_WORKER_DEBUG_INJECTION ++worker->counter_freed_outbound_buffers_from_other_workers; #endif } #endif /* * Transfer messages for triage */ if (core_fast_queue_dequeue(&worker->clean_message_queue_for_triage, &other_message)) { CORE_DEBUGGER_ASSERT(thorium_message_buffer(&other_message) != NULL); thorium_worker_enqueue_message_for_triage(worker, &other_message); } #ifdef THORIUM_WORKER_ENABLE_LOCK thorium_worker_unlock(worker); #endif }
void thorium_balancer_balance(struct thorium_balancer *self) { /* * The 95th percentile is useful: * \see http://en.wikipedia.org/wiki/Burstable_billing * \see http://www.init7.net/en/backbone/95-percent-rule */ int load_percentile_50; struct core_timer timer; int i; struct core_vector loads; struct core_vector loads_unsorted; struct core_vector burdened_workers; struct core_vector stalled_workers; struct thorium_worker *worker; struct thorium_node *node; /*struct core_set *set;*/ struct core_pair pair; struct core_vector_iterator vector_iterator; int old_worker; int actor_name; int messages; int maximum; int with_maximum; struct core_map *set; struct core_map_iterator set_iterator; int stalled_index; int stalled_count; int new_worker_index; struct core_vector migrations; struct thorium_migration migration; struct thorium_migration *migration_to_do; struct thorium_actor *actor; int candidates; int load_value; int remaining_load; int projected_load; struct core_vector actors_to_migrate; int total; int with_messages; int stalled_percentile; int burdened_percentile; int old_total; int old_load; int new_load; int predicted_new_load; struct core_pair *pair_pointer; struct thorium_worker *new_worker; /*int new_total;*/ int actor_load; int test_stalled_index; int tests; int found_match; int spawned_actors; int killed_actors; int perfect; #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING struct core_map symmetric_actor_scripts; int script; #endif node = thorium_worker_pool_get_node(self->pool); spawned_actors = thorium_node_get_counter(node, CORE_COUNTER_SPAWNED_ACTORS); /* There is nothing to balance... */ if (spawned_actors == 0) { return; } killed_actors = thorium_node_get_counter(node, CORE_COUNTER_KILLED_ACTORS); /* * The system can probably not be balanced to get in * a better shape anyway. */ if (spawned_actors == self->last_spawned_actors && killed_actors == self->last_killed_actors && self->last_migrations == 0) { printf("SCHEDULER: balance can not be improved because nothing changed.\n"); return; } /* Check if we have perfection */ perfect = 1; for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = thorium_worker_get_epoch_load(worker) * 100; if (load_value != 100) { perfect = 0; break; } } if (perfect) { printf("SCHEDULER: perfect balance can not be improved.\n"); return; } /* update counters */ self->last_spawned_actors = spawned_actors; self->last_killed_actors = killed_actors; /* Otherwise, try to balance things */ core_timer_init(&timer); core_timer_start(&timer); #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING core_map_init(&symmetric_actor_scripts, sizeof(int), sizeof(int)); thorium_balancer_detect_symmetric_scripts(self, &symmetric_actor_scripts); #endif #ifdef THORIUM_WORKER_ENABLE_LOCK /* Lock all workers first */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); thorium_worker_lock(worker); } #endif core_vector_init(&migrations, sizeof(struct thorium_migration)); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("BALANCING\n"); #endif core_vector_init(&loads, sizeof(int)); core_vector_init(&loads_unsorted, sizeof(int)); core_vector_init(&burdened_workers, sizeof(struct core_pair)); core_vector_init(&stalled_workers, sizeof(struct core_pair)); core_vector_init(&actors_to_migrate, sizeof(struct core_pair)); for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; #if 0 printf("DEBUG LOAD %d %d\n", i, load_value); #endif core_vector_push_back(&loads, &load_value); core_vector_push_back(&loads_unsorted, &load_value); } core_vector_sort_int(&loads); stalled_percentile = core_statistics_get_percentile_int(&loads, SCHEDULER_WINDOW); /*load_percentile_25 = core_statistics_get_percentile_int(&loads, 25);*/ load_percentile_50 = core_statistics_get_percentile_int(&loads, 50); /*load_percentile_75 = core_statistics_get_percentile_int(&loads, 75);*/ burdened_percentile = core_statistics_get_percentile_int(&loads, 100 - SCHEDULER_WINDOW); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("Percentiles for epoch loads: "); core_statistics_print_percentiles_int(&loads); #endif for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); load_value = core_vector_at_as_int(&loads_unsorted, i); set = thorium_worker_get_actors(worker); if (stalled_percentile == burdened_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_NORMAL_STRING); #endif } else if (load_value <= stalled_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_STALLED_STRING); #endif core_pair_init(&pair, load_value, i); core_vector_push_back(&stalled_workers, &pair); } else if (load_value >= burdened_percentile) { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_BURDENED_STRING); #endif core_pair_init(&pair, load_value, i); core_vector_push_back(&burdened_workers, &pair); } else { #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("scheduling_class:%s ", THORIUM_CLASS_NORMAL_STRING); #endif } #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY thorium_worker_print_actors(worker, self); #endif } core_vector_sort_int_reverse(&burdened_workers); core_vector_sort_int(&stalled_workers); stalled_count = core_vector_size(&stalled_workers); #ifdef THORIUM_SCHEDULER_ENABLE_VERBOSITY printf("MIGRATIONS (stalled: %d, burdened: %d)\n", (int)core_vector_size(&stalled_workers), (int)core_vector_size(&burdened_workers)); #endif stalled_index = 0; core_vector_iterator_init(&vector_iterator, &burdened_workers); while (stalled_count > 0 && core_vector_iterator_get_next_value(&vector_iterator, &pair)) { old_worker = core_pair_get_second(&pair); worker = thorium_worker_pool_get_worker(self->pool, old_worker); set = thorium_worker_get_actors(worker); /* thorium_worker_print_actors(worker); printf("\n"); */ /* * Lock the worker and try to select actors for migration */ core_map_iterator_init(&set_iterator, set); maximum = -1; with_maximum = 0; total = 0; with_messages = 0; while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); messages = thorium_balancer_get_actor_production(self, actor); if (maximum == -1 || messages > maximum) { maximum = messages; with_maximum = 1; } else if (messages == maximum) { with_maximum++; } if (messages > 0) { ++with_messages; } total += messages; } core_map_iterator_destroy(&set_iterator); core_map_iterator_init(&set_iterator, set); --with_maximum; candidates = 0; load_value = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; remaining_load = load_value; #if 0 printf("maximum %d with_maximum %d\n", maximum, with_maximum); #endif while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); if (actor == NULL) { continue; } messages = thorium_balancer_get_actor_production(self, actor); #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING script = thorium_actor_script(actor); /* symmetric actors are migrated elsewhere. */ if (core_map_get_value(&symmetric_actor_scripts, &script, NULL)) { continue; } #endif /* Simulate the remaining load */ projected_load = remaining_load; projected_load -= ((0.0 + messages) / total) * load_value; #ifdef THORIUM_SCHEDULER_DEBUG printf(" TESTING actor %d, production was %d, projected_load is %d (- %d * (1 - %d/%d)\n", actor_name, messages, projected_load, load_value, messages, total); #endif /* An actor without any queued messages should not be migrated */ if (messages > 0 && ((with_maximum > 0 && messages == maximum) || messages < maximum) /* * Avoid removing too many actors because * generating a stalled one is not desired */ && (projected_load >= load_percentile_50 /* * The previous rule does not apply when there * are 2 actors. */ || with_messages == 2) ) { remaining_load = projected_load; candidates++; if (messages == maximum) { --with_maximum; } core_pair_init(&pair, messages, actor_name); core_vector_push_back(&actors_to_migrate, &pair); #ifdef THORIUM_SCHEDULER_DEBUG printf("early CANDIDATE for migration: actor %d, worker %d\n", actor_name, old_worker); #endif } } core_map_iterator_destroy(&set_iterator); } core_vector_iterator_destroy(&vector_iterator); /* Sort the candidates */ /* core_vector_sort_int(&actors_to_migrate); printf("Percentiles for production: "); core_statistics_print_percentiles_int(&actors_to_migrate); */ /* Sort them in reverse order. */ core_vector_sort_int_reverse(&actors_to_migrate); core_vector_iterator_init(&vector_iterator, &actors_to_migrate); /* For each highly active actor, * try to match it with a stalled worker */ while (core_vector_iterator_get_next_value(&vector_iterator, &pair)) { actor_name = core_pair_get_second(&pair); actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); if (actor == NULL) { continue; } messages = thorium_balancer_get_actor_production(self, actor); old_worker = thorium_actor_assigned_worker(actor); worker = thorium_worker_pool_get_worker(self->pool, old_worker); /* old_total can not be 0 because otherwise the would not * be burdened. */ old_total = thorium_worker_get_production(worker, self); with_messages = thorium_worker_get_producer_count(worker, self); old_load = thorium_worker_get_scheduling_epoch_load(worker) * SCHEDULER_PRECISION; actor_load = ((0.0 + messages) / old_total) * old_load; /* Try to find a stalled worker that can take it. */ test_stalled_index = stalled_index; tests = 0; predicted_new_load = 0; found_match = 0; while (tests < stalled_count) { core_vector_get_value(&stalled_workers, test_stalled_index, &pair); new_worker_index = core_pair_get_second(&pair); new_worker = thorium_worker_pool_get_worker(self->pool, new_worker_index); new_load = thorium_worker_get_scheduling_epoch_load(new_worker) * SCHEDULER_PRECISION; /*new_total = thorium_worker_get_production(new_worker);*/ predicted_new_load = new_load + actor_load; if (predicted_new_load > SCHEDULER_PRECISION /* && with_messages != 2 */) { #ifdef THORIUM_SCHEDULER_DEBUG printf("Scheduler: skipping actor %d, predicted load is %d >= 100\n", actor_name, predicted_new_load); #endif ++tests; ++test_stalled_index; if (test_stalled_index == stalled_count) { test_stalled_index = 0; } continue; } /* Otherwise, this stalled worker is fine... */ stalled_index = test_stalled_index; found_match = 1; break; } /* This actor can not be migrated to any stalled worker. */ if (!found_match) { continue; } /* Otherwise, update the load of the stalled one and go forward with the change. */ pair_pointer = (struct core_pair *)core_vector_at(&stalled_workers, stalled_index); core_pair_set_first(pair_pointer, predicted_new_load); ++stalled_index; if (stalled_index == stalled_count) { stalled_index = 0; } #if 0 new_worker = thorium_worker_pool_get_worker(pool, new_worker_index); printf(" CANDIDATE: actor %d old worker %d (%d - %d = %d) new worker %d (%d + %d = %d)\n", actor_name, old_worker, value, messages, 2new_score, new_worker_index, new_worker_old_score, messages, new_worker_new_score); #endif thorium_migration_init(&migration, actor_name, old_worker, new_worker_index); core_vector_push_back(&migrations, &migration); thorium_migration_destroy(&migration); } core_vector_iterator_destroy(&vector_iterator); core_vector_destroy(&stalled_workers); core_vector_destroy(&burdened_workers); core_vector_destroy(&loads); core_vector_destroy(&loads_unsorted); core_vector_destroy(&actors_to_migrate); /* Update the last values */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); set = thorium_worker_get_actors(worker); core_map_iterator_init(&set_iterator, set); while (core_map_iterator_get_next_key_and_value(&set_iterator, &actor_name, NULL)) { actor = thorium_node_get_actor_from_name(thorium_worker_pool_get_node(self->pool), actor_name); thorium_balancer_update_actor_production(self, actor); } core_map_iterator_destroy(&set_iterator); thorium_worker_reset_scheduling_epoch(worker); } #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING /* Generate migrations for symmetric actors. */ thorium_balancer_generate_symmetric_migrations(self, &symmetric_actor_scripts, &migrations); #endif /* Actually do the migrations */ core_vector_iterator_init(&vector_iterator, &migrations); while (core_vector_iterator_next(&vector_iterator, (void **)&migration_to_do)) { thorium_balancer_migrate(self, migration_to_do); } core_vector_iterator_destroy(&vector_iterator); self->last_migrations = core_vector_size(&migrations); core_vector_destroy(&migrations); #ifdef THORIUM_WORKER_ENABLE_LOCK /* Unlock all workers */ for (i = 0; i < thorium_worker_pool_worker_count(self->pool); i++) { worker = thorium_worker_pool_get_worker(self->pool, i); thorium_worker_unlock(worker); } #endif #ifdef THORIUM_SCHEDULER_ENABLE_SYMMETRIC_SCHEDULING core_map_destroy(&symmetric_actor_scripts); #endif core_timer_stop(&timer); printf("SCHEDULER: elapsed time for balancing: %d us, %d migrations performed\n", (int)(core_timer_get_elapsed_nanoseconds(&timer) / 1000), self->last_migrations); }