Example #1
0
/**
 * Match tasks from the ready queue to free slots.
 *
 * While there are both ready tasks and free slots, the task at the top of
 * ready_queue is removed and offered to each free slot in turn. The first
 * slot whose host reports that it can run the task gets it: the host's
 * resources are allocated, the task is submitted to the slot's rank, and
 * the slot is removed from free_slots. A task that fits on no free slot is
 * set aside and pushed back onto ready_queue once the loop is done, so it
 * is not reconsidered during this call.
 */
void Master::schedule_tasks() {
    // size() yields an unsigned size type; cast so the arguments actually
    // match the %d conversions in the printf-style format string.
    log_debug("Scheduling %d tasks on %d slots...", 
        static_cast<int>(ready_queue.size()),
        static_cast<int>(free_slots.size()));

    int scheduled = 0;
    TaskList deferred_tasks;

    while (ready_queue.size() > 0 && free_slots.size() > 0) {
        Task *task = ready_queue.top();
        ready_queue.pop();

        log_trace("Scheduling task %s", task->name.c_str());

        bool match = false;

        for (SlotList::iterator s = free_slots.begin(); s != free_slots.end(); ++s) {
            Slot *slot = *s;
            Host *host = slot->host;

            // If the task fits, schedule it
            if (host->can_run(task)) {

                log_trace("Matched task %s to slot %d on host %s", 
                    task->name.c_str(), slot->rank, host->name());

                // Reserve the resources
                vector<cpu_t> bindings = host->allocate_resources(task);
                host->log_resources(resource_log);

                submit_task(task, slot->rank, bindings);

                // The slot is now busy, so take it off the free list. The
                // iterator is not used again because we leave the slot loop
                // right away; stepping it back here is unnecessary, and
                // decrementing an iterator equal to begin() would be
                // undefined behavior.
                free_slots.erase(s);

                match = true;
                scheduled += 1;

                // Break out of the slot loop so that we can consider the
                // next task
                break;
            }
        }

        if (!match) {
            // If the task could not be scheduled, then we save it 
            // and move on to the next one. It will be requeued later.
            log_trace("No slot found for task %s", task->name.c_str());
            deferred_tasks.push_back(task);
        }
    }

    log_debug("Scheduled %d tasks and deferred %d tasks", scheduled,
        static_cast<int>(deferred_tasks.size()));

    // Requeue all the deferred tasks
    for (TaskList::iterator t = deferred_tasks.begin(); t != deferred_tasks.end(); ++t) {
        ready_queue.push(*t);
    }
}
Example #2
0
/**
 * Drive the whole workflow from the master.
 *
 * Steps, in order: publish the start event, install the SIGALRM/SIGTERM
 * handlers, arm the wall-time alarm (if a limit is set), register workers,
 * verify that every task in the DAG can run on at least one host, run the
 * queue/schedule/wait loop until the engine is finished or ABORT is set,
 * log summary statistics, write the cluster summary, optionally merge task
 * stdio, tell every worker to shut down, and publish the final event.
 *
 * @return 0 if the workflow succeeded; 1 if it failed or was aborted.
 */
int Master::run() {
    log_info("Master starting with %d workers", numworkers);
    
    start_time = current_time();

    publish_event(WORKFLOW_START, NULL);
    
    // Install signal handlers. SA_NODEFER asks that the signal not be
    // blocked while its own handler is running.
    struct sigaction signal_action;
    signal_action.sa_handler = on_signal;
    signal_action.sa_flags = SA_NODEFER;
    sigemptyset(&signal_action.sa_mask);
    if (sigaction(SIGALRM, &signal_action, NULL) < 0) {
        myfailures("Unable to set signal handler for SIGALRM");
    }
    if (sigaction(SIGTERM, &signal_action, NULL) < 0) {
        myfailures("Unable to set signal handler for SIGTERM");
    }
    
    // Set alarm to interrupt the master when the walltime is up.
    // max_wall_time is in minutes; alarm() takes whole seconds, so
    // convert and round up.
    if (this->max_wall_time > 0.0) {    
        log_info("Setting max walltime to %lf minutes", this->max_wall_time);
        alarm(static_cast<unsigned>(ceil(max_wall_time * 60.0)));
    }
    
    register_workers();
    
    // Check to make sure that there is at least one host capable
    // of executing every task
    for (DAG::iterator t = dag->begin(); t != dag->end(); ++t) {
        Task *task = (*t).second;
        
        // Check all the hosts for one that can run the task
        bool match = false;
        for (unsigned h=0; h<hosts.size(); h++) {
            Host *host = hosts[h];
            if (host->can_run(task)) {
                match = true;
                break;
            }
        }
        
        if (!match) {
            // There was no host found that was capable of executing the
            // task, so we must abort
            myfailure("FATAL ERROR: No host is capable of running task %s", 
                task->name.c_str());
        }
    }
    
    // If there is a host script, wait here for it to run
    if (has_host_script) {
        comm->barrier();
    }
    
    log_info("Starting workflow");
    double makespan_start = current_time();
    // Keep executing tasks until the workflow is finished or the master
    // needs to abort the workflow due to a signal being caught
    while (!this->engine->is_finished() && !ABORT) {
        queue_ready_tasks();
        schedule_tasks();
        wait_for_results();
    }
    double makespan_finish = current_time();
    
    if (ABORT) {
        log_error("Aborting workflow");
    } else {
        log_info("Workflow finished");
    }
    
    if (this->engine->max_failures_reached()) {
        log_error("Max failures reached: DAG prematurely aborted");
    }
    
    // This must be done before write_cluster_summary so that the
    // wall time can be recorded in the cluster-summary record
    finish_time = current_time();
    wall_time = finish_time - start_time;
    double makespan = makespan_finish - makespan_start;
    
    // Close FDCache here before merging output so that
    // we can be sure the data files are flushed
    fdcache->close();
    
    // Compute resource utilization. Report 0 rather than dividing when
    // there was no task runtime, no elapsed wall time, or no workers, so
    // the summary never shows inf/nan.
    double master_util = 0.0;
    double worker_util = 0.0;
    if (total_runtime > 0 && wall_time > 0 && numworkers > 0) {
        master_util = total_runtime / (wall_time * (numworkers+1));
        worker_util = total_runtime / (wall_time * numworkers);
    }
    
    // Throughput is undefined for a zero-length makespan (e.g. nothing
    // was executed); report 0 in that case instead of inf/nan.
    double throughput = 0.0;
    if (makespan > 0.0) {
        throughput = success_count / makespan;
    }
    
    log_info("Resource utilization (with master): %lf", master_util);
    log_info("Resource utilization (without master): %lf", worker_util);
    log_info("Total runtime of tasks: %lf seconds (%lf minutes)", total_runtime, total_runtime/60.0);
    log_info("Wall time: %lf seconds (%lf minutes)", wall_time, wall_time/60.0);
    log_info("Makespan: %lf seconds (%lf minutes)", makespan, makespan/60.0);
    log_info("Throughput: %lf tasks/second", throughput);
    log_info("Bytes sent to workers: %lu", comm->sent());
    log_info("Bytes received from workers: %lu", comm->recvd());
    log_info("File descriptor cache hit rate: %lf", fdcache->hitrate());

    bool failed = ABORT || this->engine->is_failed();
    write_cluster_summary(failed);
    
    if (!per_task_stdio) merge_all_task_stdio();
    
    // Workers are addressed by ranks 1..numworkers
    log_info("Sending workers shutdown messages...");
    for (int i=1; i<=numworkers; i++) {
        log_debug("Sending shutdown message to worker %d", i);
        ShutdownMessage shmsg;
        comm->send_message(&shmsg, i);
    }
    
    if (failed) {
        publish_event(WORKFLOW_FAILURE, NULL);
    } else {
        publish_event(WORKFLOW_SUCCESS, NULL);
    }
    
    if (ABORT) {
        myfailure("Workflow aborted");
        return 1;
    } else if (failed) {
        log_error("Workflow failed");
        return 1;
    } else {
        // NOTE(review): "suceeded" is misspelled, but the message is left
        // as-is because log consumers may match on the exact text.
        log_info("Workflow suceeded");
        return 0;
    }
}