Example #1
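// Send one job for the given non-CPU-intensive (NCI) app.
// Return 0 if a job was sent, 1 if no sendable job was found,
// -1 if NCI jobs can't be sent to this host at all.
//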
static int send_job_for_app(APP& app) {
    BEST_APP_VERSION* bavp;
    SCHED_DB_RESULT result;

    lock_sema();
    for (int i=0; i<ssp->max_wu_results; i++) {
        WU_RESULT& wu_result = ssp->wu_results[i];
        // skip slots that are empty or checked out by another process
        //
        if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;

        if (wu.appid != app.id) continue;

        if (!can_send_nci(wu_result, wu, bavp, &app)) {
            // All jobs for a given NCI app are identical.
            // If we can't send one, we can't send any.
            //
            unlock_sema();
            log_messages.printf(MSG_NORMAL,
                "can_send_nci() failed for NCI job\n"
            );
            return -1;
        }
        wu_result.state = g_pid;
        unlock_sema();
        result.id = wu_result.resultid;
        wu_result.state = WR_STATE_EMPTY;   // slot consumed; feeder may refill it
        if (result_still_sendable(result, wu)) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "Sending non-CPU-intensive job: %s\n", wu.name
                );
            }
            add_result_to_reply(result, wu, bavp, false);
            return 0;
        }
        log_messages.printf(MSG_NORMAL,
            "NCI job was not still sendable\n"
        );
        lock_sema();
    }
    log_messages.printf(MSG_NORMAL,
        "no sendable NCI jobs for %s\n", app.user_friendly_name
    );
    unlock_sema();
    return 1;
}
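The examples in this file coordinate through the state field of each WU_RESULT slot in the shared-memory array: a slot is empty, holds a sendable job, or is checked out by the scheduler process whose PID it stores. Here is a minimal single-process sketch of that checkout convention; the constants, types, and PID are illustrative stand-ins, not BOINC's actual declarations.

#include <cstdio>

// Illustrative stand-ins; not BOINC's real declarations or values.
#define WR_STATE_EMPTY    0     // slot free; the feeder may refill it
#define WR_STATE_PRESENT -1     // job available to any scheduler process
// any other value: PID of the scheduler process holding the slot

struct SLOT { int state; };

// Check a slot out. In the real code this happens under lock_sema().
static bool checkout(SLOT& s, int pid) {
    if (s.state != WR_STATE_PRESENT && s.state != pid) return false;
    s.state = pid;
    return true;
}

int main() {
    SLOT s = { WR_STATE_PRESENT };
    int my_pid = 1234;                  // hypothetical PID
    if (checkout(s, my_pid)) {
        // ... slow (DB) checks would happen here, outside the semaphore ...
        s.state = WR_STATE_EMPTY;       // consumed: hand the slot back
    }
    printf("final slot state: %d\n", s.state);
    return 0;
}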
Example #2
// Make a pass through the wu/results array, sending work.
// The choice of jobs is limited by flags in g_wreq, as follows:
// infeasible_only:
//      send only results that were previously infeasible for some host
// reliable_only:
//      send only jobs with "need_reliable" set (e.g. retries)
//      and send them only w/ app versions that are "reliable" for this host
// user_apps_only:
//      Send only jobs for apps selected by user
// beta_only:
//      Send only jobs for beta-test apps
// locality_sched_lite:
//      For apps that use locality sched Lite,
//      send only jobs for which the host already has at least 1 file
//
// Return true if no more work is needed.
//
static bool scan_work_array() {
    int i, j, rnd_off, last_retval=0;
    APP* app;
    BEST_APP_VERSION* bavp;
    bool no_more_needed = false;
    SCHED_DB_RESULT result;

    // To minimize the amount of time we lock the array,
    // we initially scan without holding the lock.
    // If we find a job that passes quick_check(),
    // we acquire the lock and then check the job again.
    //
    bool sema_locked = false;

    // start the scan at a random offset so that concurrent scheduler
    // processes don't all contend for the same slots
    //
    rnd_off = rand() % ssp->max_wu_results;
    for (j=0; j<ssp->max_wu_results; j++) {
        i = (j+rnd_off) % ssp->max_wu_results;

        WU_RESULT& wu_result = ssp->wu_results[i];

        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] scanning slot %d\n", i
            );
        }

recheck:
        if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
            continue;
        }

        // make a copy of the WORKUNIT part,
        // which we can modify without affecting the cache
        //
        WORKUNIT wu = wu_result.workunit;

        app = ssp->lookup_app(wu_result.workunit.appid);
        if (app == NULL) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu] no app\n",
                wu_result.workunit.id
            );
            continue; // this should never happen
        }

        if (app->non_cpu_intensive) continue;

        // do fast (non-DB) checks.
        // This may modify wu.rsc_fpops_est
        //
        if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] slot %d failed quick check\n", i
                );
            }
            continue;
        }

        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
            goto recheck;
        }

        // mark wu_result as checked out and release semaphore.
        // from here on in this loop, don't continue on failure;
        // instead, goto dont_send (so that we reacquire semaphore)
        //
        // Note: without the semaphore we don't have mutual exclusion;
        // ideally we should use a transaction from now until when
        // we commit to sending the results.

        wu_result.state = g_pid;
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, app, bavp)) {
        case 1:
            // if we couldn't send the result to this host,
            // set its state back to PRESENT
            //
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            // can't send this job to any host
            //
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }
        if (!work_needed(false)) {
            no_more_needed = true;
            break;
        }
    }
    if (sema_locked) {
        unlock_sema();
    }
    return no_more_needed;
}
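Example #2 takes the semaphore lazily: it scans without the lock, and only after a slot passes quick_check() does it lock and re-check that slot. A minimal sketch of this optimistic-scan pattern, with std::mutex standing in for lock_sema()/unlock_sema() (types and values are illustrative, not BOINC's):

#include <mutex>
#include <vector>

struct SLOT { int state; };             // -1 stands in for WR_STATE_PRESENT
static std::mutex sema;                 // stand-in for lock_sema()/unlock_sema()

static bool fast_check(const SLOT& s) { return s.state == -1; }

// Scan lock-free; on a hit, take the lock and re-validate the same
// slot before checking it out (another process may have claimed it).
static int optimistic_scan(std::vector<SLOT>& slots, int pid) {
    bool locked = false;
    for (size_t i = 0; i < slots.size(); i++) {
recheck:
        if (!fast_check(slots[i])) continue;
        if (!locked) {
            sema.lock();
            locked = true;
            goto recheck;               // re-check under the lock
        }
        slots[i].state = pid;           // checked out
        sema.unlock();
        return (int)i;
    }
    if (locked) sema.unlock();
    return -1;                          // nothing sendable
}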
Example #3
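// Handle a scheduler RPC: process the client's reported results,
// then send new work if it was requested and is allowed.
//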
void process_request(char* code_sign_key) {
    PLATFORM* platform;
    int retval;
    double last_rpc_time, x;
    struct tm *rpc_time_tm;
    bool ok_to_send_work = !config.dont_send_jobs;
    bool have_no_work = false;
    char buf[256];
    HOST initial_host;
    unsigned int i;
    time_t t;

    memset(&g_reply->wreq, 0, sizeof(g_reply->wreq));

    // if client has sticky files we don't need any more, tell it
    //
    do_file_delete_regex();

    // if different major version of BOINC, just send a message
    //
    if (wrong_core_client_version()
            || unacceptable_os()
            || unacceptable_cpu()
       ) {
        ok_to_send_work = false;
    }

    // if no jobs reported and none to send, return without accessing DB
    //
    if (!ok_to_send_work && !g_request->results.size()) {
        return;
    }

    warn_user_if_core_client_upgrade_scheduled();

    if (requesting_work()) {
        if (config.locality_scheduling || config.locality_scheduler_fraction || config.enable_assignment) {
            have_no_work = false;
        } else {
            lock_sema();
            have_no_work = ssp->no_work(g_pid);
            if (have_no_work) {
                g_wreq->no_jobs_available = true;
            }
            unlock_sema();
        }
    }

    // If:
    // - there's no work,
    // - a config flag is set,
    // - client isn't returning results,
    // - this isn't an initial RPC,
    // - client is requesting work
    // then return without accessing the DB.
    // This is an efficiency hack for when servers are overloaded
    //
    if (
        have_no_work
        && config.nowork_skip
        && requesting_work()
        && (g_request->results.size() == 0)
        && (g_request->hostid != 0)
    ) {
        g_reply->insert_message("No work available", "low");
        g_reply->set_delay(DELAY_NO_WORK_SKIP);
        if (!config.msg_to_host && !config.enable_vda) {
            log_messages.printf(MSG_NORMAL, "No work - skipping DB access\n");
            return;
        }
    }

    // FROM HERE ON DON'T RETURN; "goto leave" instead
    // (because ssp->no_work() may have tagged an entry in the work array
    // with our process ID)

    retval = open_database();
    if (retval) {
        send_error_message("Server can't open database", 3600);
        g_reply->project_is_down = true;
        goto leave;
    }

    retval = authenticate_user();
    if (retval) goto leave;
    if (g_reply->user.id == 0) {
        log_messages.printf(MSG_CRITICAL, "No user ID!\n");
    }
    initial_host = g_reply->host;
    g_reply->host.rpc_seqno = g_request->rpc_seqno;

    g_reply->nucleus_only = false;

    log_request();

    // is host blacklisted?
    //
    if (g_reply->host._max_results_day == -1) {
        send_error_message("Not accepting requests from this host", 86400);
        goto leave;
    }

    if (strlen(config.sched_lockfile_dir)) {
        int pid_with_lock = lock_sched();
        if (pid_with_lock > 0) {
            log_messages.printf(MSG_CRITICAL,
                                "Another scheduler instance [PID=%d] is running for this host\n",
                                pid_with_lock
                               );
        } else if (pid_with_lock) {
            log_messages.printf(MSG_CRITICAL,
                                "Error acquiring lock for [HOST#%d]\n", g_reply->host.id
                               );
        }
        if (pid_with_lock) {
            send_error_message(
                "Another scheduler instance is running for this host", 60
            );
            goto leave;
        }
    }

    // in deciding whether it's a new day,
    // add a random factor (based on host ID)
    // to smooth out network traffic over the day
    //
    retval = rand();        // save a random value for reseeding below
    srand(g_reply->host.id);
    x = drand()*86400;      // deterministic per-host offset into the day
    srand(retval);          // undo the host-based seeding for later rand() calls
    last_rpc_time = g_reply->host.rpc_time;
    t = (time_t)(g_reply->host.rpc_time + x);
    rpc_time_tm = localtime(&t);
    g_request->last_rpc_dayofyear = rpc_time_tm->tm_yday;

    t = time(0);
    g_reply->host.rpc_time = t;
    t += (time_t)x;
    rpc_time_tm = localtime(&t);
    g_request->current_rpc_dayofyear = rpc_time_tm->tm_yday;

    retval = modify_host_struct(g_reply->host);

    // write time stats to disk if present
    //
    if (g_request->have_time_stats_log) {
        write_time_stats_log();
    }

    // look up the client's platform(s) in the DB
    //
    platform = ssp->lookup_platform(g_request->platform.name);
    if (platform) g_request->platforms.list.push_back(platform);

    // if primary platform is anonymous, ignore alternate platforms
    //
    if (strcmp(g_request->platform.name, "anonymous")) {
        for (i=0; i<g_request->alt_platforms.size(); i++) {
            platform = ssp->lookup_platform(g_request->alt_platforms[i].name);
            if (platform) g_request->platforms.list.push_back(platform);
        }
    }
    if (g_request->platforms.list.size() == 0) {
        sprintf(buf, "%s %s",
                _("This project doesn't support computers of type"),
                g_request->platform.name
               );
        g_reply->insert_message(buf, "notice");
        log_messages.printf(MSG_CRITICAL,
                            "[HOST#%d] platform '%s' not found\n",
                            g_reply->host.id, g_request->platform.name
                           );
        g_reply->set_delay(DELAY_PLATFORM_UNSUPPORTED);
        goto leave;
    }

    handle_global_prefs();

    read_host_app_versions();
    update_n_jobs_today();

    handle_results();
    handle_file_xfer_results();
    if (config.enable_vda) {
        handle_vda();
    }

    // Do this before resending lost jobs
    //
    if (bad_install_type()) {
        ok_to_send_work = false;
    }
    if (!requesting_work()) {
        ok_to_send_work = false;
    }
    send_work_setup();

    if (g_request->have_other_results_list) {
        if (ok_to_send_work
                && (config.resend_lost_results || g_wreq->resend_lost_results)
                && !g_request->results_truncated
           ) {
            if (resend_lost_work()) {
                ok_to_send_work = false;
            }
        }
        if (config.send_result_abort) {
            send_result_abort();
        }
    }

    if (requesting_work()) {
        if (!send_code_sign_key(code_sign_key)) {
            ok_to_send_work = false;
        }

        if (have_no_work) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                                    "[send] No jobs in shmem cache\n"
                                   );
            }
        }

        // if last RPC was within config.min_sendwork_interval, don't send work
        //
        if (!have_no_work && ok_to_send_work) {
            if (config.min_sendwork_interval) {
                double diff = dtime() - last_rpc_time;
                if (diff < config.min_sendwork_interval) {
                    ok_to_send_work = false;
                    log_messages.printf(MSG_NORMAL,
                                        "Not sending work - last request too recent: %f\n", diff
                                       );
                    sprintf(buf,
                            "Not sending work - last request too recent: %d sec", (int)diff
                           );
                    g_reply->insert_message(buf, "low");

                    // the 1.01 is in case client's clock
                    // is slightly faster than ours
                    //
                    g_reply->set_delay(1.01*config.min_sendwork_interval);
                }
            }
            if (ok_to_send_work) {
                send_work();
            }
        }
        if (g_wreq->no_jobs_available) {
            g_reply->insert_message("Project has no tasks available", "low");
        }
    }

    handle_msgs_from_host();
    if (config.msg_to_host) {
        handle_msgs_to_host();
    }

    update_host_record(initial_host, g_reply->host, g_reply->user);
    write_host_app_versions();

leave:
    if (!have_no_work) {
        ssp->restore_work(g_pid);
    }
}
Example #4
// Make a pass through the wu/results array, sending work.
// The choice of jobs is limited by flags in g_wreq, as follows:
// infeasible_only:
//      send only results that were previously infeasible for some host
// reliable_only: 
//      send only retries
// user_apps_only:
//      Send only jobs for apps selected by user
// beta_only:
//      Send only jobs for beta-test apps
//
// Return true if no more work is needed.
//
static bool scan_work_array() {
    int i, j, retval, rnd_off, last_retval=0;
    APP* app;
    BEST_APP_VERSION* bavp;
    bool no_more_needed = false;
    DB_RESULT result;

    lock_sema();
    
    rnd_off = rand() % ssp->max_wu_results;
    for (j=0; j<ssp->max_wu_results; j++) {
        i = (j+rnd_off) % ssp->max_wu_results;

        WU_RESULT& wu_result = ssp->wu_results[i];
        WORKUNIT wu = wu_result.workunit;

        // do fast (non-DB) checks
        //
        if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
            continue;
        }

        // mark wu_result as checked out and release semaphore.
        // from here on in this loop, don't continue on failure;
        // instead, goto dont_send (so that we reacquire semaphore)
        //
        // Note: without the semaphore we don't have mutual exclusion;
        // ideally we should use a transaction from now until when
        // we commit to sending the results.

        wu_result.state = g_pid;
        unlock_sema();

        if (!slow_check(wu_result, wu, app)) {
            // if we couldn't send the result to this host,
            // set its state back to PRESENT
            //
            wu_result.state = WR_STATE_PRESENT;
        } else {
            result.id = wu_result.resultid;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            if (result_still_sendable(result, wu)) {
                retval = add_result_to_reply(result, wu, bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
        }
        lock_sema();
        if (!work_needed(false)) {
            no_more_needed = true;
            break;
        }
    }
    unlock_sema();
    return no_more_needed;
}
Example #5
void process_request(char* code_sign_key) {
    PLATFORM* platform;
    int retval;
    double last_rpc_time, x;
    struct tm *rpc_time_tm;
    bool ok_to_send_work = !config.dont_send_jobs;
    bool have_no_work = false;
    char buf[256];
    HOST initial_host;
    unsigned int i;
    time_t t;

    memset(&g_reply->wreq, 0, sizeof(g_reply->wreq));

    // if client has sticky files we don't need any more, tell it
    //
    do_file_delete_regex();

    // if different major version of BOINC, just send a message
    //
    if (wrong_core_client_version()
        || unacceptable_os()
        || unacceptable_cpu()
    ) {
        ok_to_send_work = false;
    }

    // if no jobs reported and none to send, return without accessing DB
    //
    if (!ok_to_send_work && !g_request->results.size()) {
        return;
    }

    warn_user_if_core_client_upgrade_scheduled();

    if (requesting_work()) {
        if (config.locality_scheduling || config.locality_scheduler_fraction || config.enable_assignment) {
            have_no_work = false;
        } else {
            lock_sema();
            have_no_work = ssp->no_work(g_pid);
            if (have_no_work) {
                g_wreq->no_jobs_available = true;
            }
            unlock_sema();
        }
    }

    // If:
    // - there's no work,
    // - a config flag is set,
    // - client isn't returning results,
    // - this isn't an initial RPC,
    // - client is requesting work
    // then return without accessing the DB.
    // This is an efficiency hack for when servers are overloaded
    //
    if (
        have_no_work
        && config.nowork_skip
        && requesting_work()
        && (g_request->results.size() == 0)
        && (g_request->hostid != 0)
    ) {
        g_reply->insert_message("No work available", "low");
        g_reply->set_delay(DELAY_NO_WORK_SKIP);
        if (!config.msg_to_host && !config.enable_vda) {
            log_messages.printf(MSG_NORMAL, "No work - skipping DB access\n");
            return;
        }
    }

    // FROM HERE ON DON'T RETURN; "goto leave" instead
    // (because ssp->no_work() may have tagged an entry in the work array
    // with our process ID)

    retval = open_database();
    if (retval) {
        send_error_message("Server can't open database", 3600);
        g_reply->project_is_down = true;
        goto leave;
    }

    retval = authenticate_user();
    if (retval) goto leave;
    if (g_reply->user.id == 0) {
        log_messages.printf(MSG_CRITICAL, "No user ID!\n");
    }
    initial_host = g_reply->host;
    g_reply->host.rpc_seqno = g_request->rpc_seqno;

    g_reply->nucleus_only = false;

    log_request();

#if 0
    // if you need to debug a problem w/ a particular host or user,
    // edit the following
    //
    if (g_reply->user.id == XX || g_reply->host.id == YY) {
        config.sched_debug_level = 3;
        config.debug_send = true;
        ...
    }
#endif
Example #6
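// "Matchmaker" scheduling: scan the shared-memory job cache,
// maintaining a score-ordered set (JOB_SET) of jobs that can be
// sent to this client, then send the best of them.
//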
void send_work_matchmaker() {
    int i, slots_locked=0, slots_nonempty=0;
    JOB_SET jobs;
    int min_slots = config.mm_min_slots;
    if (!min_slots) min_slots = ssp->max_wu_results/2;
    int max_slots = config.mm_max_slots;
    if (!max_slots) max_slots = ssp->max_wu_results;
    int max_locked = 10;

    lock_sema();
    i = rand() % ssp->max_wu_results;

    // scan through the job cache, maintaining a JOB_SET of jobs
    // that we can send to this client, ordered by score.
    //
    for (int slots_scanned=0; slots_scanned<max_slots; slots_scanned++) {
        i = (i+1) % ssp->max_wu_results;
        WU_RESULT& wu_result = ssp->wu_results[i];
        switch (wu_result.state) {
        case WR_STATE_EMPTY:
            continue;
        case WR_STATE_PRESENT:
            slots_nonempty++;
            break;
        default:
            slots_nonempty++;
            if (wu_result.state == g_pid) break;
            slots_locked++;
            continue;
        }

        JOB job;
        job.index = i;

        // get score for this job, and skip it if it fails quick check.
        // NOTE: the EDF check done in get_score()
        // includes only in-progress jobs.
        //
        if (!job.get_score()) {
            continue;
        }
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] score for %s: %f\n", wu_result.workunit.name, job.score
            );
        }

        if (job.score > jobs.lowest_score() || !jobs.request_satisfied()) {
            ssp->wu_results[i].state = g_pid;
            unlock_sema();
            if (wu_is_infeasible_slow(wu_result, *g_request, *g_reply)) {
                // if we can't use this job, put it back in pool
                //
                lock_sema();
                ssp->wu_results[i].state = WR_STATE_PRESENT;
                continue;
            }
            lock_sema();
            jobs.add_job(job);
        }

        if (jobs.request_satisfied() && slots_scanned>=min_slots) break;
    }

    if (!slots_nonempty) {
        log_messages.printf(MSG_CRITICAL,
            "Job cache is empty - check feeder\n"
        );
        g_wreq->no_jobs_available = true;
    }

    // TODO: trim jobs from tail of list until we pass the EDF check
    //
    jobs.send();
    unlock_sema();
    if (slots_locked > max_locked) {
        log_messages.printf(MSG_CRITICAL,
            "Found too many locked slots (%d>%d) - increase array size\n",
            slots_locked, max_locked
        );
    }
}
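JOB_SET above keeps the best-scoring candidate jobs seen so far, so lowest_score() is the eviction threshold used in the scan. A sketch of that best-N idea with a multiset (illustrative only; not BOINC's JOB_SET, which also tracks whether the work request is satisfied):

#include <set>

struct JOB { int index; double score; };

struct BY_SCORE {
    bool operator()(const JOB& a, const JOB& b) const {
        return a.score < b.score;       // ascending: worst job first
    }
};

// Keep at most 'capacity' best-scoring jobs (capacity is hypothetical).
struct JOB_SET_SKETCH {
    unsigned capacity = 5;
    std::multiset<JOB, BY_SCORE> jobs;

    double lowest_score() const {
        return jobs.empty() ? 0 : jobs.begin()->score;
    }
    void add_job(const JOB& j) {
        jobs.insert(j);
        if (jobs.size() > capacity) {
            jobs.erase(jobs.begin());   // evict the worst-scoring job
        }
    }
};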
Example #7
// send work for a particular processor type
//
void send_work_score_type(int rt) {
    vector<JOB> jobs;

    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] scanning for %s jobs\n", proc_type_name(rt)
        );
    }

    clear_others(rt);

    int nscan = config.mm_max_slots;
    if (!nscan) nscan = ssp->max_wu_results;
    int rnd_off = rand() % ssp->max_wu_results;
    for (int j=0; j<nscan; j++) {
        int i = (j+rnd_off) % ssp->max_wu_results;
        WU_RESULT& wu_result = ssp->wu_results[i];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        JOB job;
        job.app = ssp->lookup_app(wu.appid);
        if (!job.app) continue;     // app lookup failed; shouldn't happen
        if (job.app->non_cpu_intensive) continue;
        job.bavp = get_app_version(wu, true, false);
        if (!job.bavp) continue;

        job.index = i;
        job.result_id = wu_result.resultid;
        if (!job.get_score(wu_result)) {
            continue;
        }
        jobs.push_back(job);
    }

    // sort candidate jobs by score, best first
    //
    std::sort(jobs.begin(), jobs.end(), job_compare);

    bool sema_locked = false;
    for (unsigned int i=0; i<jobs.size(); i++) {
        if (!work_needed(false)) {
            break;
        }
        if (!g_wreq->need_proc_type(rt)) {
            break;
        }
        JOB& job = jobs[i];
        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
        }

        // make sure the job is still in the cache
        // array is locked at this point.
        //
        WU_RESULT& wu_result = ssp->wu_results[job.index];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        if (wu_result.resultid != job.result_id) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        int retval = wu_is_infeasible_fast(
            wu,
            wu_result.res_server_state, wu_result.res_priority,
            wu_result.res_report_deadline,
            *job.app,
            *job.bavp
        );

        if (retval) {
            continue;
        }
        wu_result.state = g_pid;

        // It passed fast checks.
        // Release sema and do slow checks
        //
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, job.app, job.bavp)) {
        case 1:
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            SCHED_DB_RESULT result;
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, job.bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }
    }
    if (sema_locked) {
        unlock_sema();
    }

    restore_others(rt);
    g_wreq->best_app_versions.clear();
}