Example 1
// Decide whether this non-CPU-intensive (NCI) job can be sent to the host:
// there must be a usable app version, and the job must pass the fast
// (no-DB) feasibility checks.  On success, bavp is set to the chosen version.
//
static bool can_send_nci(
    WU_RESULT& wu_result,
    WORKUNIT& wu,
    BEST_APP_VERSION* &bavp,
    APP* app
) {
    // pick the best app version for this (host, WU) pair
    bavp = get_app_version(wu, true, false);
    if (bavp == NULL) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu] No app version for NCI job; skipping\n",
                wu.id
            );
        }
        return false;
    }

    // fast feasibility checks (disk, memory, deadline, etc.)
    int infeasible = wu_is_infeasible_fast(
        wu,
        wu_result.res_server_state, wu_result.res_priority,
        wu_result.res_report_deadline,
        *app, *bavp
    );
    if (infeasible) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu] wu_is_infeasible_fast() failed for NCI job; skipping\n",
                wu.id
            );
        }
        return false;
    }

    return true;
}
Example 2
// Try to send the client this result.
// This can fail because:
// - result needs more disk/mem/speed than host has
// - already sent a result for this WU
// - no app_version available
//
static int possibly_send_result(SCHED_DB_RESULT& result) {
    DB_WORKUNIT wu;
    SCHED_DB_RESULT result2;
    int retval;
    long count;
    char buf[256];
    BEST_APP_VERSION* bavp;

    g_wreq->no_jobs_available = false;

    retval = wu.lookup_id(result.workunitid);
    if (retval) return ERR_DB_NOT_FOUND;

    // This doesn't take into account g_wreq->allow_non_selected_apps,
    // however Einstein@Home, which is the only project that currently uses
    // this locality scheduler, doesn't support the respective project-specific
    // preference setting
    //
    if (app_not_selected(wu.appid)) return ERR_NO_APP_VERSION;

    bavp = get_app_version(wu, true, false);

    // Anonymous-platform host with no usable app version:
    // tell the user how to get unstuck, and back off.
    //
    if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) {
        char help_msg_buf[512];
        sprintf(help_msg_buf,
            "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.",
            config.long_name
        );
        g_reply->insert_message(help_msg_buf, "notice");
        g_reply->set_delay(DELAY_ANONYMOUS);
    }

    if (!bavp) return ERR_NO_APP_VERSION;

    APP* app = ssp->lookup_app(wu.appid);
    if (!app) return ERR_NO_APP_VERSION;    // shouldn't happen; avoid NULL deref below

    // BUGFIX: wu_is_infeasible_fast() takes
    // (server_state, priority, report_deadline) — see the other call sites.
    // The previous code swapped report_deadline and priority.
    //
    retval = wu_is_infeasible_fast(
        wu, result.server_state, result.priority, result.report_deadline,
        *app, *bavp
    );
    if (retval) return retval;

    // enforce the one-result-per-user-per-WU policy if configured
    //
    if (config.one_result_per_user_per_wu) {
        sprintf(buf, "where userid=%lu and workunitid=%lu", g_reply->user.id, wu.id);
        retval = result2.count(count, buf);
        if (retval) return ERR_DB_NOT_FOUND;
        if (count > 0) return ERR_WU_USER_RULE;
    }

    return add_result_to_reply(result, wu, bavp, true);
}
Example 3
// Try to send the client this result.
// This can fail because:
// - result needs more disk/mem/speed than host has
// - already sent a result for this WU
// - no app_version available
//
static int possibly_send_result(SCHED_DB_RESULT& result) {
    DB_WORKUNIT wu;
    SCHED_DB_RESULT result2;
    int retval, count;
    char buf[256];
    BEST_APP_VERSION* bavp;

    g_wreq->no_jobs_available = false;

    retval = wu.lookup_id(result.workunitid);
    if (retval) return ERR_DB_NOT_FOUND;

    bavp = get_app_version(wu, true, false);

    // Anonymous-platform host with no usable app version:
    // tell the user how to get unstuck, and back off.
    //
    if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) {
        char help_msg_buf[512];
        sprintf(help_msg_buf,
            "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.",
            config.long_name
        );
        g_reply->insert_message(help_msg_buf, "notice");
        g_reply->set_delay(DELAY_ANONYMOUS);
    }

    if (!bavp) return ERR_NO_APP_VERSION;

    APP* app = ssp->lookup_app(wu.appid);
    if (!app) return ERR_NO_APP_VERSION;    // shouldn't happen; avoid NULL deref below

    // BUGFIX: wu_is_infeasible_fast() takes
    // (server_state, priority, report_deadline) — see the other call sites.
    // The previous code swapped report_deadline and priority.
    //
    retval = wu_is_infeasible_fast(
        wu, result.server_state, result.priority, result.report_deadline,
        *app, *bavp
    );
    if (retval) return retval;

    // enforce the one-result-per-user-per-WU policy if configured
    //
    if (config.one_result_per_user_per_wu) {
        sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id);
        retval = result2.count(count, buf);
        if (retval) return ERR_DB_NOT_FOUND;
        if (count > 0) return ERR_WU_USER_RULE;
    }

    return add_result_to_reply(result, wu, bavp, true);
}
// Try to send the client this result.
// This can fail because:
// - result needs more disk/mem/speed than host has
// - already sent a result for this WU
// - no app_version available
//
static int possibly_send_result(DB_RESULT& result) {
    DB_WORKUNIT wu;
    DB_RESULT other_result;
    char where_clause[256];
    BEST_APP_VERSION* bavp;
    int count;

    if (wu.lookup_id(result.workunitid)) {
        return ERR_DB_NOT_FOUND;
    }

    bavp = get_app_version(wu, true);

    if (!bavp) {
        // anonymous-platform host with no usable app version:
        // tell the user how to get unstuck, then bail out
        //
        if (anonymous(g_request->platforms.list[0])) {
            char msg[512];
            sprintf(msg,
                "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.",
                config.long_name
            );
            g_reply->insert_message(USER_MESSAGE(msg, "high"));
            g_reply->set_delay(DELAY_ANONYMOUS);
        }
        return ERR_NO_APP_VERSION;
    }

    APP* app = ssp->lookup_app(wu.appid);
    if (wu_is_infeasible_fast(wu, *app, *bavp)) {
        return ERR_INSUFFICIENT_RESOURCE;
    }

    // enforce the one-result-per-user-per-WU policy if configured
    //
    if (config.one_result_per_user_per_wu) {
        sprintf(where_clause, "where userid=%d and workunitid=%d",
            g_reply->user.id, wu.id
        );
        if (other_result.count(count, where_clause)) {
            return ERR_DB_NOT_FOUND;
        }
        if (count > 0) {
            return ERR_WU_USER_RULE;
        }
    }

    return add_result_to_reply(result, wu, bavp, true);
}
Example 5
// do fast checks on this job, i.e. ones that don't require DB access
// if any check fails, return false
//
// Side effects on success:
// - bavp is set to the best app version for this host
// - wu's delay_bound/rsc_fpops_est/rsc_fpops_bound may be adjusted
//   by wu_is_infeasible_fast()
// - last_retval is updated with the latest infeasibility code
//   (used only to avoid logging the same reason repeatedly)
//
static bool quick_check(
    WU_RESULT& wu_result,
    WORKUNIT& wu,       // a mutable copy of wu_result.workunit.
        // We may modify its delay_bound, rsc_fpops_est, and rsc_fpops_bound
    BEST_APP_VERSION* &bavp,
    APP* app,
    int& last_retval
) {
    int retval;

    // If we're looking for beta jobs and this isn't one, skip it
    // (and conversely: in a non-beta scan, skip beta jobs)
    //
    if (g_wreq->beta_only) {
        if (!app->beta) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] job is not from beta app; skipping\n"
                );
            }
            return false;
        }
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [HOST#%lu] beta work found: [RESULT#%lu]\n",
                g_reply->host.id, wu_result.resultid
            );
        }
    } else {
        if (app->beta) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] job is from beta app; skipping\n"
                );
            }
            return false;
        }
    }

    // Are we scanning for need_reliable results?
    // Skip this check if the app is beta
    // (beta apps don't use the reliable mechanism)
    //
    if (!app->beta) {
        if (g_wreq->reliable_only && (!wu_result.need_reliable)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] job doesn't need reliable host; skipping\n"
                );
            }
            return false;
        } else if (!g_wreq->reliable_only && wu_result.need_reliable) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] job needs reliable host; skipping\n"
                );
            }
            return false;
        }
    }

    // don't send if we are looking for infeasible results
    // and the result is not infeasible
    //
    if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] job is not infeasible; skipping\n"
            );
        }
        return false;
    }

    // locality sched lite check.
    // Allow non-LSL jobs; otherwise we could starve them
    // NOTE: THIS NEGATES THE OTHER SCHED POLICIES (reliable, etc.).
    // Need to think of some way of combining them.
    //
    if (g_wreq->locality_sched_lite) {
        // skip this job if host has sticky files
        // but none of them is used by this job.
        // TODO: it should really be "host has sticky files for this app".
        // However, we don't have a way of making that association.
        // Could add something based on filename
        //
        if (app->locality_scheduling == LOCALITY_SCHED_LITE
            && g_request->file_infos.size()
        ) {
            int n = nfiles_on_host(wu_result.workunit);
            if (config.debug_locality_lite) {
                log_messages.printf(MSG_NORMAL,
                    "[loc_lite] job %s has %d files on this host\n",
                    wu_result.workunit.name, n
                );
            }
            if (n == 0) {
                return false;
            }
        }
    }

    // Find the best app_version for this host.
    //
    bavp = get_app_version(wu, true, g_wreq->reliable_only);
    if (!bavp) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] No app version for job; skipping\n"
            );
        }
        return false;
    }

    // Check app filter if needed.
    // Do this AFTER get_app_version(), otherwise we could send
    // a misleading message to user
    //
    if (g_wreq->user_apps_only &&
        (!g_wreq->beta_only || config.distinct_beta_apps)
    ) {
        if (app_not_selected(app->id)) {
            g_wreq->no_allowed_apps_available = true;
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [USER#%lu] [WU#%lu] user doesn't want work for app %s\n",
                    g_reply->user.id, wu.id, app->name
                );
            }
            return false;
        }
    }

    // Check whether we can send this job.
    // This may modify wu.delay_bound and wu.rsc_fpops_est
    //
    retval = wu_is_infeasible_fast(
        wu,
        wu_result.res_server_state, wu_result.res_priority,
        wu_result.res_report_deadline,
        *app, *bavp
    );
    if (retval) {
        // log the detailed reason only when it differs from the previous
        // job's reason, to keep the log readable
        //
        if (retval != last_retval && config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [HOST#%lu] [WU#%lu %s] WU is infeasible: %s\n",
                g_reply->host.id, wu.id, wu.name, infeasible_string(retval)
            );
        }
        last_retval = retval;
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] is_infeasible_fast() failed; skipping\n"
            );
        }
        return false;
    }
    return true;
}
Example 6
// resend any jobs that:
// 1) we already sent to this host;
// 2) are still in progress (i.e. haven't timed out) and
// 3) aren't present on the host
// Return true if there were any such jobs
//
bool resend_lost_work() {
    SCHED_DB_RESULT result;
    std::vector<DB_RESULT>results;
    unsigned int i;
    char buf[256];
    char warning_msg[256];
    bool did_any = false;
    int num_eligible_to_resend=0;
    int num_resent=0;
    BEST_APP_VERSION* bavp = NULL;
    APP* app = NULL;
    int retval;

    // enumerate all in-progress results previously sent to this host
    //
    sprintf(buf, " where hostid=%d and server_state=%d ",
        g_reply->host.id, RESULT_SERVER_STATE_IN_PROGRESS
    );
    while (!result.enumerate(buf)) {
        if (!work_needed(false)) {
            result.end_enumerate();
            break;
        }

        // skip results the client says it still has
        //
        bool found = false;
        for (i=0; i<g_request->other_results.size(); i++) {
            OTHER_RESULT& orp = g_request->other_results[i];
            if (!strcmp(orp.name, result.name)) {
                found = true;
                break;
            }
        }
        if (found) continue;

        num_eligible_to_resend++;
        if (config.debug_resend) {
            log_messages.printf(MSG_NORMAL,
                "[resend] [HOST#%d] found lost [RESULT#%u]: %s\n",
                g_reply->host.id, result.id, result.name
            );
        }

        // run a series of checks; the first one that fails
        // clears can_resend and short-circuits the rest
        //
        DB_WORKUNIT wu;
        bool can_resend = true;
        retval = wu.lookup_id(result.workunitid);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] can't resend - WU not found for [RESULT#%u]\n",
                g_reply->host.id, result.id
            );
            can_resend = false;
        }
        if (can_resend) {
            // NOTE(review): app may be NULL if lookup_app() fails;
            // app->name below would then be a NULL deref — confirm upstream
            app = ssp->lookup_app(wu.appid);
            bavp = get_app_version(wu, true, false);
            if (!bavp) {
                if (config.debug_resend) {
                    log_messages.printf(MSG_NORMAL,
                        "[HOST#%d] can't resend [RESULT#%u]: no app version for %s\n",
                        g_reply->host.id, result.id, app->name
                    );
                }
                can_resend = false;
            }
        }
        if (can_resend && wu.error_mask) {
            if (config.debug_resend) {
                log_messages.printf(MSG_NORMAL,
                    "[resend] skipping [RESULT#%u]: WU error mask %d\n",
                    result.id, wu.error_mask
                );
            }
            can_resend = false;
        }
        if (can_resend && wu.canonical_resultid) {
            if (config.debug_resend) {
                log_messages.printf(MSG_NORMAL,
                    "[resend] skipping [RESULT#%u]: already have canonical result\n",
                    result.id
                );
            }
            can_resend = false;
        }
        if (can_resend && wu_is_infeasible_fast(
            wu, result.server_state, result.priority, result.report_deadline,
            *app, *bavp
        )) {
            if (config.debug_resend) {
                log_messages.printf(MSG_NORMAL,
                    "[resend] skipping [RESULT#%u]: feasibility check failed\n",
                    result.id
                );
            }
            can_resend = false;
        }
        if (can_resend && possibly_give_result_new_deadline(result, wu, *bavp)) {
            if (config.debug_resend) {
                log_messages.printf(MSG_NORMAL,
                    "[resend] skipping [RESULT#%u]: deadline assignment failed\n",
                    result.id
                );
            }
            can_resend = false;
        }

        // If we can't resend this job for any of the above reasons,
        // make it time out so that the transitioner does the right thing.
        //
        if (!can_resend) {
            result.report_deadline = time(0)-1;
            retval = result.mark_as_sent(result.server_state, config.report_grace_period);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "resend_lost_work: can't update result deadline: %s\n",
                    boincerror(retval)
                );
                continue;
            }

            // NOTE(review): if we got here because wu.lookup_id() failed or
            // bavp is NULL, then wu is uninitialized and *app/*bavp below
            // dereference NULL or stale pointers from a previous loop
            // iteration — confirm against upstream behavior
            //
            retval = update_wu_on_send(
                wu, result.report_deadline + config.report_grace_period,
                *app, *bavp
            );
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "resend_lost_result: can't update WU transition time: %s\n",
                    boincerror(retval)
                );
                continue;
            }
            sprintf(warning_msg,
                "Didn't resend lost task %s (expired)", result.name
            );
            g_reply->insert_message(warning_msg, "low");
        } else {
            retval = add_result_to_reply(result, wu, bavp, false);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[HOST#%d] failed to send [RESULT#%u]\n",
                    g_reply->host.id, result.id
                );
                continue;
            }
            sprintf(warning_msg, "Resent lost task %s", result.name);
            g_reply->insert_message(warning_msg, "low");
            num_resent++;
            did_any = true;

            // stop once we've hit the per-request job limit
            //
            if (g_wreq->njobs_sent >= config.max_wus_to_send) {
                result.end_enumerate();
                break;
            }
        }
    }

    if (num_eligible_to_resend && config.debug_resend) {
        log_messages.printf(MSG_NORMAL,
            "[resend] [HOST#%d] %d lost results, resent %d\n",
            g_reply->host.id, num_eligible_to_resend, num_resent
        );
    }

    return did_any;
}
// do fast checks on this job, i.e. ones that don't require DB access
// if any check fails, return false
//
// On success: app and bavp are set; last_retval holds the latest
// infeasibility code (used only to de-duplicate log output).
//
static bool quick_check(
    WU_RESULT& wu_result, WORKUNIT& wu, BEST_APP_VERSION* &bavp,
    APP* &app, int& last_retval
) {
    int retval;

    // skip slots that are empty or claimed by another scheduler process
    //
    if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
        return false;
    }

    app = ssp->lookup_app(wu_result.workunit.appid);
    if (app == NULL) {
        return false; // this should never happen
    }

    g_wreq->no_jobs_available = false;

    // If we're looking for beta jobs and this isn't one, skip it
    // (and conversely: in a non-beta scan, skip beta jobs)
    //
    if (g_wreq->beta_only) {
        if (!app->beta) {
            return false;
        }
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [HOST#%d] beta work found: [RESULT#%d]\n",
                g_reply->host.id, wu_result.resultid
            );
        }
    } else {
        if (app->beta) {
            return false;
        }
    }

    // If this is a reliable host and we are checking for results that
    // need a reliable host, then continue if the result is a normal result
    // skip if the app is beta (beta apps don't use the reliable mechanism)
    //
    if (!app->beta) {
        if (g_wreq->reliable_only && (!wu_result.need_reliable)) {
            return false;
        } else if (!g_wreq->reliable_only && wu_result.need_reliable) {
            return false;
        }
    }

    // don't send if we are looking for infeasible results
    // and the result is not infeasible
    //
    if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) {
        return false;
    }

    // check app filter if needed
    //
    if (g_wreq->user_apps_only &&
        (!g_wreq->beta_only || config.distinct_beta_apps)
    ) {
        if (app_not_selected(wu)) {
            g_wreq->no_allowed_apps_available = true;
#if 0
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [USER#%d] [WU#%d] user doesn't want work for app %s\n",
                    g_reply->user.id, wu.id, app->name
                );
            }
#endif
            return false;
        }
    }

    // Find the app and best app_version for this host.
    //
    bavp = get_app_version(wu, true, g_wreq->reliable_only);
    if (!bavp) {
        if (config.debug_array) {
            log_messages.printf(MSG_NORMAL,
                "[array] No app version\n"
            );
        }
        return false;
    }

    // don't send job if host can't handle it
    //
    retval = wu_is_infeasible_fast(
        wu,
        wu_result.res_server_state, wu_result.res_priority,
        wu_result.res_report_deadline,
        *app, *bavp
    );
    if (retval) {
        // log the detailed reason only when it differs from the previous
        // job's reason, to keep the log readable
        //
        if (retval != last_retval && config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
                g_reply->host.id, wu.id, wu.name, infeasible_string(retval)
            );
        }
        last_retval = retval;
        if (config.debug_array) {
            log_messages.printf(MSG_NORMAL, "[array] infeasible\n");
        }
        return false;
    }
    return true;
}
Example 8
// send work for a particular processor type
//
// Two phases:
// 1) scan (a random window of) the shared-memory job array without
//    the semaphore, scoring each candidate job;
// 2) sort by score, then re-check each job under the semaphore
//    before committing it to the reply.
//
void send_work_score_type(int rt) {
    vector<JOB> jobs;

    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] scanning for %s jobs\n", proc_type_name(rt)
        );
    }

    clear_others(rt);

    // scan up to nscan slots, starting at a random offset
    // so different requests see different jobs
    //
    int nscan = config.mm_max_slots;
    if (!nscan) nscan = ssp->max_wu_results;
    int rnd_off = rand() % ssp->max_wu_results;
    for (int j=0; j<nscan; j++) {
        int i = (j+rnd_off) % ssp->max_wu_results;
        WU_RESULT& wu_result = ssp->wu_results[i];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        JOB job;
        // NOTE(review): lookup_app() result is dereferenced without a
        // NULL check — confirm it can't fail for jobs in the array
        job.app = ssp->lookup_app(wu.appid);
        if (job.app->non_cpu_intensive) continue;
        job.bavp = get_app_version(wu, true, false);
        if (!job.bavp) continue;

        job.index = i;
        job.result_id = wu_result.resultid;
        if (!job.get_score(wu_result)) {
            continue;
        }
        jobs.push_back(job);
    }

    // best-scoring jobs first
    //
    std::sort(jobs.begin(), jobs.end(), job_compare);

    bool sema_locked = false;
    for (unsigned int i=0; i<jobs.size(); i++) {
        if (!work_needed(false)) {
            break;
        }
        if (!g_wreq->need_proc_type(rt)) {
            break;
        }
        JOB& job = jobs[i];
        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
        }

        // make sure the job is still in the cache
        // array is locked at this point.
        //
        WU_RESULT& wu_result = ssp->wu_results[job.index];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        if (wu_result.resultid != job.result_id) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        int retval = wu_is_infeasible_fast(
            wu,
            wu_result.res_server_state, wu_result.res_priority,
            wu_result.res_report_deadline,
            *job.app,
            *job.bavp
        );

        if (retval) {
            continue;
        }
        // claim the slot (tagged with our PID) so other scheduler
        // processes skip it while we do the slow checks
        //
        wu_result.state = g_pid;

        // It passed fast checks.
        // Release sema and do slow checks
        //
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, job.app, job.bavp)) {
        case 1:
            // slow check failed; put the job back in the array
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            // job can't be sent to anyone; drop it from the array
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            SCHED_DB_RESULT result;
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, job.bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }
    }
    if (sema_locked) {
        unlock_sema();
    }

    restore_others(rt);
    g_wreq->best_app_versions.clear();
}