// send a job for the given assignment // static int send_assigned_job(ASSIGNMENT& asg) { int retval; DB_WORKUNIT wu; char suffix[256], path[MAXPATHLEN]; const char *rtfpath; static bool first=true; static int seqno=0; static R_RSA_PRIVATE_KEY key; BEST_APP_VERSION* bavp; if (first) { first = false; sprintf(path, "%s/upload_private", config.key_dir); retval = read_key_file(path, key); if (retval) { log_messages.printf(MSG_CRITICAL, "can't read key\n"); return -1; } } retval = wu.lookup_id(asg.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "assigned WU %d not found\n", asg.workunitid ); return retval; } bavp = get_app_version(wu, false, false); if (!bavp) { log_messages.printf(MSG_CRITICAL, "App version for assigned WU not found\n" ); return ERR_NOT_FOUND; } rtfpath = config.project_path("%s", wu.result_template_file); sprintf(suffix, "%d_%d_%d", getpid(), (int)time(0), seqno++); retval = create_result( wu, const_cast<char*>(rtfpath), suffix, key, config, 0, 0 ); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%d %s] create_result(): %s\n", wu.id, wu.name, boincerror(retval) ); return retval; } int result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); add_result_to_reply(result, wu, bavp, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n", wu.id, result_id, g_reply->host.id, asg.id ); } return 0; }
// Decide whether an "NCI job" (as the log messages call it)
// can be sent to this host.
//
// Stores the app version chosen by get_app_version() in bavp
// (NULL if there is none), then runs wu_is_infeasible_fast().
// Returns true only if an app version exists and that check returns zero.
//
static bool can_send_nci(
    WU_RESULT& wu_result,
    WORKUNIT& wu,
    BEST_APP_VERSION* &bavp,
    APP* app
) {
    bavp = get_app_version(wu, true, false);
    if (bavp == NULL) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu] No app version for NCI job; skipping\n",
                wu.id
            );
        }
        return false;
    }

    const int infeasible = wu_is_infeasible_fast(
        wu,
        wu_result.res_server_state,
        wu_result.res_priority,
        wu_result.res_report_deadline,
        *app,
        *bavp
    );
    if (infeasible == 0) {
        return true;
    }

    if (config.debug_send_job) {
        log_messages.printf(MSG_NORMAL,
            "[send_job] [WU#%lu] wu_is_infeasible_fast() failed for NCI job; skipping\n",
            wu.id
        );
    }
    return false;
}
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval; long count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; // This doesn't take into account g_wreq->allow_non_selected_apps, // however Einstein@Home, which is the only project that currently uses // this locality scheduler, doesn't support the respective project-specific // preference setting // if (app_not_selected(wu.appid)) return ERR_NO_APP_VERSION; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%lu and workunitid=%lu", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(DB_RESULT& result) { DB_WORKUNIT wu; DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true); if (!bavp && anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(USER_MESSAGE(help_msg_buf, "high")); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); if (wu_is_infeasible_fast(wu, *app, *bavp)) { return ERR_INSUFFICIENT_RESOURCE; } if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// do fast checks on this job, i.e. ones that don't require DB access // if any check fails, return false // static bool quick_check( WU_RESULT& wu_result, WORKUNIT& wu, // a mutable copy of wu_result.workunit. // We may modify its delay_bound, rsc_fpops_est, and rsc_fpops_bound BEST_APP_VERSION* &bavp, APP* app, int& last_retval ) { int retval; // If we're looking for beta jobs and this isn't one, skip it // if (g_wreq->beta_only) { if (!app->beta) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] job is not from beta app; skipping\n" ); } return false; } if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] [HOST#%lu] beta work found: [RESULT#%lu]\n", g_reply->host.id, wu_result.resultid ); } } else { if (app->beta) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] job is from beta app; skipping\n" ); } return false; } } // Are we scanning for need_reliable results? // skip this check the app is beta // (beta apps don't use the reliable mechanism) // if (!app->beta) { if (g_wreq->reliable_only && (!wu_result.need_reliable)) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] job doesn't need reliable host; skipping\n" ); } return false; } else if (!g_wreq->reliable_only && wu_result.need_reliable) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] job needs reliable host; skipping\n" ); } return false; } } // don't send if we are looking for infeasible results // and the result is not infeasible // if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] job is not infeasible; skipping\n" ); } return false; } // locality sched lite check. // Allow non-LSL jobs; otherwise we could starve them // NOTE: THIS NEGATES THE OTHER SCHED POLICIES (reliable, etc.). // Need to think of some way of combining them. 
// if (g_wreq->locality_sched_lite) { // skip this job if host has sticky files // but none of them is used by this job. // TODO: it should really be "host has sticky files for this app". // However, we don't have a way of making that association. // Could add something based on filename // if (app->locality_scheduling == LOCALITY_SCHED_LITE && g_request->file_infos.size() ) { int n = nfiles_on_host(wu_result.workunit); if (config.debug_locality_lite) { log_messages.printf(MSG_NORMAL, "[loc_lite] job %s has %d files on this host\n", wu_result.workunit.name, n ); } if (n == 0) { return false; } } } // Find the best app_version for this host. // bavp = get_app_version(wu, true, g_wreq->reliable_only); if (!bavp) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] No app version for job; skipping\n" ); } return false; } // Check app filter if needed. // Do this AFTER get_app_version(), otherwise we could send // a misleading message to user // if (g_wreq->user_apps_only && (!g_wreq->beta_only || config.distinct_beta_apps) ) { if (app_not_selected(app->id)) { g_wreq->no_allowed_apps_available = true; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] [USER#%lu] [WU#%lu] user doesn't want work for app %s\n", g_reply->user.id, wu.id, app->name ); } return false; } } // Check whether we can send this job. // This may modify wu.delay_bound and wu.rsc_fpops_est // retval = wu_is_infeasible_fast( wu, wu_result.res_server_state, wu_result.res_priority, wu_result.res_report_deadline, *app, *bavp ); if (retval) { if (retval != last_retval && config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] [HOST#%lu] [WU#%lu %s] WU is infeasible: %s\n", g_reply->host.id, wu.id, wu.name, infeasible_string(retval) ); } last_retval = retval; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] is_infeasible_fast() failed; skipping\n" ); } return false; } return true; }
// resend any jobs that: // 1) we already sent to this host; // 2) are still in progress (i.e. haven't timed out) and // 3) aren't present on the host // Return true if there were any such jobs // bool resend_lost_work() { SCHED_DB_RESULT result; std::vector<DB_RESULT>results; unsigned int i; char buf[256]; char warning_msg[256]; bool did_any = false; int num_eligible_to_resend=0; int num_resent=0; BEST_APP_VERSION* bavp = NULL; APP* app = NULL; int retval; sprintf(buf, " where hostid=%d and server_state=%d ", g_reply->host.id, RESULT_SERVER_STATE_IN_PROGRESS ); while (!result.enumerate(buf)) { if (!work_needed(false)) { result.end_enumerate(); break; } bool found = false; for (i=0; i<g_request->other_results.size(); i++) { OTHER_RESULT& orp = g_request->other_results[i]; if (!strcmp(orp.name, result.name)) { found = true; break; } } if (found) continue; num_eligible_to_resend++; if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] found lost [RESULT#%u]: %s\n", g_reply->host.id, result.id, result.name ); } DB_WORKUNIT wu; bool can_resend = true; retval = wu.lookup_id(result.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] can't resend - WU not found for [RESULT#%u]\n", g_reply->host.id, result.id ); can_resend = false; } if (can_resend) { app = ssp->lookup_app(wu.appid); bavp = get_app_version(wu, true, false); if (!bavp) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[HOST#%d] can't resend [RESULT#%u]: no app version for %s\n", g_reply->host.id, result.id, app->name ); } can_resend = false; } } if (can_resend && wu.error_mask) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: WU error mask %d\n", result.id, wu.error_mask ); } can_resend = false; } if (can_resend && wu.canonical_resultid) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: already have canonical result\n", result.id ); } can_resend = false; } if 
(can_resend && wu_is_infeasible_fast( wu, result.server_state, result.priority, result.report_deadline, *app, *bavp )) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: feasibility check failed\n", result.id ); } can_resend = false; } if (can_resend && possibly_give_result_new_deadline(result, wu, *bavp)) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: deadline assignment failed\n", result.id ); } can_resend = false; } // If we can't resend this job for any of the above reasons, // make it time out so that the transitioner does the right thing. // if (!can_resend) { result.report_deadline = time(0)-1; retval = result.mark_as_sent(result.server_state, config.report_grace_period); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_work: can't update result deadline: %s\n", boincerror(retval) ); continue; } retval = update_wu_on_send( wu, result.report_deadline + config.report_grace_period, *app, *bavp ); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_result: can't update WU transition time: %s\n", boincerror(retval) ); continue; } sprintf(warning_msg, "Didn't resend lost task %s (expired)", result.name ); g_reply->insert_message(warning_msg, "low"); } else { retval = add_result_to_reply(result, wu, bavp, false); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] failed to send [RESULT#%u]\n", g_reply->host.id, result.id ); continue; } sprintf(warning_msg, "Resent lost task %s", result.name); g_reply->insert_message(warning_msg, "low"); num_resent++; did_any = true; if (g_wreq->njobs_sent >= config.max_wus_to_send) { result.end_enumerate(); break; } } } if (num_eligible_to_resend && config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] %d lost results, resent %d\n", g_reply->host.id, num_eligible_to_resend, num_resent ); } return did_any; }
// do fast checks on this job, i.e. ones that don't require DB access // if any check fails, return false // static bool quick_check( WU_RESULT& wu_result, WORKUNIT& wu, BEST_APP_VERSION* &bavp, APP* &app, int& last_retval ) { int retval; if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) { return false; } app = ssp->lookup_app(wu_result.workunit.appid); if (app == NULL) { return false; // this should never happen } g_wreq->no_jobs_available = false; // If we're looking for beta jobs and this isn't one, skip it // if (g_wreq->beta_only) { if (!app->beta) { return false; } if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] [HOST#%d] beta work found: [RESULT#%d]\n", g_reply->host.id, wu_result.resultid ); } } else { if (app->beta) { return false; } } // If this is a reliable host and we are checking for results that // need a reliable host, then continue if the result is a normal result // skip if the app is beta (beta apps don't use the reliable mechanism) // if (!app->beta) { if (g_wreq->reliable_only && (!wu_result.need_reliable)) { return false; } else if (!g_wreq->reliable_only && wu_result.need_reliable) { return false; } } // don't send if we are looking for infeasible results // and the result is not infeasible // if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) { return false; } // check app filter if needed // if (g_wreq->user_apps_only && (!g_wreq->beta_only || config.distinct_beta_apps) ) { if (app_not_selected(wu)) { g_wreq->no_allowed_apps_available = true; #if 0 if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] [USER#%d] [WU#%d] user doesn't want work for app %s\n", g_reply->user.id, wu.id, app->name ); } #endif return false; } } // Find the app and best app_version for this host. 
// bavp = get_app_version(wu, true, g_wreq->reliable_only); if (!bavp) { if (config.debug_array) { log_messages.printf(MSG_NORMAL, "[array] No app version\n" ); } return false; } // don't send job if host can't handle it // retval = wu_is_infeasible_fast( wu, wu_result.res_server_state, wu_result.res_priority, wu_result.res_report_deadline, *app, *bavp ); if (retval) { if (retval != last_retval && config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] [HOST#%d] [WU#%d %s] WU is infeasible: %s\n", g_reply->host.id, wu.id, wu.name, infeasible_string(retval) ); } last_retval = retval; if (config.debug_array) { log_messages.printf(MSG_NORMAL, "[array] infeasible\n"); } return false; } return true; }
// Send a job for the given assignment.
//
// Looks up the assigned workunit, gets an app version for it,
// creates a new result for that workunit, adds it to the reply,
// and, for a one-job assignment, records the result ID in the assignment.
//
// asg: the assignment; asg.workunitid identifies the workunit to send.
//      asg.resultid is set if this is a one-job assignment.
//
// Returns 0 on success.
// Returns -1 if the upload private key can't be read,
// ERR_NOT_FOUND if no app version is available,
// and otherwise the error code of the step that failed.
//
static int send_assigned_job(ASSIGNMENT& asg) {
    int retval;
    DB_WORKUNIT wu;
    char suffix[256], path[256], buf[256];
    const char *rtfpath;
    static bool key_loaded = false;
    static int seqno = 0;
    static R_RSA_PRIVATE_KEY key;
    BEST_APP_VERSION* bavp;

    // Read the upload private key the first time we're called.
    // The flag is set only after a successful read,
    // so a failed read is retried on the next call
    // instead of leaving later calls with a key that was never loaded.
    //
    if (!key_loaded) {
        snprintf(path, sizeof(path), "%s/upload_private", config.key_dir);
        retval = read_key_file(path, key);
        if (retval) {
            log_messages.printf(MSG_CRITICAL, "can't read key\n");
            return -1;
        }
        key_loaded = true;
    }

    retval = wu.lookup_id(asg.workunitid);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "assigned WU %d not found\n", asg.workunitid
        );
        return retval;
    }

    bavp = get_app_version(wu);
    if (!bavp) {
        log_messages.printf(MSG_CRITICAL,
            "App version for assigned WU not found\n"
        );
        return ERR_NOT_FOUND;
    }

    // The result name suffix combines PID, time and a per-process counter.
    //
    rtfpath = config.project_path("%s", wu.result_template_file);
    snprintf(suffix, sizeof(suffix),
        "%d_%d_%d", getpid(), (int)time(0), seqno++
    );
    retval = create_result(wu, (char *)rtfpath, suffix, key, config, 0, 0);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] create_result() %d\n", wu.id, wu.name, retval
        );
        return retval;
    }

    // Read back the result we just created and add it to the reply.
    // A failure here is remembered rather than returned immediately:
    // the result has already been created, so the assignment bookkeeping
    // below must still run to keep a one-job assignment
    // from creating another result later.
    //
    int result_id = boinc_db.insert_id();
    DB_RESULT result;
    int send_retval = result.lookup_id(result_id);
    if (send_retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't look up new result %d: %d\n",
            wu.id, wu.name, result_id, send_retval
        );
    } else {
        send_retval = add_result_to_reply(result, wu, bavp, false);
        if (send_retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%d %s] can't add result %d to reply: %d\n",
                wu.id, wu.name, result_id, send_retval
            );
        }
    }

    // if this is a one-job assignment, fill in assignment.resultid
    // so that it doesn't get sent again
    //
    if (!asg.multi && asg.target_type!=ASSIGN_NONE) {
        DB_ASSIGNMENT db_asg;
        db_asg.id = asg.id;
        snprintf(buf, sizeof(buf), "resultid=%d", result_id);
        retval = db_asg.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "assign update failed: %d\n", retval
            );
            return retval;
        }
        asg.resultid = result_id;
    }

    if (send_retval) {
        return send_retval;
    }

    if (config.debug_assignment) {
        log_messages.printf(MSG_NORMAL,
            "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n",
            wu.id, result_id, g_reply->host.id, asg.id
        );
    }
    return 0;
}
// send work for a particular processor type // void send_work_score_type(int rt) { vector<JOB> jobs; if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] scanning for %s jobs\n", proc_type_name(rt) ); } clear_others(rt); int nscan = config.mm_max_slots; if (!nscan) nscan = ssp->max_wu_results; int rnd_off = rand() % ssp->max_wu_results; for (int j=0; j<nscan; j++) { int i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; if (wu_result.state != WR_STATE_PRESENT) { continue; } WORKUNIT wu = wu_result.workunit; JOB job; job.app = ssp->lookup_app(wu.appid); if (job.app->non_cpu_intensive) continue; job.bavp = get_app_version(wu, true, false); if (!job.bavp) continue; job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(wu_result)) { continue; } jobs.push_back(job); } std::sort(jobs.begin(), jobs.end(), job_compare); bool sema_locked = false; for (unsigned int i=0; i<jobs.size(); i++) { if (!work_needed(false)) { break; } if (!g_wreq->need_proc_type(rt)) { break; } JOB& job = jobs[i]; if (!sema_locked) { lock_sema(); sema_locked = true; } // make sure the job is still in the cache // array is locked at this point. // WU_RESULT& wu_result = ssp->wu_results[job.index]; if (wu_result.state != WR_STATE_PRESENT) { continue; } if (wu_result.resultid != job.result_id) { continue; } WORKUNIT wu = wu_result.workunit; int retval = wu_is_infeasible_fast( wu, wu_result.res_server_state, wu_result.res_priority, wu_result.res_report_deadline, *job.app, *job.bavp ); if (retval) { continue; } wu_result.state = g_pid; // It passed fast checks. 
// Release sema and do slow checks // unlock_sema(); sema_locked = false; switch (slow_check(wu_result, job.app, job.bavp)) { case 1: wu_result.state = WR_STATE_PRESENT; break; case 2: wu_result.state = WR_STATE_EMPTY; break; default: // slow_check() refreshes fields of wu_result.workunit; // update our copy too // wu.hr_class = wu_result.workunit.hr_class; wu.app_version_id = wu_result.workunit.app_version_id; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { add_result_to_reply(result, wu, job.bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. // If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. } break; } } if (sema_locked) { unlock_sema(); } restore_others(rt); g_wreq->best_app_versions.clear(); }