// send a job for the given assignment // static int send_assigned_job(ASSIGNMENT& asg) { int retval; DB_WORKUNIT wu; char suffix[256], path[MAXPATHLEN]; const char *rtfpath; static bool first=true; static int seqno=0; static R_RSA_PRIVATE_KEY key; BEST_APP_VERSION* bavp; if (first) { first = false; sprintf(path, "%s/upload_private", config.key_dir); retval = read_key_file(path, key); if (retval) { log_messages.printf(MSG_CRITICAL, "can't read key\n"); return -1; } } retval = wu.lookup_id(asg.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "assigned WU %d not found\n", asg.workunitid ); return retval; } bavp = get_app_version(wu, false, false); if (!bavp) { log_messages.printf(MSG_CRITICAL, "App version for assigned WU not found\n" ); return ERR_NOT_FOUND; } rtfpath = config.project_path("%s", wu.result_template_file); sprintf(suffix, "%d_%d_%d", getpid(), (int)time(0), seqno++); retval = create_result( wu, const_cast<char*>(rtfpath), suffix, key, config, 0, 0 ); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%d %s] create_result(): %s\n", wu.id, wu.name, boincerror(retval) ); return retval; } int result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); add_result_to_reply(result, wu, bavp, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n", wu.id, result_id, g_reply->host.id, asg.id ); } return 0; }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval; long count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; // This doesn't take into account g_wreq->allow_non_selected_apps, // however Einstein@Home, which is the only project that currently uses // this locality scheduler, doesn't support the respective project-specific // preference setting // if (app_not_selected(wu.appid)) return ERR_NO_APP_VERSION; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%lu and workunitid=%lu", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Try to send the host one job for the given non-CPU-intensive (NCI) app.
//
// Walks the shared-memory job array looking for an entry belonging to
// this app that is either present or already checked out by this process.
//
// Returns:
//   0   a job was handed to add_result_to_reply()
//  -1   can_send_nci() rejected a job of this app
//   1   the scan finished without finding a sendable job
//
static int send_job_for_app(APP& app) {
    BEST_APP_VERSION* bavp;
    SCHED_DB_RESULT result;

    lock_sema();
    for (int slot = 0; slot < ssp->max_wu_results; slot++) {
        WU_RESULT& entry = ssp->wu_results[slot];

        // only consider entries that are available or already ours
        //
        const bool usable =
            entry.state == WR_STATE_PRESENT || entry.state == g_pid;
        if (!usable) {
            continue;
        }

        WORKUNIT wu = entry.workunit;
        if (wu.appid != app.id) {
            continue;
        }

        if (!can_send_nci(entry, wu, bavp, &app)) {
            // All jobs for a given NCI app are identical.
            // If we can't send one, we can't send any.
            //
            unlock_sema();
            log_messages.printf(MSG_NORMAL,
                "can_send_nci() failed for NCI job\n"
            );
            return -1;
        }

        // check the entry out, drop the semaphore,
        // then copy the result ID and free the slot
        //
        entry.state = g_pid;
        unlock_sema();
        result.id = entry.resultid;
        entry.state = WR_STATE_EMPTY;

        if (!result_still_sendable(result, wu)) {
            log_messages.printf(MSG_NORMAL,
                "NCI job was not still sendable\n"
            );
            // retake the semaphore before looking at the next slot
            lock_sema();
            continue;
        }

        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "Sending non-CPU-intensive job: %s\n", wu.name
            );
        }
        add_result_to_reply(result, wu, bavp, false);
        return 0;
    }

    log_messages.printf(MSG_NORMAL,
        "no sendable NCI jobs for %s\n", app.user_friendly_name
    );
    unlock_sema();
    return 1;
}
void JOB_SET::send() { WORKUNIT wu; DB_RESULT result; int retval; std::list<JOB>::iterator i = jobs.begin(); while (i != jobs.end()) { JOB& job = *(i++); WU_RESULT wu_result = ssp->wu_results[job.index]; ssp->wu_results[job.index].state = WR_STATE_EMPTY; wu = wu_result.workunit; result.id = wu_result.resultid; retval = read_sendable_result(result); if (!retval) { add_result_to_reply(result, wu, job.bavp, false); } } }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(DB_RESULT& result) { DB_WORKUNIT wu; DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true); if (!bavp && anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(USER_MESSAGE(help_msg_buf, "high")); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); if (wu_is_infeasible_fast(wu, *app, *bavp)) { return ERR_INSUFFICIENT_RESOURCE; } if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Make a pass through the wu/results array, sending work.
// The choice of jobs is limited by flags in g_wreq, as follows:
// infeasible_only:
//      send only results that were previously infeasible for some host
// reliable_only:
//      send only jobs with "need_reliable" set (e.g. retries)
//      and send them only w/ app versions that are "reliable" for this host
// user_apps_only:
//      Send only jobs for apps selected by user
// beta_only:
//      Send only jobs for beta-test apps
// locality_sched_lite:
//      For apps that use locality sched Lite,
//      send only jobs for which the host already has at least 1 file
//
// The scan starts at a random slot and wraps around,
// visiting each slot at most once.
//
// Return true if no more work is needed.
//
static bool scan_work_array() {
    int i, j, rnd_off, last_retval=0;;
    APP* app;
    BEST_APP_VERSION* bavp;
    bool no_more_needed = false;
    SCHED_DB_RESULT result;

    // To minimize the amount of time we lock the array,
    // we initially scan without holding the lock.
    // If we find a job that passes quick_check(),
    // we acquire the lock and then check the job again.
    //
    bool sema_locked = false;

    rnd_off = rand() % ssp->max_wu_results;
    for (j=0; j<ssp->max_wu_results; j++) {
        i = (j+rnd_off) % ssp->max_wu_results;

        WU_RESULT& wu_result = ssp->wu_results[i];

        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] scanning slot %d\n", i
            );
        }

        // Re-entry point: after taking the semaphore we jump back here
        // and repeat all the checks below on the same slot.
        //
recheck:
        // skip slots that are neither available
        // nor already checked out by this process
        //
        if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
            continue;
        }

        // make a copy of the WORKUNIT part,
        // which we can modify without affecting the cache
        //
        WORKUNIT wu = wu_result.workunit;

        app = ssp->lookup_app(wu_result.workunit.appid);
        if (app == NULL) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu] no app\n",
                wu_result.workunit.id
            );
            continue; // this should never happen
        }

        // non-CPU-intensive apps are not handled by this scan
        //
        if (app->non_cpu_intensive) continue;

        // do fast (non-DB) checks.
        // This may modify wu.rsc_fpops_est
        //
        if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] slot %d failed quick check\n", i
                );
            }
            continue;
        }

        // The checks above passed; if that was without the semaphore,
        // take it now and redo them.
        // If the recheck fails, the "continue" statements above leave
        // the semaphore held; it is released either further down
        // (for a later slot that passes) or after the loop.
        //
        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
            goto recheck;
        }

        // mark wu_result as checked out and release semaphore.
        // NOTE(review): this comment used to say "goto dont_send"
        // on failure; there is no such label in this function.
        // Every outcome of slow_check() is handled by the switch below.
        //
        // Note: without the semaphore we don't have mutual exclusion;
        // ideally we should use a transaction from now until when
        // we commit to sending the results.

        wu_result.state = g_pid;
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, app, bavp)) {
        case 1:
            // if we couldn't send the result to this host,
            // set its state back to PRESENT
            //
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            // can't send this job to any host
            //
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }

        // stop scanning as soon as the request is satisfied
        //
        if (!work_needed(false)) {
            no_more_needed = true;
            break;
        }
    }

    // a failed recheck can leave the semaphore held when the loop ends
    //
    if (sema_locked) {
        unlock_sema();
    }
    return no_more_needed;
}
// resend any jobs that: // 1) we already sent to this host; // 2) are still in progress (i.e. haven't timed out) and // 3) aren't present on the host // Return true if there were any such jobs // bool resend_lost_work() { SCHED_DB_RESULT result; std::vector<DB_RESULT>results; unsigned int i; char buf[256]; char warning_msg[256]; bool did_any = false; int num_eligible_to_resend=0; int num_resent=0; BEST_APP_VERSION* bavp = NULL; APP* app = NULL; int retval; sprintf(buf, " where hostid=%d and server_state=%d ", g_reply->host.id, RESULT_SERVER_STATE_IN_PROGRESS ); while (!result.enumerate(buf)) { if (!work_needed(false)) { result.end_enumerate(); break; } bool found = false; for (i=0; i<g_request->other_results.size(); i++) { OTHER_RESULT& orp = g_request->other_results[i]; if (!strcmp(orp.name, result.name)) { found = true; break; } } if (found) continue; num_eligible_to_resend++; if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] found lost [RESULT#%u]: %s\n", g_reply->host.id, result.id, result.name ); } DB_WORKUNIT wu; bool can_resend = true; retval = wu.lookup_id(result.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] can't resend - WU not found for [RESULT#%u]\n", g_reply->host.id, result.id ); can_resend = false; } if (can_resend) { app = ssp->lookup_app(wu.appid); bavp = get_app_version(wu, true, false); if (!bavp) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[HOST#%d] can't resend [RESULT#%u]: no app version for %s\n", g_reply->host.id, result.id, app->name ); } can_resend = false; } } if (can_resend && wu.error_mask) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: WU error mask %d\n", result.id, wu.error_mask ); } can_resend = false; } if (can_resend && wu.canonical_resultid) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: already have canonical result\n", result.id ); } can_resend = false; } if 
(can_resend && wu_is_infeasible_fast( wu, result.server_state, result.priority, result.report_deadline, *app, *bavp )) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: feasibility check failed\n", result.id ); } can_resend = false; } if (can_resend && possibly_give_result_new_deadline(result, wu, *bavp)) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: deadline assignment failed\n", result.id ); } can_resend = false; } // If we can't resend this job for any of the above reasons, // make it time out so that the transitioner does the right thing. // if (!can_resend) { result.report_deadline = time(0)-1; retval = result.mark_as_sent(result.server_state, config.report_grace_period); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_work: can't update result deadline: %s\n", boincerror(retval) ); continue; } retval = update_wu_on_send( wu, result.report_deadline + config.report_grace_period, *app, *bavp ); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_result: can't update WU transition time: %s\n", boincerror(retval) ); continue; } sprintf(warning_msg, "Didn't resend lost task %s (expired)", result.name ); g_reply->insert_message(warning_msg, "low"); } else { retval = add_result_to_reply(result, wu, bavp, false); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] failed to send [RESULT#%u]\n", g_reply->host.id, result.id ); continue; } sprintf(warning_msg, "Resent lost task %s", result.name); g_reply->insert_message(warning_msg, "low"); num_resent++; did_any = true; if (g_wreq->njobs_sent >= config.max_wus_to_send) { result.end_enumerate(); break; } } } if (num_eligible_to_resend && config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] %d lost results, resent %d\n", g_reply->host.id, num_eligible_to_resend, num_resent ); } return did_any; }
// Make a pass through the wu/results array, sending work. // The choice of jobs is limited by flags in g_wreq, as follows: // infeasible_only: // send only results that were previously infeasible for some host // reliable_only: // send only retries // user_apps_only: // Send only jobs for apps selected by user // beta_only: // Send only jobs for beta-test apps // // Return true if no more work is needed. // static bool scan_work_array() { int i, j, retval, rnd_off, last_retval=0;; APP* app; BEST_APP_VERSION* bavp; bool no_more_needed = false; DB_RESULT result; lock_sema(); rnd_off = rand() % ssp->max_wu_results; for (j=0; j<ssp->max_wu_results; j++) { i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; WORKUNIT wu = wu_result.workunit; // do fast (non-DB) checks // if (!quick_check(wu_result, wu, bavp, app, last_retval)) { continue; } // mark wu_result as checked out and release semaphore. // from here on in this loop, don't continue on failure; // instead, goto dont_send (so that we reacquire semaphore) // // Note: without the semaphore we don't have mutual exclusion; // ideally we should use a transaction from now until when // we commit to sending the results. wu_result.state = g_pid; unlock_sema(); if (!slow_check(wu_result, wu, app)) { // if we couldn't send the result to this host, // set its state back to PRESENT // wu_result.state = WR_STATE_PRESENT; } else { result.id = wu_result.resultid; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // if (result_still_sendable(result, wu)) { retval = add_result_to_reply(result, wu, bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. 
// If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. } } lock_sema(); if (!work_needed(false)) { no_more_needed = true; break; } } unlock_sema(); return no_more_needed; }
// Send a job for the given assignment.
//
// Looks up the assigned workunit, picks an app version for it,
// creates a new result (passing the upload private key to create_result()),
// reads that result back from the DB and adds it to the scheduler reply.
// For a one-job assignment, the new result ID is also stored in the
// assignment record so that it doesn't get sent again.
//
// Returns 0 on success;
// -1 if the upload private key can't be read;
// ERR_NOT_FOUND if no app version is available;
// otherwise the error code of the step that failed.
//
static int send_assigned_job(ASSIGNMENT& asg) {
    int retval;
    DB_WORKUNIT wu;
    char suffix[256], path[256], buf[256];
    const char *rtfpath;
    static bool key_loaded = false;
    static int seqno = 0;
    static R_RSA_PRIVATE_KEY key;
    BEST_APP_VERSION* bavp;

    // Read the key once per process.
    // The flag is set only after a successful read, so a failed read
    // is retried on the next call instead of leaving an unloaded key in use.
    //
    if (!key_loaded) {
        snprintf(path, sizeof(path), "%s/upload_private", config.key_dir);
        retval = read_key_file(path, key);
        if (retval) {
            log_messages.printf(MSG_CRITICAL, "can't read key\n");
            return -1;
        }
        key_loaded = true;
    }

    retval = wu.lookup_id(asg.workunitid);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "assigned WU %d not found\n", asg.workunitid
        );
        return retval;
    }

    bavp = get_app_version(wu);
    if (!bavp) {
        log_messages.printf(MSG_CRITICAL,
            "App version for assigned WU not found\n"
        );
        return ERR_NOT_FOUND;
    }

    // the suffix makes the new result's name unique:
    // process ID, current time, and a per-process sequence number
    //
    rtfpath = config.project_path("%s", wu.result_template_file);
    snprintf(suffix, sizeof(suffix),
        "%d_%d_%d", getpid(), (int)time(0), seqno++
    );
    retval = create_result(
        wu, const_cast<char*>(rtfpath), suffix, key, config, 0, 0
    );
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] create_result() %d\n", wu.id, wu.name, retval
        );
        return retval;
    }

    // read back the result we just created;
    // don't send anything (or mark the assignment) if that fails
    //
    int result_id = boinc_db.insert_id();
    DB_RESULT result;
    retval = result.lookup_id(result_id);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d] can't read new [RESULT#%d]: %d\n",
            wu.id, result_id, retval
        );
        return retval;
    }

    retval = add_result_to_reply(result, wu, bavp, false);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d] can't add [RESULT#%d] to reply: %d\n",
            wu.id, result_id, retval
        );
        return retval;
    }

    // if this is a one-job assignment, fill in assignment.resultid
    // so that it doesn't get sent again
    //
    if (!asg.multi && asg.target_type!=ASSIGN_NONE) {
        DB_ASSIGNMENT db_asg;
        db_asg.id = asg.id;
        snprintf(buf, sizeof(buf), "resultid=%d", result_id);
        retval = db_asg.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "assign update failed: %d\n", retval
            );
            return retval;
        }
        asg.resultid = result_id;
    }

    if (config.debug_assignment) {
        log_messages.printf(MSG_NORMAL,
            "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n",
            wu.id, result_id, g_reply->host.id, asg.id
        );
    }
    return 0;
}
// send work for a particular processor type // void send_work_score_type(int rt) { vector<JOB> jobs; if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] scanning for %s jobs\n", proc_type_name(rt) ); } clear_others(rt); int nscan = config.mm_max_slots; if (!nscan) nscan = ssp->max_wu_results; int rnd_off = rand() % ssp->max_wu_results; for (int j=0; j<nscan; j++) { int i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; if (wu_result.state != WR_STATE_PRESENT) { continue; } WORKUNIT wu = wu_result.workunit; JOB job; job.app = ssp->lookup_app(wu.appid); if (job.app->non_cpu_intensive) continue; job.bavp = get_app_version(wu, true, false); if (!job.bavp) continue; job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(wu_result)) { continue; } jobs.push_back(job); } std::sort(jobs.begin(), jobs.end(), job_compare); bool sema_locked = false; for (unsigned int i=0; i<jobs.size(); i++) { if (!work_needed(false)) { break; } if (!g_wreq->need_proc_type(rt)) { break; } JOB& job = jobs[i]; if (!sema_locked) { lock_sema(); sema_locked = true; } // make sure the job is still in the cache // array is locked at this point. // WU_RESULT& wu_result = ssp->wu_results[job.index]; if (wu_result.state != WR_STATE_PRESENT) { continue; } if (wu_result.resultid != job.result_id) { continue; } WORKUNIT wu = wu_result.workunit; int retval = wu_is_infeasible_fast( wu, wu_result.res_server_state, wu_result.res_priority, wu_result.res_report_deadline, *job.app, *job.bavp ); if (retval) { continue; } wu_result.state = g_pid; // It passed fast checks. 
// Release sema and do slow checks // unlock_sema(); sema_locked = false; switch (slow_check(wu_result, job.app, job.bavp)) { case 1: wu_result.state = WR_STATE_PRESENT; break; case 2: wu_result.state = WR_STATE_EMPTY; break; default: // slow_check() refreshes fields of wu_result.workunit; // update our copy too // wu.hr_class = wu_result.workunit.hr_class; wu.app_version_id = wu_result.workunit.app_version_id; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { add_result_to_reply(result, wu, job.bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. // If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. } break; } } if (sema_locked) { unlock_sema(); } restore_others(rt); g_wreq->best_app_versions.clear(); }