// Try to send one job for the given non-CPU-intensive (NCI) app.
// Return 0 if a job was sent, 1 if no sendable job was found,
// -1 if can_send_nci() failed.
//
static int send_job_for_app(APP& app) {
    BEST_APP_VERSION* bavp;
    SCHED_DB_RESULT result;

    lock_sema();
    for (int i=0; i<ssp->max_wu_results; i++) {
        WU_RESULT& wu_result = ssp->wu_results[i];
        if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        if (wu.appid != app.id) continue;
        if (!can_send_nci(wu_result, wu, bavp, &app)) {
            // All jobs for a given NCI app are identical.
            // If we can't send one, we can't send any.
            //
            unlock_sema();
            log_messages.printf(MSG_NORMAL,
                "can_send_nci() failed for NCI job\n"
            );
            return -1;
        }
        wu_result.state = g_pid;
        unlock_sema();
        result.id = wu_result.resultid;
        wu_result.state = WR_STATE_EMPTY;
        if (result_still_sendable(result, wu)) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "Sending non-CPU-intensive job: %s\n", wu.name
                );
            }
            add_result_to_reply(result, wu, bavp, false);
            return 0;
        }
        log_messages.printf(MSG_NORMAL,
            "NCI job was not still sendable\n"
        );
        lock_sema();
    }
    log_messages.printf(MSG_NORMAL,
        "no sendable NCI jobs for %s\n", app.user_friendly_name
    );
    unlock_sema();
    return 1;
}
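// Aside: lock_sema()/unlock_sema() serialize access to the shared-memory
// job array (ssp->wu_results) across scheduler processes.  The sketch below
// is a minimal illustration of that pattern using a process-shared POSIX
// semaphore; the sketch_* names are hypothetical, and the real BOINC
// scheduler uses its own shared-memory locking code, so treat this as an
// assumption about the mechanism rather than the actual implementation.
//
#include <semaphore.h>

static sem_t* sketch_sema;          // assumed to live in shared memory

static void sketch_init_sema(void* shmem_region) {
    sketch_sema = (sem_t*)shmem_region;
    sem_init(sketch_sema, 1, 1);    // pshared=1: shared across processes
}
static void sketch_lock_sema()   { sem_wait(sketch_sema); }   // acquire
static void sketch_unlock_sema() { sem_post(sketch_sema); }   // release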
// Make a pass through the wu/results array, sending work.
// The choice of jobs is limited by flags in g_wreq, as follows:
// infeasible_only:
//      send only results that were previously infeasible for some host
// reliable_only:
//      send only jobs with "need_reliable" set (e.g. retries)
//      and send them only w/ app versions that are "reliable" for this host
// user_apps_only:
//      Send only jobs for apps selected by user
// beta_only:
//      Send only jobs for beta-test apps
// locality_sched_lite:
//      For apps that use locality sched Lite,
//      send only jobs for which the host already has at least 1 file
//
// Return true if no more work is needed.
//
static bool scan_work_array() {
    int i, j, rnd_off, last_retval=0;
    APP* app;
    BEST_APP_VERSION* bavp;
    bool no_more_needed = false;
    SCHED_DB_RESULT result;

    // To minimize the amount of time we lock the array,
    // we initially scan without holding the lock.
    // If we find a job that passes quick_check(),
    // we acquire the lock and then check the job again.
    //
    bool sema_locked = false;

    rnd_off = rand() % ssp->max_wu_results;
    for (j=0; j<ssp->max_wu_results; j++) {
        i = (j+rnd_off) % ssp->max_wu_results;

        WU_RESULT& wu_result = ssp->wu_results[i];

        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] scanning slot %d\n", i
            );
        }

recheck:
        if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
            continue;
        }

        // make a copy of the WORKUNIT part,
        // which we can modify without affecting the cache
        //
        WORKUNIT wu = wu_result.workunit;
        app = ssp->lookup_app(wu_result.workunit.appid);
        if (app == NULL) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu] no app\n",
                wu_result.workunit.id
            );
            continue;   // this should never happen
        }

        if (app->non_cpu_intensive) continue;

        // do fast (non-DB) checks.
        // This may modify wu.rsc_fpops_est
        //
        if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] slot %d failed quick check\n", i
                );
            }
            continue;
        }

        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
            goto recheck;
        }

        // mark wu_result as checked out and release semaphore.
        // From here on in this loop, don't just "continue" on failure;
        // restore or clear the slot state first (see the switch below).
        //
        // Note: without the semaphore we don't have mutual exclusion;
        // ideally we should use a transaction from now until when
        // we commit to sending the results.

        wu_result.state = g_pid;
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, app, bavp)) {
        case 1:
            // if we couldn't send the result to this host,
            // set its state back to PRESENT
            //
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            // can't send this job to any host
            //
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }
        if (!work_needed(false)) {
            no_more_needed = true;
            break;
        }
    }
    if (sema_locked) {
        unlock_sema();
    }
    return no_more_needed;
}
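// Aside: the unlocked pre-scan plus "goto recheck" above is a
// double-checked claim: inspect a slot cheaply without the semaphore,
// then re-verify under the lock before marking it checked out with our
// PID.  A distilled sketch of that pattern (sketch_try_claim_slot is a
// hypothetical helper, not part of the scheduler):
//
static bool sketch_try_claim_slot(WU_RESULT& slot, int my_pid) {
    // cheap check without the lock; the answer may be stale
    if (slot.state != WR_STATE_PRESENT && slot.state != my_pid) {
        return false;
    }
    lock_sema();
    // re-check under the lock: another process may have taken the slot
    bool ok = (slot.state == WR_STATE_PRESENT || slot.state == my_pid);
    if (ok) {
        slot.state = my_pid;    // checked out; safe to release the lock
    }
    unlock_sema();
    return ok;
}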
void process_request(char* code_sign_key) {
    PLATFORM* platform;
    int retval;
    double last_rpc_time, x;
    struct tm *rpc_time_tm;
    bool ok_to_send_work = !config.dont_send_jobs;
    bool have_no_work = false;
    char buf[256];
    HOST initial_host;
    unsigned int i;
    time_t t;

    memset(&g_reply->wreq, 0, sizeof(g_reply->wreq));

    // if client has sticky files we don't need any more, tell it
    //
    do_file_delete_regex();

    // if different major version of BOINC, just send a message
    //
    if (wrong_core_client_version()
        || unacceptable_os()
        || unacceptable_cpu()
    ) {
        ok_to_send_work = false;
    }

    // if no jobs reported and none to send, return without accessing DB
    //
    if (!ok_to_send_work && !g_request->results.size()) {
        return;
    }

    warn_user_if_core_client_upgrade_scheduled();

    if (requesting_work()) {
        if (config.locality_scheduling || config.locality_scheduler_fraction || config.enable_assignment) {
            have_no_work = false;
        } else {
            lock_sema();
            have_no_work = ssp->no_work(g_pid);
            if (have_no_work) {
                g_wreq->no_jobs_available = true;
            }
            unlock_sema();
        }
    }

    // If:
    // - there's no work,
    // - a config flag is set,
    // - client isn't returning results,
    // - this isn't an initial RPC,
    // - client is requesting work
    // then return without accessing the DB.
    // This is an efficiency hack for when servers are overloaded
    //
    if (
        have_no_work
        && config.nowork_skip
        && requesting_work()
        && (g_request->results.size() == 0)
        && (g_request->hostid != 0)
    ) {
        g_reply->insert_message("No work available", "low");
        g_reply->set_delay(DELAY_NO_WORK_SKIP);
        if (!config.msg_to_host && !config.enable_vda) {
            log_messages.printf(MSG_NORMAL, "No work - skipping DB access\n");
            return;
        }
    }

    // FROM HERE ON DON'T RETURN; "goto leave" instead
    // (because ssp->no_work() may have tagged an entry in the work array
    // with our process ID)

    retval = open_database();
    if (retval) {
        send_error_message("Server can't open database", 3600);
        g_reply->project_is_down = true;
        goto leave;
    }

    retval = authenticate_user();
    if (retval) goto leave;
    if (g_reply->user.id == 0) {
        log_messages.printf(MSG_CRITICAL, "No user ID!\n");
    }
    initial_host = g_reply->host;
    g_reply->host.rpc_seqno = g_request->rpc_seqno;

    g_reply->nucleus_only = false;

    log_request();

    // is host blacklisted?
    //
    if (g_reply->host._max_results_day == -1) {
        send_error_message("Not accepting requests from this host", 86400);
        goto leave;
    }

    if (strlen(config.sched_lockfile_dir)) {
        int pid_with_lock = lock_sched();
        if (pid_with_lock > 0) {
            log_messages.printf(MSG_CRITICAL,
                "Another scheduler instance [PID=%d] is running for this host\n",
                pid_with_lock
            );
        } else if (pid_with_lock) {
            log_messages.printf(MSG_CRITICAL,
                "Error acquiring lock for [HOST#%d]\n", g_reply->host.id
            );
        }
        if (pid_with_lock) {
            send_error_message(
                "Another scheduler instance is running for this host", 60
            );
            goto leave;
        }
    }

    // in deciding whether it's a new day,
    // add a random factor (based on host ID)
    // to smooth out network traffic over the day
    //
    retval = rand();
    srand(g_reply->host.id);
    x = drand()*86400;
    srand(retval);
    last_rpc_time = g_reply->host.rpc_time;
    t = (time_t)(g_reply->host.rpc_time + x);
    rpc_time_tm = localtime(&t);
    g_request->last_rpc_dayofyear = rpc_time_tm->tm_yday;

    t = time(0);
    g_reply->host.rpc_time = t;
    t += (time_t)x;
    rpc_time_tm = localtime(&t);
    g_request->current_rpc_dayofyear = rpc_time_tm->tm_yday;

    retval = modify_host_struct(g_reply->host);

    // write time stats to disk if present
    //
    if (g_request->have_time_stats_log) {
        write_time_stats_log();
    }

    // look up the client's platform(s) in the DB
    //
    platform = ssp->lookup_platform(g_request->platform.name);
    if (platform) g_request->platforms.list.push_back(platform);

    // if primary platform is anonymous, ignore alternate platforms
    //
    if (strcmp(g_request->platform.name, "anonymous")) {
        for (i=0; i<g_request->alt_platforms.size(); i++) {
            platform = ssp->lookup_platform(g_request->alt_platforms[i].name);
            if (platform) g_request->platforms.list.push_back(platform);
        }
    }
    if (g_request->platforms.list.size() == 0) {
        sprintf(buf, "%s %s",
            _("This project doesn't support computers of type"),
            g_request->platform.name
        );
        g_reply->insert_message(buf, "notice");
        log_messages.printf(MSG_CRITICAL,
            "[HOST#%d] platform '%s' not found\n",
            g_reply->host.id, g_request->platform.name
        );
        g_reply->set_delay(DELAY_PLATFORM_UNSUPPORTED);
        goto leave;
    }

    handle_global_prefs();

    read_host_app_versions();
    update_n_jobs_today();

    handle_results();
    handle_file_xfer_results();
    if (config.enable_vda) {
        handle_vda();
    }

    // Do this before resending lost jobs
    //
    if (bad_install_type()) {
        ok_to_send_work = false;
    }

    if (!requesting_work()) {
        ok_to_send_work = false;
    }

    send_work_setup();

    if (g_request->have_other_results_list) {
        if (ok_to_send_work
            && (config.resend_lost_results || g_wreq->resend_lost_results)
            && !g_request->results_truncated
        ) {
            if (resend_lost_work()) {
                ok_to_send_work = false;
            }
        }
        if (config.send_result_abort) {
            send_result_abort();
        }
    }

    if (requesting_work()) {
        if (!send_code_sign_key(code_sign_key)) {
            ok_to_send_work = false;
        }

        if (have_no_work) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] No jobs in shmem cache\n"
                );
            }
        }

        // if last RPC was within config.min_sendwork_interval, don't send work
        //
        if (!have_no_work && ok_to_send_work) {
            if (config.min_sendwork_interval) {
                double diff = dtime() - last_rpc_time;
                if (diff < config.min_sendwork_interval) {
                    ok_to_send_work = false;
                    log_messages.printf(MSG_NORMAL,
                        "Not sending work - last request too recent: %f\n", diff
                    );
                    sprintf(buf,
                        "Not sending work - last request too recent: %d sec",
                        (int)diff
                    );
                    g_reply->insert_message(buf, "low");

                    // the 1.01 is in case client's clock
                    // is slightly faster than ours
                    //
                    g_reply->set_delay(1.01*config.min_sendwork_interval);
                }
            }
            if (ok_to_send_work) {
                send_work();
            }
        }
        if (g_wreq->no_jobs_available) {
            g_reply->insert_message("Project has no tasks available", "low");
        }
    }

    handle_msgs_from_host();
    if (config.msg_to_host) {
        handle_msgs_to_host();
    }

    update_host_record(initial_host, g_reply->host, g_reply->user);
    write_host_app_versions();

leave:
    if (!have_no_work) {
        ssp->restore_work(g_pid);
    }
}
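// Aside: the "new day" smoothing in process_request() works by seeding
// rand() with the host ID, drawing a per-host offset x in [0, 86400), and
// re-seeding the generator afterward, so each host's day boundary lands at
// a stable, host-specific time instead of midnight for all hosts at once.
// A self-contained sketch of the same idea (sketch_day_of_year is a
// hypothetical helper; the real code uses BOINC's drand()):
//
#include <cstdlib>
#include <ctime>

static int sketch_day_of_year(time_t when, int host_id) {
    int saved = rand();                 // value used to re-seed afterward
    srand(host_id);                     // deterministic per-host stream
    double x = rand() / ((double)RAND_MAX + 1) * 86400;   // seconds offset
    srand(saved);                       // hand the RNG stream back
    time_t t = when + (time_t)x;
    struct tm* tm = localtime(&t);
    return tm->tm_yday;                 // day-of-year after per-host shift
}
// usage: it's a "new day" for a host iff
//   sketch_day_of_year(time(0), id) != sketch_day_of_year(last_rpc, id)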
void send_work_matchmaker() {
    int i, slots_locked=0, slots_nonempty=0;
    JOB_SET jobs;
    int min_slots = config.mm_min_slots;
    if (!min_slots) min_slots = ssp->max_wu_results/2;
    int max_slots = config.mm_max_slots;
    if (!max_slots) max_slots = ssp->max_wu_results;
    int max_locked = 10;

    lock_sema();
    i = rand() % ssp->max_wu_results;

    // scan through the job cache, maintaining a JOB_SET of jobs
    // that we can send to this client, ordered by score.
    //
    for (int slots_scanned=0; slots_scanned<max_slots; slots_scanned++) {
        i = (i+1) % ssp->max_wu_results;
        WU_RESULT& wu_result = ssp->wu_results[i];
        switch (wu_result.state) {
        case WR_STATE_EMPTY:
            continue;
        case WR_STATE_PRESENT:
            slots_nonempty++;
            break;
        default:
            slots_nonempty++;
            if (wu_result.state == g_pid) break;
            slots_locked++;
            continue;
        }

        JOB job;
        job.index = i;

        // get score for this job, and skip it if it fails quick check.
        // NOTE: the EDF check done in get_score()
        // includes only in-progress jobs.
        //
        if (!job.get_score()) {
            continue;
        }
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] score for %s: %f\n", wu_result.workunit.name, job.score
            );
        }

        if (job.score > jobs.lowest_score() || !jobs.request_satisfied()) {
            ssp->wu_results[i].state = g_pid;
            unlock_sema();
            if (wu_is_infeasible_slow(wu_result, *g_request, *g_reply)) {
                // if we can't use this job, put it back in pool
                //
                lock_sema();
                ssp->wu_results[i].state = WR_STATE_PRESENT;
                continue;
            }
            lock_sema();
            jobs.add_job(job);
        }

        if (jobs.request_satisfied() && slots_scanned>=min_slots) break;
    }

    if (!slots_nonempty) {
        log_messages.printf(MSG_CRITICAL,
            "Job cache is empty - check feeder\n"
        );
        g_wreq->no_jobs_available = true;
    }

    // TODO: trim jobs from tail of list until we pass the EDF check
    //
    jobs.send();
    unlock_sema();

    if (slots_locked > max_locked) {
        log_messages.printf(MSG_CRITICAL,
            "Found too many locked slots (%d>%d) - increase array size\n",
            slots_locked, max_locked
        );
    }
}
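// Aside: JOB_SET above keeps candidate jobs ordered by score and exposes
// lowest_score() so the scan can cheaply decide whether a new job beats
// the current worst one.  A minimal container with that shape might look
// like the following (SketchJob/SketchJobSet are invented names, and the
// fixed capacity is an assumption; the real JOB_SET also enforces disk
// and EDF limits and returns evicted jobs to the cache):
//
#include <set>

struct SketchJob {
    int index;          // slot index in ssp->wu_results
    double score;       // higher is better
    bool operator<(const SketchJob& o) const { return score < o.score; }
};

struct SketchJobSet {
    std::multiset<SketchJob> jobs;  // ordered ascending by score
    size_t capacity;                // assumed per-request job limit
    SketchJobSet(size_t c): capacity(c) {}

    double lowest_score() const {
        return jobs.empty() ? 0 : jobs.begin()->score;
    }
    bool request_satisfied() const {
        return jobs.size() >= capacity;
    }
    void add_job(const SketchJob& j) {
        jobs.insert(j);
        if (jobs.size() > capacity) {
            jobs.erase(jobs.begin());   // evict the lowest-scoring job
        }
    }
};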
// send work for a particular processor type
//
void send_work_score_type(int rt) {
    vector<JOB> jobs;

    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] scanning for %s jobs\n", proc_type_name(rt)
        );
    }

    clear_others(rt);

    int nscan = config.mm_max_slots;
    if (!nscan) nscan = ssp->max_wu_results;
    int rnd_off = rand() % ssp->max_wu_results;
    for (int j=0; j<nscan; j++) {
        int i = (j+rnd_off) % ssp->max_wu_results;
        WU_RESULT& wu_result = ssp->wu_results[i];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        JOB job;
        job.app = ssp->lookup_app(wu.appid);
        if (!job.app) continue;     // app not found; shouldn't happen
        if (job.app->non_cpu_intensive) continue;
        job.bavp = get_app_version(wu, true, false);
        if (!job.bavp) continue;

        job.index = i;
        job.result_id = wu_result.resultid;
        if (!job.get_score(wu_result)) {
            continue;
        }
        jobs.push_back(job);
    }

    std::sort(jobs.begin(), jobs.end(), job_compare);

    bool sema_locked = false;
    for (unsigned int i=0; i<jobs.size(); i++) {
        if (!work_needed(false)) {
            break;
        }
        if (!g_wreq->need_proc_type(rt)) {
            break;
        }
        JOB& job = jobs[i];
        if (!sema_locked) {
            lock_sema();
            sema_locked = true;
        }

        // make sure the job is still in the cache;
        // array is locked at this point.
        //
        WU_RESULT& wu_result = ssp->wu_results[job.index];
        if (wu_result.state != WR_STATE_PRESENT) {
            continue;
        }
        if (wu_result.resultid != job.result_id) {
            continue;
        }
        WORKUNIT wu = wu_result.workunit;
        int retval = wu_is_infeasible_fast(
            wu,
            wu_result.res_server_state, wu_result.res_priority,
            wu_result.res_report_deadline,
            *job.app,
            *job.bavp
        );
        if (retval) {
            continue;
        }
        wu_result.state = g_pid;

        // It passed fast checks.
        // Release sema and do slow checks
        //
        unlock_sema();
        sema_locked = false;

        switch (slow_check(wu_result, job.app, job.bavp)) {
        case 1:
            wu_result.state = WR_STATE_PRESENT;
            break;
        case 2:
            wu_result.state = WR_STATE_EMPTY;
            break;
        default:
            // slow_check() refreshes fields of wu_result.workunit;
            // update our copy too
            //
            wu.hr_class = wu_result.workunit.hr_class;
            wu.app_version_id = wu_result.workunit.app_version_id;

            // mark slot as empty AFTER we've copied out of it
            // (since otherwise feeder might overwrite it)
            //
            wu_result.state = WR_STATE_EMPTY;

            // reread result from DB, make sure it's still unsent
            // TODO: from here to end of add_result_to_reply()
            // (which updates the DB record) should be a transaction
            //
            SCHED_DB_RESULT result;
            result.id = wu_result.resultid;
            if (result_still_sendable(result, wu)) {
                add_result_to_reply(result, wu, job.bavp, false);

                // add_result_to_reply() fails only in pathological cases -
                // e.g. we couldn't update the DB record or modify XML fields.
                // If this happens, don't replace the record in the array
                // (we can't anyway, since we marked the entry as "empty").
                // The feeder will eventually pick it up again,
                // and hopefully the problem won't happen twice.
            }
            break;
        }
    }
    if (sema_locked) {
        unlock_sema();
    }

    restore_others(rt);
    g_wreq->best_app_versions.clear();
}
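// Aside: std::sort() above orders the candidate list with job_compare so
// the best-scoring jobs are tried first.  A plausible comparator is a
// strict weak ordering on descending score (sketch_job_compare and
// SketchScoredJob are hypothetical stand-ins; the real job_compare is
// defined elsewhere in the scheduler):
//
#include <algorithm>
#include <vector>

struct SketchScoredJob {
    double score;       // higher is better
};

static bool sketch_job_compare(
    const SketchScoredJob& a, const SketchScoredJob& b
) {
    return a.score > b.score;   // best first
}

// usage:
//   std::vector<SketchScoredJob> v;
//   // ... fill v ...
//   std::sort(v.begin(), v.end(), sketch_job_compare);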