// Try to send one job of the given non-CPU-intensive (NCI) app.
//
// Walks the shared job array from slot 0 looking for an entry that
// belongs to `app` and is either present or already checked out by
// this process.
//
// Return values:
//    0   a job was added to the reply
//    1   the whole array was scanned and nothing could be sent
//   -1   can_send_nci() rejected a job of this app
//
// The array semaphore is held while scanning and is always released
// before returning.
//
static int send_job_for_app(APP& app) {
    BEST_APP_VERSION* bavp;
    SCHED_DB_RESULT result;

    lock_sema();
    for (int slot = 0; slot < ssp->max_wu_results; slot++) {
        WU_RESULT& wu_result = ssp->wu_results[slot];

        // only consider slots that hold a job we are allowed to take
        //
        bool usable = (wu_result.state == WR_STATE_PRESENT)
            || (wu_result.state == g_pid);
        if (!usable) {
            continue;
        }

        // work on a private copy of the workunit
        //
        WORKUNIT wu = wu_result.workunit;
        if (wu.appid != app.id) {
            continue;
        }

        if (!can_send_nci(wu_result, wu, bavp, &app)) {
            // Every job of an NCI app is the same,
            // so one refusal means there is no point looking further.
            //
            unlock_sema();
            log_messages.printf(MSG_NORMAL,
                "can_send_nci() failed for NCI job\n"
            );
            return -1;
        }

        // check the slot out, then drop the semaphore for the DB work
        //
        wu_result.state = g_pid;
        unlock_sema();

        // copy the result ID out before the slot is declared empty
        //
        result.id = wu_result.resultid;
        wu_result.state = WR_STATE_EMPTY;

        if (!result_still_sendable(result, wu)) {
            log_messages.printf(MSG_NORMAL,
                "NCI job was not still sendable\n"
            );
            // keep scanning; the scan itself runs with the semaphore held
            //
            lock_sema();
            continue;
        }

        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "Sending non-CPU-intensive job: %s\n", wu.name
            );
        }
        add_result_to_reply(result, wu, bavp, false);
        return 0;
    }

    log_messages.printf(MSG_NORMAL,
        "no sendable NCI jobs for %s\n", app.user_friendly_name
    );
    unlock_sema();
    return 1;
}
// Make a pass through the wu/results array, sending work. // The choice of jobs is limited by flags in g_wreq, as follows: // infeasible_only: // send only results that were previously infeasible for some host // reliable_only: // send only jobs with "need_reliable" set (e.g. retries) // and send them only w/ app versions that are "reliable" for this host // user_apps_only: // Send only jobs for apps selected by user // beta_only: // Send only jobs for beta-test apps // locality_sched_lite: // For apps that use locality sched Lite, // send only jobs for which the host already has at least 1 file // // Return true if no more work is needed. // static bool scan_work_array() { int i, j, rnd_off, last_retval=0;; APP* app; BEST_APP_VERSION* bavp; bool no_more_needed = false; SCHED_DB_RESULT result; // To minimize the amount of time we lock the array, // we initially scan without holding the lock. // If we find a job that passes quick_check(), // we acquire the lock and then check the job again. // bool sema_locked = false; rnd_off = rand() % ssp->max_wu_results; for (j=0; j<ssp->max_wu_results; j++) { i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] scanning slot %d\n", i ); } recheck: if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) { continue; } // make a copy of the WORKUNIT part, // which we can modify without affecting the cache // WORKUNIT wu = wu_result.workunit; app = ssp->lookup_app(wu_result.workunit.appid); if (app == NULL) { log_messages.printf(MSG_CRITICAL, "[WU#%lu] no app\n", wu_result.workunit.id ); continue; // this should never happen } if (app->non_cpu_intensive) continue; // do fast (non-DB) checks. 
// This may modify wu.rsc_fpops_est // if (!quick_check(wu_result, wu, bavp, app, last_retval)) { if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] slot %d failed quick check\n", i ); } continue; } if (!sema_locked) { lock_sema(); sema_locked = true; goto recheck; } // mark wu_result as checked out and release semaphore. // from here on in this loop, don't continue on failure; // instead, goto dont_send (so that we reacquire semaphore) // // Note: without the semaphore we don't have mutual exclusion; // ideally we should use a transaction from now until when // we commit to sending the results. wu_result.state = g_pid; unlock_sema(); sema_locked = false; switch (slow_check(wu_result, app, bavp)) { case 1: // if we couldn't send the result to this host, // set its state back to PRESENT // wu_result.state = WR_STATE_PRESENT; break; case 2: // can't send this job to any host // wu_result.state = WR_STATE_EMPTY; break; default: // slow_check() refreshes fields of wu_result.workunit; // update our copy too // wu.hr_class = wu_result.workunit.hr_class; wu.app_version_id = wu_result.workunit.app_version_id; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { add_result_to_reply(result, wu, bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. // If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. 
} break; } if (!work_needed(false)) { no_more_needed = true; break; } } if (sema_locked) { unlock_sema(); } return no_more_needed; }
// Make a pass through the wu/results array, sending work. // The choice of jobs is limited by flags in g_wreq, as follows: // infeasible_only: // send only results that were previously infeasible for some host // reliable_only: // send only retries // user_apps_only: // Send only jobs for apps selected by user // beta_only: // Send only jobs for beta-test apps // // Return true if no more work is needed. // static bool scan_work_array() { int i, j, retval, rnd_off, last_retval=0;; APP* app; BEST_APP_VERSION* bavp; bool no_more_needed = false; DB_RESULT result; lock_sema(); rnd_off = rand() % ssp->max_wu_results; for (j=0; j<ssp->max_wu_results; j++) { i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; WORKUNIT wu = wu_result.workunit; // do fast (non-DB) checks // if (!quick_check(wu_result, wu, bavp, app, last_retval)) { continue; } // mark wu_result as checked out and release semaphore. // from here on in this loop, don't continue on failure; // instead, goto dont_send (so that we reacquire semaphore) // // Note: without the semaphore we don't have mutual exclusion; // ideally we should use a transaction from now until when // we commit to sending the results. wu_result.state = g_pid; unlock_sema(); if (!slow_check(wu_result, wu, app)) { // if we couldn't send the result to this host, // set its state back to PRESENT // wu_result.state = WR_STATE_PRESENT; } else { result.id = wu_result.resultid; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // if (result_still_sendable(result, wu)) { retval = add_result_to_reply(result, wu, bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. 
// If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. } } lock_sema(); if (!work_needed(false)) { no_more_needed = true; break; } } unlock_sema(); return no_more_needed; }
// send work for a particular processor type // void send_work_score_type(int rt) { vector<JOB> jobs; if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] scanning for %s jobs\n", proc_type_name(rt) ); } clear_others(rt); int nscan = config.mm_max_slots; if (!nscan) nscan = ssp->max_wu_results; int rnd_off = rand() % ssp->max_wu_results; for (int j=0; j<nscan; j++) { int i = (j+rnd_off) % ssp->max_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; if (wu_result.state != WR_STATE_PRESENT) { continue; } WORKUNIT wu = wu_result.workunit; JOB job; job.app = ssp->lookup_app(wu.appid); if (job.app->non_cpu_intensive) continue; job.bavp = get_app_version(wu, true, false); if (!job.bavp) continue; job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(wu_result)) { continue; } jobs.push_back(job); } std::sort(jobs.begin(), jobs.end(), job_compare); bool sema_locked = false; for (unsigned int i=0; i<jobs.size(); i++) { if (!work_needed(false)) { break; } if (!g_wreq->need_proc_type(rt)) { break; } JOB& job = jobs[i]; if (!sema_locked) { lock_sema(); sema_locked = true; } // make sure the job is still in the cache // array is locked at this point. // WU_RESULT& wu_result = ssp->wu_results[job.index]; if (wu_result.state != WR_STATE_PRESENT) { continue; } if (wu_result.resultid != job.result_id) { continue; } WORKUNIT wu = wu_result.workunit; int retval = wu_is_infeasible_fast( wu, wu_result.res_server_state, wu_result.res_priority, wu_result.res_report_deadline, *job.app, *job.bavp ); if (retval) { continue; } wu_result.state = g_pid; // It passed fast checks. 
// Release sema and do slow checks // unlock_sema(); sema_locked = false; switch (slow_check(wu_result, job.app, job.bavp)) { case 1: wu_result.state = WR_STATE_PRESENT; break; case 2: wu_result.state = WR_STATE_EMPTY; break; default: // slow_check() refreshes fields of wu_result.workunit; // update our copy too // wu.hr_class = wu_result.workunit.hr_class; wu.app_version_id = wu_result.workunit.app_version_id; // mark slot as empty AFTER we've copied out of it // (since otherwise feeder might overwrite it) // wu_result.state = WR_STATE_EMPTY; // reread result from DB, make sure it's still unsent // TODO: from here to end of add_result_to_reply() // (which updates the DB record) should be a transaction // SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { add_result_to_reply(result, wu, job.bavp, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. // If this happens, don't replace the record in the array // (we can't anyway, since we marked the entry as "empty"). // The feeder will eventually pick it up again, // and hopefully the problem won't happen twice. } break; } } if (sema_locked) { unlock_sema(); } restore_others(rt); g_wreq->best_app_versions.clear(); }