// A host reported a bad (error) result.
// Penalize the corresponding host_app_version:
// decrement its daily job quota (floor of 1)
// and reset its consecutive-valid streak.
//
static inline void got_bad_result(SCHED_RESULT_ITEM& sri) {
    // Use DB_ID_TYPE (as got_good_result() does) so large generalized
    // IDs aren't truncated, and so the %ld formats below are correct.
    //
    DB_ID_TYPE gavid = generalized_app_version_id(sri.app_version_id, sri.appid);
    DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
    if (!havp) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %ld\n", gavid
            );
        }
        return;
    }

    // decrement max_jobs_per_day, clamping to [1, config.daily_result_quota]
    //
    int n = havp->max_jobs_per_day;
    if (n > config.daily_result_quota) {
        n = config.daily_result_quota;
    }
    n -= 1;
    if (n < 1) {
        n = 1;
    }
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] decreasing max_jobs_per_day for %ld: %d->%d\n",
            gavid, havp->max_jobs_per_day, n
        );
    }
    havp->max_jobs_per_day = n;

    // a bad result breaks the host's run of valid results
    //
    havp->consecutive_valid = 0;
}
// got a SUCCESS result. Doesn't mean it's valid! // static inline void got_good_result(SCHED_RESULT_ITEM& sri) { DB_ID_TYPE gavid = generalized_app_version_id(sri.app_version_id, sri.appid); DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid); if (!havp) { if (config.debug_handle_results) { log_messages.printf(MSG_NORMAL, "[handle] No app version for %ld\n", gavid ); } return; } if (havp->max_jobs_per_day < config.daily_result_quota) { int n = havp->max_jobs_per_day*2; if (n > config.daily_result_quota) { n = config.daily_result_quota; } if (config.debug_quota) { log_messages.printf(MSG_NORMAL, "[quota] increasing max_jobs_per_day for %ld: %d->%d\n", gavid, havp->max_jobs_per_day, n ); } havp->max_jobs_per_day = n; } }
// Return the host_app_version record for this BEST_APP_VERSION.
// Anonymous-platform versions (cavp set) are keyed by a generalized
// app version ID; regular versions are keyed by the app_version's ID.
//
DB_HOST_APP_VERSION* BEST_APP_VERSION::host_app_version() {
    if (!cavp) {
        return gavid_to_havp(avp->id);
    }
    return gavid_to_havp(
        generalized_app_version_id(host_usage.resource_type(), appid)
    );
}
// called at start of send_work(). // Estimate FLOPS of anon platform versions, // and compute scaling factor for wu.rsc_fpops // void estimate_flops_anon_platform() { unsigned int i; for (i=0; i<g_request->client_app_versions.size(); i++) { CLIENT_APP_VERSION& cav = g_request->client_app_versions[i]; cav.rsc_fpops_scale = 1; if (cav.host_usage.avg_ncpus == 0 && cav.host_usage.ncudas == 0 && cav.host_usage.natis == 0) { cav.host_usage.avg_ncpus = 1; } // current clients fill in host_usage.flops with peak FLOPS // if it's missing from app_info.xml; // however, for older clients, we need to fill it in ourselves; // assume it uses 1 CPU // if (cav.host_usage.projected_flops == 0) { cav.host_usage.projected_flops = g_reply->host.p_fpops; } // At this point host_usage.projected_flops is filled in with something. // See if we have a better estimated based on history // DB_HOST_APP_VERSION* havp = gavid_to_havp( generalized_app_version_id( cav.host_usage.resource_type(), cav.app->id ) ); if (havp && havp->et.n > MIN_HOST_SAMPLES) { double new_flops = 1./havp->et.get_avg(); cav.rsc_fpops_scale = cav.host_usage.projected_flops/new_flops; cav.host_usage.projected_flops = new_flops; if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] (%s) setting projected flops to %fG based on ET\n", cav.plan_class, new_flops/1e9 ); } } else { if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] (%s) using client-supplied flops %fG\n", cav.plan_class, cav.host_usage.projected_flops ); } } } }
// Compute or estimate "claimed peak FLOP count" (PFC) for a result.
// Possibly update host_app_version records and write to DB.
// Possibly update app_version records in memory and let caller write to DB,
// to merge DB writes.
//
// Outputs:
//   pfc  - the claimed peak FLOP count
//   mode - how pfc was obtained (PFC_MODE_NORMAL / APPROX / WU_EST)
// Returns 0 on success, or a DB/lookup error code.
//
int get_pfc(
    RESULT& r, WORKUNIT& wu, DB_APP& app,       // in
    vector<DB_APP_VERSION>& app_versions,       // in/out
    DB_HOST_APP_VERSION& hav,                   // in/out
    double& pfc, int& mode                      // out
) {
    DB_APP_VERSION* avp=0;
    int retval;

    mode = PFC_MODE_APPROX;

    // runtime outliers still get credit, but are excluded from
    // the statistics updates below
    //
    if (r.runtime_outlier && config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] runtime outlier, not updating stats\n",
            r.id
        );
    }

    // is result from old scheduler that didn't set r.app_version_id correctly?
    // if so, use WU estimate (this is a transient condition)
    //
    if (r.app_version_id == 0 || r.app_version_id == 1) {
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] missing app_version_id (%d): returning WU default %.2f\n",
                r.id, r.app_version_id, wu_estimated_credit(wu, app)
            );
        }
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    // temporary kludge for SETI@home:
    // if GPU initialization fails the app falls back to CPU.
    //
    if (strstr(r.stderr_out, "Device Emulation (CPU)")) {
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d][AV#%d] CUDA app fell back to CPU; returning WU default %.2f\n",
                r.id, r.app_version_id, wu.rsc_fpops_est*COBBLESTONE_SCALE
            );
        }
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    int gavid = generalized_app_version_id(r.app_version_id, r.appid);

    // transition case: no host_app_version record yet
    //
    if (!hav.host_id) {
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    // old clients report CPU time but not elapsed time.
    // Use HOST_APP_VERSION.et to track statistics of CPU time.
    //
    if (r.elapsed_time < 1e-6) {
        // in case buggy client reports elapsed time like 1e-304
        //
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client (elapsed time not reported)\n",
                r.id
            );
        }
        if (!r.runtime_outlier) {
            hav.et.update_var(
                r.cpu_time/wu.rsc_fpops_est,
                HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
            );
        }
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client: raw credit %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }

        // scale by the host's CPU-time history, but only if we have
        // enough samples and (optionally) a run of valid results
        //
        bool do_scale = true;
        if (hav.et.n < MIN_HOST_SAMPLES || (hav.et.get_avg() <= 0)) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: no host scaling - zero or too few samples %f\n",
                    r.id, hav.et.n
                );
            }
        }
        if (do_scale
            && app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: no host scaling - cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale) {
            double s = r.cpu_time / (hav.et.get_avg()*wu.rsc_fpops_est);
            pfc *= s;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: scaling (based on CPU time) by %g, return %.2f\n",
                    r.id, s, pfc*COBBLESTONE_SCALE
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client: returning PFC %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
        return 0;
    }

    // r.flops_estimate should be positive
    // but (because of scheduler bug) it may not be.
    // At this point we don't have much to go on, so use 1e10.
    //
    if (r.flops_estimate <= 0) {
        r.flops_estimate = 1e10;
    }

    double raw_pfc = (r.elapsed_time * r.flops_estimate);
    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] raw credit: %.2f (%.2f sec, %.2f est GFLOPS)\n",
            r.id, raw_pfc*COBBLESTONE_SCALE, r.elapsed_time,
            r.flops_estimate/1e9
        );
    }

    // Sanity check: claimed PFC can't exceed the WU's FLOP bound.
    // If it does, fall back to the WU estimate and zero the host's
    // consecutive-valid counter in the DB.
    //
    if (raw_pfc > wu.rsc_fpops_bound) {
        char query[256], clause[256];
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] sanity check failed: %.2f>%.2f, return %.2f\n",
                r.id, raw_pfc*COBBLESTONE_SCALE,
                wu.rsc_fpops_bound*COBBLESTONE_SCALE, pfc*COBBLESTONE_SCALE
            );
        }
        sprintf(query, "consecutive_valid=0");
        sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid);
        retval = hav.update_fields_noid(query, clause);
        return retval;
    }

    if (r.app_version_id < 0) {
        // anon platform
        //
        bool do_scale = true;
        if (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()<=0) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, not scaling, PFC avg zero or too few samples %.0f\n",
                    r.id, hav.pfc.n
                );
            }
        }
        if (do_scale
            && app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, not scaling, cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale) {
            // normalize the host's PFC history against the app-wide average
            //
            double scale = app.min_avg_pfc / hav.pfc.get_avg();
            pfc = raw_pfc * scale;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, scaling by %g (%.2f/%.2f)\n",
                    r.id, scale, app.min_avg_pfc, hav.pfc.get_avg()
                );
            }
        } else {
            pfc = wu_estimated_pfc(wu, app);
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not scaling, using app avg %.2f\n",
                    r.id, pfc*COBBLESTONE_SCALE
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] anon platform, returning %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
    } else {
        // normal case: known app version
        //
        avp = av_lookup(r.app_version_id, app_versions);
        if (!avp) {
            log_messages.printf(MSG_CRITICAL,
                "get_pfc() [RESULT#%d]: No AVP %d!!\n", r.id, r.app_version_id
            );
            return ERR_NOT_FOUND;
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] [AV#%d] normal case. %.0f sec, %.1f GFLOPS. raw credit: %.2f\n",
                r.id, avp->id, r.elapsed_time, r.flops_estimate/1e9,
                raw_pfc*COBBLESTONE_SCALE
            );
        }

        // decide whether to apply host scaling:
        // need a valid-result streak (if checked), enough host and
        // app-version samples, and a positive host PFC average
        //
        bool do_scale = true;
        double host_scale = 0;
        if (app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale && (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()==0)) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - HAV PFC zero or too few samples %.0f\n",
                    r.id, hav.pfc.n
                );
            }
        }
        if (do_scale && avp->pfc.n < MIN_VERSION_SAMPLES) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - app_version PFC too few samples%.0f\n",
                    r.id, avp->pfc.n
                );
            }
        }
        if (do_scale && hav.pfc.get_avg() <= 0) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - HAV PFC is zero\n",
                    r.id
                );
            }
        }
        if (do_scale) {
            // host scale is capped at 10 to bound the effect of
            // anomalous host histories
            //
            host_scale = avp->pfc.get_avg() / hav.pfc.get_avg();
            if (host_scale > 10) host_scale = 10;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] host scale: %.2f (%f/%f)\n",
                    r.id, host_scale, avp->pfc.get_avg(), hav.pfc.get_avg()
                );
            }
        }

        pfc = raw_pfc;
        if (avp->pfc_scale) {
            pfc *= avp->pfc_scale;
            // mode is NORMAL only when both version and host scaling applied
            //
            if (host_scale) {
                pfc *= host_scale;
                mode = PFC_MODE_NORMAL;
            }
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] applying app version scale %.3f\n",
                    r.id, avp->pfc_scale
                );
            }
        } else {
            if (host_scale) {
                pfc *= host_scale;
            }
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] no app version scale\n",
                    r.id
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] [AV#%d] PFC avgs with %g (%g/%g)\n",
                r.id, avp->id,
                raw_pfc/wu.rsc_fpops_est, raw_pfc, wu.rsc_fpops_est
            );
        }
        // record normalized PFC sample for the app version
        // (caller flushes these to the DB)
        //
        double x = raw_pfc / wu.rsc_fpops_est;
        if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) {
            avp->pfc_samples.push_back(x);
        }
    }

    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] updating HAV PFC %.2f et %g turnaround %d\n",
            r.id, raw_pfc / wu.rsc_fpops_est,
            r.elapsed_time / wu.rsc_fpops_est,
            (r.received_time - r.sent_time)
        );
    }

    // update the host_app_version statistics (unless outlier)
    //
    double x = raw_pfc / wu.rsc_fpops_est;
    if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) {
        hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
    }
    if (!r.runtime_outlier) {
        hav.et.update_var(
            r.elapsed_time / wu.rsc_fpops_est,
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
        hav.turnaround.update_var(
            (r.received_time - r.sent_time),
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
    }

    // keep track of credit per app version
    //
    if (avp) {
        avp->credit_samples.push_back(pfc*COBBLESTONE_SCALE);
        avp->credit_times.push_back(r.sent_time);
    }
    return 0;
}
// input:
// cav.host_usage.projected_flops
//      This is the <flops> specified in app_info.xml
//      If not specified there, it's a conservative estimate
//      (CPU speed * (ncpus + 10*ngpus))
//      In either case, this value will be used by the client
//      to estimate job runtime and runtime limit
//          est runtime = wu.rsc_fpops_est/x
//          runtime limit = wu.rsc_fpops_bound/x
//      x may be way off from the actual speed.
//      So to get accurate runtime est, we need to adjust wu.rsc_fpops_est
//
// output:
// cav.host_usage.projected_flops
//      An estimate of the actual FLOPS the app will get,
//      based on elapsed time history (if possible).
//      This is used by the scheduler to estimate runtime.
// cav.rsc_fpops_scale
//      wu.rsc_fpops_est and wu.rsc_fpops_bound will be scaled by this
//
// called at start of send_work().
//
void estimate_flops_anon_platform() {
    unsigned int i;
    for (i=0; i<g_request->client_app_versions.size(); i++) {
        CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
        // skip entries with no associated app
        //
        if (!cav.app) continue;
        cav.rsc_fpops_scale = 1;

        // CPU versions with no declared usage: assume 1 CPU
        //
        if (cav.host_usage.avg_ncpus == 0
            && cav.host_usage.proc_type == PROC_TYPE_CPU
        ) {
            cav.host_usage.avg_ncpus = 1;
        }

        // if projected_flops is missing, make a wild guess
        // Note: 6.12+ clients supply a projected FLOPS,
        // even if the user didn't
        //
        if (cav.host_usage.projected_flops == 0) {
            cav.host_usage.projected_flops = g_reply->host.p_fpops;
        }

        // If data is available, estimate FLOPS based on average elapsed time
        //
        DB_HOST_APP_VERSION* havp = gavid_to_havp(
            generalized_app_version_id(
                cav.host_usage.resource_type(), cav.app->id
            )
        );
        if (havp
            && (havp->et.n > MIN_HOST_SAMPLES)
            && (havp->et.get_avg() > 0)
        ) {
            double new_flops = 1./havp->et.get_avg();
            // cap this at ET_RATIO_LIMIT*projected,
            // in case we've had a bunch of short jobs recently
            //
            if (new_flops > ET_RATIO_LIMIT*cav.host_usage.projected_flops) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] (%s) capping new_flops; %.1fG > %.0f*%.1fG\n",
                        cav.plan_class, new_flops/1e9,
                        ET_RATIO_LIMIT, cav.host_usage.projected_flops/1e9
                    );
                }
                new_flops = ET_RATIO_LIMIT*cav.host_usage.projected_flops;
            }
            cav.rsc_fpops_scale = cav.host_usage.projected_flops/new_flops;
            cav.host_usage.projected_flops = new_flops;
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] (%s) setting projected flops to %fG based on ET\n",
                    cav.plan_class, new_flops/1e9
                );
                log_messages.printf(MSG_NORMAL,
                    "[version] setting rsc_fpops_scale to %g\n",
                    cav.rsc_fpops_scale
                );
            }
        } else {
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] (%s) using client-supplied flops %fG\n",
                    cav.plan_class, cav.host_usage.projected_flops/1e9
                );
            }
        }
    }
}
// A result timed out; penalize the corresponding host_app_version // static int result_timed_out( TRANSITIONER_ITEM res_item, TRANSITIONER_ITEM& wu_item ) { DB_HOST_APP_VERSION hav; char query[512], clause[512]; int gavid = generalized_app_version_id( res_item.res_app_version_id, wu_item.appid ); int retval = hav_lookup(hav, res_item.res_hostid, gavid); if (retval) { log_messages.printf(MSG_NORMAL, "result_timed_out(): hav_lookup failed: %s\n", boincerror(retval) ); return 0; } hav.turnaround.update_var( (double)wu_item.delay_bound, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT ); int n = hav.max_jobs_per_day; if (n == 0) { n = config.daily_result_quota; } if (n > config.daily_result_quota) { n = config.daily_result_quota; } n -= 1; if (n < 1) { n = 1; } if (config.debug_quota) { log_messages.printf(MSG_NORMAL, "[quota] max_jobs_per_day for %d; %d->%d\n", gavid, hav.max_jobs_per_day, n ); } hav.max_jobs_per_day = n; hav.consecutive_valid = 0; sprintf(query, "turnaround_n=%.15e, turnaround_avg=%.15e, turnaround_var=%.15e, turnaround_q=%.15e, max_jobs_per_day=%d, consecutive_valid=%d", hav.turnaround.n, hav.turnaround.avg, hav.turnaround.var, hav.turnaround.q, hav.max_jobs_per_day, hav.consecutive_valid ); sprintf(clause, "host_id=%d and app_version_id=%d", hav.host_id, hav.app_version_id ); retval = hav.update_fields_noid(query, clause); if (retval) { log_messages.printf(MSG_CRITICAL, "CRITICAL result_timed_out(): hav updated failed: %s\n", boincerror(retval) ); } return 0; }
// handle a workunit which has new results
//
// This variant assumes the WU has no canonical result yet:
// collect the successful results, and if there are enough for a
// quorum run check_set() to try to pick a canonical one,
// grant credit, and update results/hosts/host_app_versions.
// Finally schedule the WU's next transition and clear need_validate.
//
int handle_wu(
    DB_VALIDATOR_ITEM_SET& validator, std::vector<VALIDATOR_ITEM>& items
) {
    int canonical_result_index = -1;
    bool update_result, retry;
    TRANSITION_TIME transition_time = NO_CHANGE;
    int retval = 0, canonicalid = 0, x;
    double credit = 0;
    unsigned int i;

    WORKUNIT& wu = items[0].wu;
    g_wup = &wu;
    vector<RESULT> results;
    vector<DB_HOST_APP_VERSION> host_app_versions, host_app_versions_orig;
    int nsuccess_results;

    // Here if WU doesn't have a canonical result yet.
    // Try to get one

    log_messages.printf(MSG_NORMAL,
        "[WU#%d %s] handle_wu(): No canonical result yet\n",
        wu.id, wu.name
    );
    ++log_messages;

    // make a vector of the successful results,
    // and a parallel vector of host_app_versions
    //
    for (i=0; i<items.size(); i++) {
        RESULT& result = items[i].res;
        if ((result.server_state == RESULT_SERVER_STATE_OVER) &&
            (result.outcome == RESULT_OUTCOME_SUCCESS)
        ) {
            results.push_back(result);
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                hav.host_id=0;   // flag that it's missing
            }
            host_app_versions.push_back(hav);
            // keep the original so we can later write only the diff
            //
            host_app_versions_orig.push_back(hav);
        }
    }
    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] Found %d successful results\n",
        wu.id, wu.name, (int)results.size()
    );
    if (results.size() >= (unsigned int)wu.min_quorum) {
        log_messages.printf(MSG_DEBUG,
            "[WU#%d %s] Enough for quorum, checking set.\n",
            wu.id, wu.name
        );

        // project-supplied check: may pick a canonical result
        //
        double dummy;
        retval = check_set(results, wu, canonicalid, dummy, retry);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%d %s] check_set returned %d, exiting\n",
                wu.id, wu.name, retval
            );
            return retval;
        }
        if (retry) transition_time = DELAYED;

        if (credit_from_wu) {
            retval = get_credit_from_wu(wu, results, credit);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] get_credit_from_wu returned %d\n",
                    wu.id, wu.name, retval
                );
                return retval;
            }
        }

        if (canonicalid) {
            // update credit statistics (and possibly credit itself)
            //
            retval = assign_credit_set(
                wu, results, app, app_versions, host_app_versions,
                max_granted_credit, credit
            );
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] assign_credit_set() returned %d\n",
                    wu.id, wu.name, retval
                );
                transition_time = DELAYED;
                goto leave;
            }
        }

        if (max_granted_credit && credit>max_granted_credit) {
            credit = max_granted_credit;
        }

        // scan results.
        // update as needed, and count the # of results
        // that are still outcome=SUCCESS
        // (some may have changed to VALIDATE_ERROR)
        //
        nsuccess_results = 0;
        for (i=0; i<results.size(); i++) {
            RESULT& result = results[i];
            DB_HOST_APP_VERSION& hav = host_app_versions[i];
            DB_HOST_APP_VERSION& hav_orig = host_app_versions_orig[i];
            update_result = false;
            bool update_host = false;
            if (result.outcome == RESULT_OUTCOME_VALIDATE_ERROR) {
                transition_time = IMMEDIATE;
                update_result = true;
            } else {
                nsuccess_results++;
            }

            // load the host record only for the states that need it
            //
            DB_HOST host;
            HOST host_initial;
            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
            case VALIDATE_STATE_INVALID:
                retval = host.lookup_id(result.hostid);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d] lookup of host %d failed %d\n",
                        result.id, result.hostid, retval
                    );
                    continue;
                }
                host_initial = host;
            }

            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
                update_result = true;
                update_host = true;
                retval = is_valid(host, result, wu, host_app_versions[i]);
                if (retval) {
                    log_messages.printf(MSG_DEBUG,
                        "[RESULT#%d %s] is_valid() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
                grant_credit(
                    host, result.sent_time, result.cpu_time,
                    result.granted_credit
                );
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Valid; granted %f credit [HOST#%d]\n",
                    result.id, result.name, result.granted_credit,
                    result.hostid
                );
                break;
            case VALIDATE_STATE_INVALID:
                update_result = true;
                update_host = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Invalid [HOST#%d]\n",
                    result.id, result.name, result.hostid
                );
                is_invalid(host_app_versions[i]);
                break;
            case VALIDATE_STATE_INIT:
                // not enough info yet; mark as inconclusive
                //
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Inconclusive [HOST#%d]\n",
                    result.id, result.name, result.hostid
                );
                result.validate_state = VALIDATE_STATE_INCONCLUSIVE;
                update_result = true;
                break;
            }

            // write back only the records that changed
            //
            if (hav.host_id) {
                retval = hav.update_validator(hav_orig);
            }
            if (update_host) {
                retval = host.update_diff_validator(host_initial);
            }
            if (update_result) {
                retval = validator.update_result(result);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] result.update() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
        }

        if (canonicalid) {
            // if we found a canonical result,
            // trigger the assimilator, but do NOT trigger
            // the transitioner - doing so creates a race condition
            //
            transition_time = NEVER;
            log_messages.printf(MSG_DEBUG,
                "[WU#%d %s] Found a canonical result: id=%d\n",
                wu.id, wu.name, canonicalid
            );
            wu.canonical_resultid = canonicalid;
            wu.canonical_credit = credit;
            wu.assimilate_state = ASSIMILATE_READY;

            // don't need to send any more results
            //
            for (i=0; i<items.size(); i++) {
                RESULT& result = items[i].res;
                if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
                    continue;
                }
                result.server_state = RESULT_SERVER_STATE_OVER;
                result.outcome = RESULT_OUTCOME_DIDNT_NEED;
                retval = validator.update_result(result);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] result.update() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
        } else {
            // here if no consensus.
            // check if #success results is too large
            //
            if (nsuccess_results > wu.max_success_results) {
                wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
                transition_time = IMMEDIATE;
            }

            // if #success results >= target_nresults,
            // we need more results, so bump target_nresults
            // NOTE: nsuccess_results should never be > target_nresults,
            // but accommodate that if it should happen
            //
            if (nsuccess_results >= wu.target_nresults) {
                wu.target_nresults = nsuccess_results+1;
                transition_time = IMMEDIATE;
            }
        }
    }

leave:
    --log_messages;

    // schedule the WU's next transition according to what happened above
    //
    switch (transition_time) {
    case IMMEDIATE:
        wu.transition_time = time(0);
        break;
    case DELAYED:
        x = time(0) + 6*3600;
        if (x < wu.transition_time) wu.transition_time = x;
        break;
    case NEVER:
        wu.transition_time = INT_MAX;
        break;
    case NO_CHANGE:
        break;
    }

    wu.need_validate = 0;

    retval = validator.update_workunit(wu);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] update_workunit() failed: %d; exiting\n",
            wu.id, wu.name, retval
        );
        return retval;
    }
    return 0;
}
// handle a workunit which has new results
//
// Two cases:
// 1) the WU already has a canonical result: compare each unchecked
//    result against it (check_pair), grant the canonical credit
//    to matches, and mark mismatches invalid.
// 2) no canonical result yet: gather the viable results and, if a
//    quorum exists, run check_set() to try to pick one; then decide
//    credit and update results/hosts/host_app_versions.
// Finally schedule the WU's next transition and clear need_validate.
// All DB writes are skipped when dry_run is set.
//
int handle_wu(
    DB_VALIDATOR_ITEM_SET& validator, std::vector<VALIDATOR_ITEM>& items
) {
    int canonical_result_index = -1;
    bool update_result, retry;
    TRANSITION_TIME transition_time = NO_CHANGE;
    int retval = 0, x;
    DB_ID_TYPE canonicalid = 0;
    double credit = 0;
    unsigned int i;

    WORKUNIT& wu = items[0].wu;
    g_wup = &wu;

    if (wu.canonical_resultid) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%lu %s] Already has canonical result %lu\n",
            wu.id, wu.name, wu.canonical_resultid
        );
        ++log_messages;

        // Here if WU already has a canonical result.
        // Get unchecked results and see if they match the canonical result
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;
            if (result.id == wu.canonical_resultid) {
                canonical_result_index = i;
            }
        }
        if (canonical_result_index == -1) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu %s] Can't find canonical result %lu\n",
                wu.id, wu.name, wu.canonical_resultid
            );
            return 0;
        }
        RESULT& canonical_result = items[canonical_result_index].res;

        // scan this WU's results, and check the unchecked ones
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;
            if (result.server_state != RESULT_SERVER_STATE_OVER) continue;
            if (result.outcome != RESULT_OUTCOME_SUCCESS) continue;

            // only INIT and INCONCLUSIVE results still need checking
            //
            switch (result.validate_state) {
            case VALIDATE_STATE_INIT:
            case VALIDATE_STATE_INCONCLUSIVE:
                break;
            default:
                continue;
            }
            log_messages.printf(MSG_NORMAL,
                "[WU#%lu] handle_wu(): testing result %lu\n",
                wu.id, result.id
            );
            check_pair(result, canonical_result, retry);
            if (retry) {
                // this usually means an NFS mount has failed;
                // arrange to try again later.
                //
                transition_time = DELAYED;
                goto leave;
            }
            update_result = false;
            if (result.outcome == RESULT_OUTCOME_VALIDATE_ERROR) {
                update_result = true;
            }

            // this might be last result, so let transitioner
            // trigger file delete etc. if needed
            //
            transition_time = IMMEDIATE;

            DB_HOST host;
            retval = host.lookup_id(result.hostid);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%lu] lookup of host %lu failed: %s\n",
                    result.id, result.hostid, boincerror(retval)
                );
                continue;
            }
            HOST host_initial = host;

            bool update_hav = false;
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%lu %s] hav_lookup returned %d\n",
                    result.id, result.name, retval
                );
                hav.host_id = 0;   // flag that it's missing
            }
            DB_HOST_APP_VERSION hav_orig = hav;

            // assign_credit_set() takes vectors; wrap the single
            // result and host_app_version
            //
            vector<DB_HOST_APP_VERSION> havv;
            havv.push_back(hav);
            vector<RESULT> rv;
            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
                update_result = true;
                update_hav = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] pair_check() matched: setting result to valid\n",
                    result.id, result.name
                );
                retval = is_valid(host, result, wu, havv[0]);
                if (retval) {
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] is_valid() error: %s\n",
                        result.id, result.name, boincerror(retval)
                    );
                }
                // do credit computation, but grant credit of canonical result
                //
                rv.push_back(result);
                assign_credit_set(
                    wu, rv, app, app_versions, havv, max_granted_credit, credit
                );
                if (!no_credit) {
                    result.granted_credit = canonical_result.granted_credit;
                    grant_credit(host, result.sent_time, result.granted_credit);
                    if (config.credit_by_app) {
                        grant_credit_by_app(result, result.granted_credit);
                    }
                }
                break;
            case VALIDATE_STATE_INVALID:
                update_result = true;
                update_hav = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] pair_check() didn't match: setting result to invalid\n",
                    result.id, result.name
                );
                is_invalid(havv[0]);
            }
            if (hav.host_id && update_hav) {
                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    log_messages.printf(MSG_NORMAL,
                        "[HOST#%lu AV#%lu] [outlier=%d] Updating HAV in DB. pfc.n=%f->%f\n",
                        havv[0].host_id, havv[0].app_version_id,
                        result.runtime_outlier, hav_orig.pfc.n, havv[0].pfc.n
                    );
                    retval=havv[0].update_validator(hav_orig);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[HOST#%lu AV%lu] hav.update_validator() failed: %s\n",
                            hav.host_id, hav.app_version_id, boincerror(retval)
                        );
                    }
                }
            }
            host.update_diff_validator(host_initial);
            if (update_result) {
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] granted_credit %f\n",
                    result.id, result.name, result.granted_credit
                );
                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    retval = validator.update_result(result);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[RESULT#%lu %s] Can't update result: %s\n",
                            result.id, result.name, boincerror(retval)
                        );
                    }
                }
            }
        }
    } else {
        // Here if WU doesn't have a canonical result yet.
        // Try to get one

        vector<RESULT> viable_results;
        vector<DB_HOST_APP_VERSION> host_app_versions, host_app_versions_orig;

        log_messages.printf(MSG_NORMAL,
            "[WU#%lu %s] handle_wu(): No canonical result yet\n",
            wu.id, wu.name
        );
        ++log_messages;

        // make a vector of the "viable" (i.e. possibly canonical) results,
        // and a parallel vector of host_app_versions
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;
            if (result.server_state != RESULT_SERVER_STATE_OVER) continue;
            if (result.outcome != RESULT_OUTCOME_SUCCESS) continue;
            if (result.validate_state == VALIDATE_STATE_INVALID) continue;
            viable_results.push_back(result);
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                hav.host_id=0;   // flag that it's missing
            }
            host_app_versions.push_back(hav);
            // keep the original so we can later write only the diff
            //
            host_app_versions_orig.push_back(hav);
        }
        log_messages.printf(MSG_DEBUG,
            "[WU#%lu %s] Found %d viable results\n",
            wu.id, wu.name, (int)viable_results.size()
        );
        if (viable_results.size() >= (unsigned int)wu.min_quorum) {
            log_messages.printf(MSG_DEBUG,
                "[WU#%lu %s] Enough for quorum, checking set.\n",
                wu.id, wu.name
            );
            double dummy;
            retval = check_set(viable_results, wu, canonicalid, dummy, retry);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%lu %s] check_set() error: %s\n",
                    wu.id, wu.name, boincerror(retval)
                );
                return retval;
            }
            if (retry) transition_time = DELAYED;

            // if we found a canonical instance, decide on credit
            //
            if (canonicalid) {
                // always do the credit calculation, to update statistics,
                // even if we're granting credit a different way
                //
                retval = assign_credit_set(
                    wu, viable_results, app, app_versions,
                    host_app_versions, max_granted_credit, credit
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%lu %s] assign_credit_set(): %s\n",
                        wu.id, wu.name, boincerror(retval)
                    );
                    transition_time = DELAYED;
                    goto leave;
                }

                if (credit_from_wu) {
                    retval = get_credit_from_wu(wu, viable_results, credit);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%lu %s] get_credit_from_wu(): credit not specified in WU\n",
                            wu.id, wu.name
                        );
                        credit = 0;
                    }
                } else if (credit_from_runtime) {
                    // credit = canonical result's elapsed time
                    // (clamped to max_runtime) times its FLOPS estimate
                    //
                    credit = 0;
                    for (i=0; i<viable_results.size(); i++) {
                        RESULT& result = viable_results[i];
                        if (result.id == canonicalid) {
                            DB_HOST host;
                            retval = host.lookup_id(result.hostid);
                            if (retval) {
                                log_messages.printf(MSG_CRITICAL,
                                    "[WU#%lu %s] host %lu lookup failed\n",
                                    wu.id, wu.name, result.hostid
                                );
                                break;
                            }
                            double runtime = result.elapsed_time;
                            if (runtime <=0 || runtime > max_runtime) {
                                runtime = max_runtime;
                            }
                            credit = result.flops_estimate * runtime * COBBLESTONE_SCALE;
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%lu][RESULT#%lu] credit_from_runtime %.2f = %.0fs * %.2fGFLOPS\n",
                                wu.id, result.id,
                                credit, runtime, result.flops_estimate/1e9
                            );
                            break;
                        }
                    }
                } else if (no_credit) {
                    credit = 0;
                }
                if (max_granted_credit && credit>max_granted_credit) {
                    credit = max_granted_credit;
                }
            }

            // scan the viable results.
            // update as needed,
            // and count the # of results that are still viable
            // (some may now have outcome VALIDATE_ERROR,
            // or validate_state INVALID)
            //
            int n_viable_results = 0;
            for (i=0; i<viable_results.size(); i++) {
                RESULT& result = viable_results[i];
                DB_HOST_APP_VERSION& hav = host_app_versions[i];
                DB_HOST_APP_VERSION& hav_orig = host_app_versions_orig[i];

                update_result = false;
                bool update_host = false;
                if (result.outcome != RESULT_OUTCOME_SUCCESS
                    || result.validate_state == VALIDATE_STATE_INVALID
                ) {
                    transition_time = IMMEDIATE;
                    update_result = true;
                } else {
                    n_viable_results++;
                }

                // load the host record only for the states that need it
                //
                DB_HOST host;
                HOST host_initial;
                switch (result.validate_state) {
                case VALIDATE_STATE_VALID:
                case VALIDATE_STATE_INVALID:
                    retval = host.lookup_id(result.hostid);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[RESULT#%lu] lookup of host %lu: %s\n",
                            result.id, result.hostid, boincerror(retval)
                        );
                        continue;
                    }
                    host_initial = host;
                }

                switch (result.validate_state) {
                case VALIDATE_STATE_VALID:
                    update_result = true;
                    update_host = true;
                    retval = is_valid(host, result, wu, host_app_versions[i]);
                    if (retval) {
                        log_messages.printf(MSG_DEBUG,
                            "[RESULT#%lu %s] is_valid() failed: %s\n",
                            result.id, result.name, boincerror(retval)
                        );
                    }
                    if (!no_credit) {
                        result.granted_credit = credit;
                        grant_credit(host, result.sent_time, credit);
                        log_messages.printf(MSG_NORMAL,
                            "[RESULT#%lu %s] Valid; granted %f credit [HOST#%lu]\n",
                            result.id, result.name, result.granted_credit,
                            result.hostid
                        );
                        if (config.credit_by_app) {
                            grant_credit_by_app(result, credit);
                        }
                    }
                    break;
                case VALIDATE_STATE_INVALID:
                    update_result = true;
                    update_host = true;
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] Invalid [HOST#%lu]\n",
                        result.id, result.name, result.hostid
                    );
                    is_invalid(host_app_versions[i]);
                    break;
                case VALIDATE_STATE_INIT:
                    // not enough info yet; mark as inconclusive
                    //
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] Inconclusive [HOST#%lu]\n",
                        result.id, result.name, result.hostid
                    );
                    result.validate_state = VALIDATE_STATE_INCONCLUSIVE;
                    update_result = true;
                    break;
                }

                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    // write back only the records that changed
                    //
                    if (hav.host_id) {
                        log_messages.printf(MSG_NORMAL,
                            "[HOST#%lu AV#%lu] [outlier=%d] Updating HAV in DB. pfc.n=%f->%f\n",
                            hav.host_id, hav.app_version_id,
                            result.runtime_outlier, hav_orig.pfc.n, hav.pfc.n
                        );
                        retval = hav.update_validator(hav_orig);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[HOST#%lu AV%lu] hav.update_validator() failed: %s\n",
                                hav.host_id, hav.app_version_id, boincerror(retval)
                            );
                        }
                    }
                    if (update_host) {
                        retval = host.update_diff_validator(host_initial);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[HOST#%lu] host.update_diff_validator() failed: %s\n",
                                host.id, boincerror(retval)
                            );
                        }
                    }
                    if (update_result) {
                        retval = validator.update_result(result);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[RESULT#%lu %s] result.update() failed: %s\n",
                                result.id, result.name, boincerror(retval)
                            );
                        }
                    }
                }
            }

            if (canonicalid) {
                // if we found a canonical result,
                // trigger the assimilator, but do NOT trigger
                // the transitioner - doing so creates a race condition
                //
                transition_time = NEVER;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%lu %s] Found a canonical result: id=%lu\n",
                    wu.id, wu.name, canonicalid
                );
                wu.canonical_resultid = canonicalid;
                wu.canonical_credit = credit;
                wu.assimilate_state = ASSIMILATE_READY;

                // don't need to send any more results
                //
                for (i=0; i<items.size(); i++) {
                    RESULT& result = items[i].res;
                    if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
                        continue;
                    }
                    result.server_state = RESULT_SERVER_STATE_OVER;
                    result.outcome = RESULT_OUTCOME_DIDNT_NEED;
                    if (dry_run) {
                        log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                    } else {
                        retval = validator.update_result(result);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[RESULT#%lu %s] result.update() failed: %s\n",
                                result.id, result.name, boincerror(retval)
                            );
                        }
                    }
                }
            } else {
                // here if no consensus.
                // check if #viable results is too large
                //
                if (n_viable_results > wu.max_success_results) {
                    wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
                    transition_time = IMMEDIATE;
                }

                // if #viable results >= target_nresults,
                // we need more results, so bump target_nresults
                // NOTE: n_viable_results should never be > target_nresults,
                // but accommodate that if it should happen
                //
                if (n_viable_results >= wu.target_nresults) {
                    wu.target_nresults = n_viable_results+1;
                    transition_time = IMMEDIATE;
                }
            }
        }
    }

leave:
    --log_messages;

    // schedule the WU's next transition according to what happened above
    //
    switch (transition_time) {
    case IMMEDIATE:
        wu.transition_time = time(0);
        break;
    case DELAYED:
        x = time(0) + 6*3600;
        if (x < wu.transition_time) wu.transition_time = x;
        break;
    case NEVER:
        wu.transition_time = INT_MAX;
        break;
    case NO_CHANGE:
        break;
    }

    wu.need_validate = 0;

    if (dry_run) {
        log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
    } else {
        retval = validator.update_workunit(wu);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu %s] update_workunit() failed: %s\n",
                wu.id, wu.name, boincerror(retval)
            );
            return retval;
        }
    }
    return 0;
}