// Compute or estimate "claimed peak FLOP count". // Possibly update host_app_version records and write to DB. // Possibly update app_version records in memory and let caller write to DB, // to merge DB writes // int get_pfc( RESULT& r, WORKUNIT& wu, DB_APP& app, // in vector<DB_APP_VERSION>&app_versions, // in/out DB_HOST_APP_VERSION& hav, // in/out double& pfc, int& mode // out ) { DB_APP_VERSION* avp=0; int retval; mode = PFC_MODE_APPROX; if (r.runtime_outlier && config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] runtime outlier, not updating stats\n", r.id ); } // is result from old scheduler that didn't set r.app_version_id correctly? // if so, use WU estimate (this is a transient condition) // if (r.app_version_id == 0 || r.app_version_id == 1) { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] missing app_version_id (%d): returning WU default %.2f\n", r.id, r.app_version_id, wu_estimated_credit(wu, app) ); } mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } // temporary kludge for SETI@home: // if GPU initialization fails the app falls back to CPU. // if (strstr(r.stderr_out, "Device Emulation (CPU)")) { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d][AV#%d] CUDA app fell back to CPU; returning WU default %.2f\n", r.id, r.app_version_id, wu.rsc_fpops_est*COBBLESTONE_SCALE ); } mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } int gavid = generalized_app_version_id(r.app_version_id, r.appid); // transition case // if (!hav.host_id) { mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } // old clients report CPU time but not elapsed time. // Use HOST_APP_VERSION.et to track statistics of CPU time. // if (r.elapsed_time < 1e-6) { // in case buggy client reports elapsed time like 1e-304 if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client (elapsed time not reported)\n", r.id ); } if (!r.runtime_outlier) { hav.et.update_var( r.cpu_time/wu.rsc_fpops_est, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT ); } pfc = wu_estimated_pfc(wu, app); if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client: raw credit %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } bool do_scale = true; if (hav.et.n < MIN_HOST_SAMPLES || (hav.et.get_avg() <= 0)) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client: no host scaling - zero or too few samples %f\n", r.id, hav.et.n ); } } if (do_scale && app.host_scale_check && hav.consecutive_valid < CONS_VALID_HOST_SCALE ) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client: no host scaling - cons valid %d\n", r.id, hav.consecutive_valid ); } } if (do_scale) { double s = r.cpu_time / (hav.et.get_avg()*wu.rsc_fpops_est); pfc *= s; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client: scaling (based on CPU time) by %g, return %.2f\n", r.id, s, pfc*COBBLESTONE_SCALE ); } } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] old client: returning PFC %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } return 0; } // r.flops_estimate should be positive // but (because of scheduler bug) it may not be. // At this point we don't have much to go on, so use 1e10. 
// if (r.flops_estimate <= 0) { r.flops_estimate = 1e10; } double raw_pfc = (r.elapsed_time * r.flops_estimate); if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] raw credit: %.2f (%.2f sec, %.2f est GFLOPS)\n", r.id, raw_pfc*COBBLESTONE_SCALE, r.elapsed_time, r.flops_estimate/1e9 ); } // Sanity check // if (raw_pfc > wu.rsc_fpops_bound) { char query[256], clause[256]; pfc = wu_estimated_pfc(wu, app); if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] sanity check failed: %.2f>%.2f, return %.2f\n", r.id, raw_pfc*COBBLESTONE_SCALE, wu.rsc_fpops_bound*COBBLESTONE_SCALE, pfc*COBBLESTONE_SCALE ); } sprintf(query, "consecutive_valid=0"); sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid); retval = hav.update_fields_noid(query, clause); return retval; } if (r.app_version_id < 0) { // anon platform // bool do_scale = true; if (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()<=0) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] anon platform, not scaling, PFC avg zero or too few samples %.0f\n", r.id, hav.pfc.n ); } } if (do_scale && app.host_scale_check && hav.consecutive_valid < CONS_VALID_HOST_SCALE ) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] anon platform, not scaling, cons valid %d\n", r.id, hav.consecutive_valid ); } } if (do_scale) { double scale = app.min_avg_pfc / hav.pfc.get_avg(); pfc = raw_pfc * scale; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] anon platform, scaling by %g (%.2f/%.2f)\n", r.id, scale, app.min_avg_pfc, hav.pfc.get_avg() ); } } else { pfc = wu_estimated_pfc(wu, app); if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] not scaling, using app avg %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] anon platform, returning %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } } else { avp = av_lookup(r.app_version_id, app_versions); if (!avp) { log_messages.printf(MSG_CRITICAL, "get_pfc() [RESULT#%d]: No AVP %d!!\n", r.id, r.app_version_id ); return ERR_NOT_FOUND; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] [AV#%d] normal case. %.0f sec, %.1f GFLOPS. 
raw credit: %.2f\n", r.id, avp->id, r.elapsed_time, r.flops_estimate/1e9, raw_pfc*COBBLESTONE_SCALE ); } bool do_scale = true; double host_scale = 0; if (app.host_scale_check && hav.consecutive_valid < CONS_VALID_HOST_SCALE ) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] not host scaling - cons valid %d\n", r.id, hav.consecutive_valid ); } } if (do_scale && (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()==0)) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] not host scaling - HAV PFC zero or too few samples %.0f\n", r.id, hav.pfc.n ); } } if (do_scale && avp->pfc.n < MIN_VERSION_SAMPLES) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] not host scaling - app_version PFC too few samples%.0f\n", r.id, avp->pfc.n ); } } if (do_scale && hav.pfc.get_avg() <= 0) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] not host scaling - HAV PFC is zero\n", r.id ); } } if (do_scale) { host_scale = avp->pfc.get_avg() / hav.pfc.get_avg(); if (host_scale > 10) host_scale = 10; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] host scale: %.2f (%f/%f)\n", r.id, host_scale, avp->pfc.get_avg(), hav.pfc.get_avg() ); } } pfc = raw_pfc; if (avp->pfc_scale) { pfc *= avp->pfc_scale; if (host_scale) { pfc *= host_scale; mode = PFC_MODE_NORMAL; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] applying app version scale %.3f\n", r.id, avp->pfc_scale ); } } else { if (host_scale) { pfc *= host_scale; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] no app version scale\n", r.id ); } } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] [AV#%d] PFC avgs with %g (%g/%g)\n", r.id, avp->id, raw_pfc/wu.rsc_fpops_est, raw_pfc, wu.rsc_fpops_est ); } double x = raw_pfc / wu.rsc_fpops_est; if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) { avp->pfc_samples.push_back(x); } } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] updating HAV PFC %.2f et %g turnaround %d\n", r.id, raw_pfc / wu.rsc_fpops_est, r.elapsed_time / wu.rsc_fpops_est, (r.received_time - r.sent_time) ); } double x = raw_pfc / wu.rsc_fpops_est; if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) { hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT); } if (!r.runtime_outlier) { hav.et.update_var( r.elapsed_time / wu.rsc_fpops_est, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT ); hav.turnaround.update_var( (r.received_time - r.sent_time), HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT ); } // keep track of credit per app version // if (avp) { avp->credit_samples.push_back(pfc*COBBLESTONE_SCALE); avp->credit_times.push_back(r.sent_time); } return 0; }
// Compute or estimate "claimed peak FLOP count" for a completed job. // Possibly update host_app_version records and write to DB. // Possibly update app_version records in memory and let caller write to DB, // to merge DB writes // int get_pfc( RESULT &r, WORKUNIT &wu, DB_APP &app, // in vector<DB_APP_VERSION_VAL>&app_versions, // in/out DB_HOST_APP_VERSION &hav, // in/out double &pfc, // out int &mode // out ){ DB_APP_VERSION_VAL *avp=0; mode = PFC_MODE_APPROX; if (r.runtime_outlier) { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] runtime outlier, not updating stats\n", r.id ); } } // is result from old scheduler that didn't set r.app_version_id? // if so, use WU estimate (this is a transient condition // while projects upgrade server software) // if (r.app_version_id == 0) { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] missing app_version_id (%ld): returning WU default %.2f\n", r.id, r.app_version_id, wu_estimated_credit(wu, app) ); } mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } // temporary kludge for SETI@home: // if GPU initialization fails the app falls back to CPU. // if (strstr(r.stderr_out, "Device Emulation (CPU)")) { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu][AV#%ld] CUDA app fell back to CPU; returning WU default %.2f\n", r.id, r.app_version_id, wu.rsc_fpops_est*COBBLESTONE_SCALE ); } mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } // transition case: there's no host_app_version record // if (!hav.host_id) { mode = PFC_MODE_WU_EST; pfc = wu_estimated_pfc(wu, app); return 0; } // old clients report CPU time but not elapsed time. // Use HOST_APP_VERSION.et to track statistics of CPU time. 
    //
    if (r.elapsed_time < 1e-6) {
        // in case buggy client reports elapsed time like 1e-304

        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%lu] old client (elapsed time not reported)\n",
                r.id
            );
        }
        if (!r.runtime_outlier) {
            hav.et.update_var(
                r.cpu_time/wu.rsc_fpops_est,
                HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
            );
            // if ((r.elapsed_time > 0) && (r.cpu_time > 0)) {
            //     hav.rt.update(r.elapsed_time, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
            //     hav.cpu.update(r.cpu_time, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
            // }
        }
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%lu] old client: raw credit %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
        bool do_scale = true;
        if (hav.et.n < MIN_HOST_SAMPLES || (hav.et.get_avg() <= 0)) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%lu] old client: no host scaling - zero or too few samples %f\n",
                    r.id, hav.et.n
                );
            }
        }
        if (do_scale
            && app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%lu] old client: no host scaling - cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale) {
            double s = r.cpu_time / (hav.et.get_avg()*wu.rsc_fpops_est);
            pfc *= s;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%lu] old client: scaling (based on CPU time) by %g, return %.2f\n",
                    r.id, s, pfc*COBBLESTONE_SCALE
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%lu] old client: returning PFC %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
        return 0;
    }

    // r.flops_estimate should be positive,
    // but (because of a scheduler bug) it may not be.
    // At this point we don't have much to go on, so use 1e10.
    //
    if (r.flops_estimate <= 0) {
        r.flops_estimate = 1e10;
    }

    double raw_pfc = (r.elapsed_time * r.flops_estimate);
    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%lu] raw credit: %.2f (%.2f sec, %.2f est GFLOPS)\n",
            r.id, raw_pfc*COBBLESTONE_SCALE, r.elapsed_time, r.flops_estimate/1e9
        );
    }

    // get app version
    //
    avp = av_lookup(r.app_version_id, app_versions);

    // Sanity check.
    // If an app version scale exists, use it; otherwise assume 1.
    //
    double tmp_scale = (avp && avp->pfc_scale) ? (avp->pfc_scale) : 1.0;

    if (raw_pfc*tmp_scale > wu.rsc_fpops_bound) {
        // This sanity check should be unnecessary because we have a maximum
        // credit grant limit.
        // With anonymous GPU apps the sanity check often fails
        // because anonymous GPU scales are often of order 0.01.
        // That prevents PFC averages from being updated.
        // So I've removed the return statement.
        //
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%lu] WARNING: sanity check failed: %.2f>%.2f, return %.2f\n",
                r.id, raw_pfc*tmp_scale*COBBLESTONE_SCALE,
                wu.rsc_fpops_bound*COBBLESTONE_SCALE, pfc*COBBLESTONE_SCALE
            );
        }
        // This was a bad idea because it prevents HAV.pfc from being updated:
        //
        // sprintf(query, "consecutive_valid=0");
        // sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid);
        // retval = hav.update_fields_noid(query, clause);
        // return retval;
    }
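    // At this point raw_pfc = elapsed_time * flops_estimate is the claimed
    // peak FLOP count. The branches below normalize it:
    //   - anonymous platform (app_version_id < 0): no app_version record
    //     exists, so scale by app.min_avg_pfc relative to this host's own
    //     PFC average, when that average is trustworthy;
    //   - project-supplied app versions: scale by the version's pfc_scale
    //     and, when the host's statistics are trustworthy, by a host scale.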
// sprintf(query, "consecutive_valid=0"); // sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid); // retval = hav.update_fields_noid(query, clause); // return retval; } if (r.app_version_id < 0) { // anon platform // bool do_scale = true; if (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()<=0) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] anon platform, not scaling, PFC avg zero or too few samples %.0f\n", r.id, hav.pfc.n ); } } if (do_scale && app.host_scale_check && hav.consecutive_valid < CONS_VALID_HOST_SCALE ) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] anon platform, not scaling, cons valid %d\n", r.id, hav.consecutive_valid ); } } if (do_scale) { double scale = app.min_avg_pfc / hav.pfc.get_avg(); pfc = raw_pfc * scale; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] anon platform, scaling by %g (%.2f/%.2f)\n", r.id, scale, app.min_avg_pfc, hav.pfc.get_avg() ); } } else { pfc = wu_estimated_pfc(wu, app); if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] not scaling, using app avg %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] anon platform, returning %.2f\n", r.id, pfc*COBBLESTONE_SCALE ); } } else { avp = av_lookup(r.app_version_id, app_versions); if (!avp) { log_messages.printf(MSG_CRITICAL, "get_pfc() [RESULT#%lu]: No AVP %ld!!\n", r.id, r.app_version_id ); return ERR_NOT_FOUND; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] [AV#%lu] normal case. %.0f sec, %.1f GFLOPS. raw credit: %.2f\n", r.id, avp->id, r.elapsed_time, r.flops_estimate/1e9, raw_pfc*COBBLESTONE_SCALE ); } bool do_scale = true; double host_scale = 0; if (app.host_scale_check && hav.consecutive_valid < CONS_VALID_HOST_SCALE ) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] not host scaling - cons valid %d\n", r.id, hav.consecutive_valid ); } } if (do_scale && (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()==0)) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] not host scaling - HAV PFC zero or too few samples %.0f\n", r.id, hav.pfc.n ); } } if (do_scale && avp->pfc.n < MIN_VERSION_SAMPLES) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] not host scaling - app_version PFC too few samples%.0f\n", r.id, avp->pfc.n ); } } if (do_scale && hav.pfc.get_avg() <= 0) { do_scale = false; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] not host scaling - HAV PFC is zero\n", r.id ); } } if (do_scale) { host_scale = avp->pfc.get_avg() / hav.pfc.get_avg(); if (host_scale > 10) { host_scale = 10; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] host scale: %.2f (%f/%f)\n", r.id, host_scale, avp->pfc.get_avg(), hav.pfc.get_avg() ); } } pfc = raw_pfc; if (avp->pfc_scale) { pfc *= avp->pfc_scale; if (host_scale) { pfc *= host_scale; mode = PFC_MODE_NORMAL; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] applying app version scale %.3f\n", r.id, avp->pfc_scale ); } } else { if (host_scale) { pfc *= host_scale; } if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%lu] no app version scale\n", r.id ); } } if (config.debug_credit) { 
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%lu] [AV#%lu] PFC avgs with %g (%g/%g)\n",
                r.id, avp->id, raw_pfc/wu.rsc_fpops_est, raw_pfc, wu.rsc_fpops_est
            );
        }
        double x = raw_pfc / wu.rsc_fpops_est;
        if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) {
            avp->pfc_samples.push_back(x);
        }
    }

    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%lu] updating HAV PFC %.2f et %g turnaround %d\n",
            r.id, raw_pfc / wu.rsc_fpops_est,
            r.elapsed_time / wu.rsc_fpops_est,
            (r.received_time - r.sent_time)
        );
    }

    if (!r.runtime_outlier) {
        double x = raw_pfc / wu.rsc_fpops_est;
        if (is_pfc_sane(x, wu, app)) {
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%lu] [HOST#%lu] before updating HAV PFC pfc.n=%f pfc.avg=%f\n",
                    r.id, hav.host_id, hav.pfc.n, hav.pfc.avg
                );
            }
            hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%lu] [HOST#%lu] after updating HAV PFC pfc.n=%f pfc.avg=%f\n",
                    r.id, hav.host_id, hav.pfc.n, hav.pfc.avg
                );
            }
        }
        hav.et.update_var(
            r.elapsed_time / wu.rsc_fpops_est,
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
        hav.turnaround.update_var(
            (r.received_time - r.sent_time),
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
    }

    // keep track of credit per app version
    //
    if (avp) {
        avp->credit_samples.push_back(pfc*COBBLESTONE_SCALE);
        avp->credit_times.push_back(r.sent_time);
    }

    return 0;
}
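// The hav.pfc/hav.et/hav.turnaround updates above use BOINC's AVERAGE
// machinery (lib/average.h). As a rough illustration of the idea -- NOT the
// real AVERAGE class, whose details differ -- the HAV_AVG_* parameters
// suggest a sample-capped average that switches from an arithmetic mean to
// an exponential moving average once enough samples accumulate:
//
#if 0
#include <algorithm>

struct ema_sketch {
    double n;       // number of samples seen
    double avg;     // running average

    ema_sketch(): n(0), avg(0) {}

    // thresh: samples before switching to EMA
    // weight: EMA weight given to each new sample
    // limit:  cap each sample at limit*avg so one bogus
    //         result can't swing the estimate
    void update(double sample, double thresh, double weight, double limit) {
        if (sample < 0) return;
        if (n > 0 && avg > 0) {
            sample = std::min(sample, avg*limit);   // clamp outliers
        }
        n++;
        if (n <= thresh) {
            avg += (sample - avg)/n;        // arithmetic mean phase
        } else {
            avg += weight*(sample - avg);   // exponential moving average
        }
    }
};
#endif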