Example #1
0
// Penalize the host_app_version that a bad result belongs to:
// lower its daily job limit by one
// (after capping it at the project's daily_result_quota, and never below 1)
// and reset its count of consecutive valid results.
// Does nothing (apart from optional logging) if no host_app_version is found.
//
static inline void got_bad_result(SCHED_RESULT_ITEM& sri) {
    int gavid = generalized_app_version_id(sri.app_version_id, sri.appid);
    DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
    if (!havp) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %d\n", gavid
            );
        }
        return;
    }

    // start from the current limit, capped at the project-wide quota
    int lowered = havp->max_jobs_per_day;
    if (config.daily_result_quota < lowered) {
        lowered = config.daily_result_quota;
    }

    // take one off, but keep at least one job per day
    lowered = (lowered > 1) ? lowered - 1 : 1;

    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] decreasing max_jobs_per_day for %d: %d->%d\n",
            gavid, havp->max_jobs_per_day, lowered
        );
    }

    havp->max_jobs_per_day = lowered;
    havp->consecutive_valid = 0;
}
Example #2
0
// Called for a result reported with outcome SUCCESS
// (which doesn't mean it will turn out to be valid).
// If the host_app_version's daily job limit is below the project's
// daily_result_quota, double it, capping at the quota.
//
static inline void got_good_result(SCHED_RESULT_ITEM& sri) {
    DB_ID_TYPE gavid = generalized_app_version_id(sri.app_version_id, sri.appid);
    DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
    if (!havp) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %ld\n", gavid
            );
        }
        return;
    }

    // already at (or above) the quota: nothing to raise
    //
    if (havp->max_jobs_per_day >= config.daily_result_quota) {
        return;
    }

    int raised = 2*havp->max_jobs_per_day;
    if (raised > config.daily_result_quota) {
        raised = config.daily_result_quota;
    }
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] increasing max_jobs_per_day for %ld: %d->%d\n",
            gavid, havp->max_jobs_per_day, raised
        );
    }
    havp->max_jobs_per_day = raised;
}
Example #3
0
// Return the host_app_version record for this app version
// (whatever gavid_to_havp() returns for its ID).
// If cavp is set, the ID is the generalized app version ID built from
// the resource type and app ID; otherwise it is the ID of avp.
//
DB_HOST_APP_VERSION* BEST_APP_VERSION::host_app_version() {
    if (!cavp) {
        return gavid_to_havp(avp->id);
    }
    return gavid_to_havp(
        generalized_app_version_id(host_usage.resource_type(), appid)
    );
}
Example #4
0
// called at start of send_work().
// Estimate FLOPS of anon platform versions,
// and compute scaling factor for wu.rsc_fpops
//
// For each client app version in the request:
// - rsc_fpops_scale is reset to 1
// - if no CPU or GPU usage was reported, 1 CPU is assumed
// - if projected_flops is missing, the host's p_fpops is used
// - if the matching host_app_version has more than MIN_HOST_SAMPLES
//   elapsed-time samples and a positive average,
//   projected_flops is replaced by 1/average and
//   rsc_fpops_scale is set to (old projected_flops)/(new projected_flops)
//
void estimate_flops_anon_platform() {
    unsigned int i;
    for (i=0; i<g_request->client_app_versions.size(); i++) {
        CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];

        cav.rsc_fpops_scale = 1;

        if (cav.host_usage.avg_ncpus == 0 && cav.host_usage.ncudas == 0 && cav.host_usage.natis == 0) {
            cav.host_usage.avg_ncpus = 1;
        }

        // current clients fill in host_usage.flops with peak FLOPS
        // if it's missing from app_info.xml;
        // however, for older clients, we need to fill it in ourselves;
        // assume it uses 1 CPU
        //
        if (cav.host_usage.projected_flops == 0) {
            cav.host_usage.projected_flops = g_reply->host.p_fpops;
        }

        // At this point host_usage.projected_flops is filled in with something.
        // See if we have a better estimated based on history.
        // Only look up history if the version is linked to an app;
        // cav.app is dereferenced for the lookup.
        //
        DB_HOST_APP_VERSION* havp = 0;
        if (cav.app) {
            havp = gavid_to_havp(
                generalized_app_version_id(
                    cav.host_usage.resource_type(), cav.app->id
                )
            );
        }

        // require a positive average: it's inverted below,
        // and a zero average would give infinite flops and a zero scale
        //
        if (havp
            && havp->et.n > MIN_HOST_SAMPLES
            && havp->et.get_avg() > 0
        ) {
            double new_flops = 1./havp->et.get_avg();
            cav.rsc_fpops_scale = cav.host_usage.projected_flops/new_flops;
            cav.host_usage.projected_flops = new_flops;
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] (%s) setting projected flops to %fG based on ET\n",
                    cav.plan_class, new_flops/1e9
                );
            }
        } else {
            if (config.debug_send) {
                // the message is in GFLOPS, so scale like the branch above
                log_messages.printf(MSG_NORMAL,
                    "[send] (%s) using client-supplied flops %fG\n",
                    cav.plan_class, cav.host_usage.projected_flops/1e9
                );
            }
        }
    }
}
// Compute or estimate "claimed peak FLOP count".
// Possibly update host_app_version records and write to DB.
// Possibly update app_version records in memory and let caller write to DB,
// to merge DB writes
//
// r: the result. r.flops_estimate is overwritten with 1e10
//      if it's not positive.
// wu, app: the result's workunit and app
// app_versions: searched with av_lookup() for r.app_version_id;
//      the entry found gets PFC and credit samples appended
// hav: the host_app_version for this result.
//      hav.host_id == 0 is treated as "no record": the WU estimate is used.
//      Unless the result is a runtime outlier, hav.et, hav.pfc and
//      hav.turnaround are updated via update()/update_var()
// pfc: the claimed PFC
// mode: PFC_MODE_APPROX by default;
//      PFC_MODE_WU_EST in the early returns for a missing app version ID,
//      a GPU app that fell back to CPU, and a missing host_app_version;
//      PFC_MODE_NORMAL if both app-version and host scaling were applied
//
// Returns 0 on success,
// ERR_NOT_FOUND if the app version isn't in app_versions,
// or the return value of hav.update_fields_noid() if the sanity check fails.
//
int get_pfc(
    RESULT& r, WORKUNIT& wu, DB_APP& app,       // in
    vector<DB_APP_VERSION>&app_versions,        // in/out
    DB_HOST_APP_VERSION& hav,                   // in/out
    double& pfc, int& mode                      // out
) {
    DB_APP_VERSION* avp=0;
    int retval;

    mode = PFC_MODE_APPROX;

    // only log here; r.runtime_outlier is checked again
    // at each place where statistics are updated
    //
    if (r.runtime_outlier && config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] runtime outlier, not updating stats\n",
            r.id
        );
    }

    // is result from old scheduler that didn't set r.app_version_id correctly?
    // if so, use WU estimate (this is a transient condition)
    //
    if (r.app_version_id == 0 || r.app_version_id == 1) {
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] missing app_version_id (%d): returning WU default %.2f\n",
                r.id, r.app_version_id, wu_estimated_credit(wu, app)
            );
        }
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    // temporary kludge for SETI@home:
    // if GPU initialization fails the app falls back to CPU.
    //
    if (strstr(r.stderr_out, "Device Emulation (CPU)")) {
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d][AV#%d] CUDA app fell back to CPU; returning WU default %.2f\n",
                r.id, r.app_version_id, wu.rsc_fpops_est*COBBLESTONE_SCALE
            );
        }
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    // generalized app version ID;
    // used below only in the DB update done when the sanity check fails
    //
    int gavid = generalized_app_version_id(r.app_version_id, r.appid);

    // transition case
    // (no host_app_version record: hav.host_id is zero)
    //
    if (!hav.host_id) {
        mode = PFC_MODE_WU_EST;
        pfc = wu_estimated_pfc(wu, app);
        return 0;
    }

    // old clients report CPU time but not elapsed time.
    // Use HOST_APP_VERSION.et to track statistics of CPU time.
    //
    if (r.elapsed_time < 1e-6) {
        // in case buggy client reports elapsed time like 1e-304

        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client (elapsed time not reported)\n",
                r.id
            );
        }
        if (!r.runtime_outlier) {
            hav.et.update_var(
                r.cpu_time/wu.rsc_fpops_est,
                HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
            );
        }
        // start from the WU estimate; possibly scale it below
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client: raw credit %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
        bool do_scale = true;
        if (hav.et.n < MIN_HOST_SAMPLES || (hav.et.get_avg() <= 0)) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: no host scaling - zero or too few samples %f\n",
                    r.id, hav.et.n
                );
            }
        }
        if (do_scale
            && app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: no host scaling - cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale) {
            // ratio of this job's CPU time per estimated FLOP
            // to the host_app_version's average
            //
            double s = r.cpu_time / (hav.et.get_avg()*wu.rsc_fpops_est);
            pfc *= s;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] old client: scaling (based on CPU time) by %g, return %.2f\n",
                    r.id, s, pfc*COBBLESTONE_SCALE
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] old client: returning PFC %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
        return 0;
    }

    // r.flops_estimate should be positive
    // but (because of scheduler bug) it may not be.
    // At this point we don't have much to go on, so use 1e10.
    //
    if (r.flops_estimate <= 0) {
        r.flops_estimate = 1e10;
    }

    double raw_pfc = (r.elapsed_time * r.flops_estimate);
    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] raw credit: %.2f (%.2f sec, %.2f est GFLOPS)\n",
            r.id, raw_pfc*COBBLESTONE_SCALE, r.elapsed_time,
            r.flops_estimate/1e9
        );
    }

    // Sanity check:
    // if raw PFC exceeds the WU's FLOP bound, return the WU estimate
    // and reset consecutive_valid in the DB for this host / app version
    //
    if (raw_pfc > wu.rsc_fpops_bound) {
        char query[256], clause[256];
        pfc = wu_estimated_pfc(wu, app);
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] sanity check failed: %.2f>%.2f, return %.2f\n",
                r.id, raw_pfc*COBBLESTONE_SCALE,
                wu.rsc_fpops_bound*COBBLESTONE_SCALE, pfc*COBBLESTONE_SCALE
            );
        }
        sprintf(query, "consecutive_valid=0");
        sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid);
        retval = hav.update_fields_noid(query, clause);
        return retval;
    }

    if (r.app_version_id < 0) {
        // anon platform
        //
        bool do_scale = true;
        if (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()<=0) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, not scaling, PFC avg zero or too few samples %.0f\n",
                    r.id, hav.pfc.n
                );
            }
        }
        if (do_scale
            && app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, not scaling, cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale) {
            double scale = app.min_avg_pfc / hav.pfc.get_avg();
            pfc = raw_pfc * scale;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] anon platform, scaling by %g (%.2f/%.2f)\n",
                    r.id, scale, app.min_avg_pfc, hav.pfc.get_avg()
                );
            }
        } else {
            pfc = wu_estimated_pfc(wu, app);
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not scaling, using app avg %.2f\n",
                    r.id, pfc*COBBLESTONE_SCALE
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] anon platform, returning %.2f\n",
                r.id, pfc*COBBLESTONE_SCALE
            );
        }
    } else {
        avp = av_lookup(r.app_version_id, app_versions);
        if (!avp) {
            log_messages.printf(MSG_CRITICAL,
                "get_pfc() [RESULT#%d]: No AVP %d!!\n", r.id, r.app_version_id
            );
            return ERR_NOT_FOUND;
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] [AV#%d] normal case. %.0f sec, %.1f GFLOPS.  raw credit: %.2f\n",
                r.id, avp->id, r.elapsed_time, r.flops_estimate/1e9,
                raw_pfc*COBBLESTONE_SCALE
            );
        }

        // host_scale stays 0 (meaning "don't host-scale")
        // unless all of the checks below pass
        //
        bool do_scale = true;
        double host_scale = 0;
        if (app.host_scale_check
            && hav.consecutive_valid < CONS_VALID_HOST_SCALE
        ) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - cons valid %d\n",
                    r.id, hav.consecutive_valid
                );
            }
        }
        if (do_scale && (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()==0)) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - HAV PFC zero or too few samples %.0f\n",
                    r.id, hav.pfc.n
                );
            }
        }
        if (do_scale && avp->pfc.n < MIN_VERSION_SAMPLES) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - app_version PFC too few samples%.0f\n",
                    r.id, avp->pfc.n
                );
            }
        }
        // the zero case was handled above, so this catches a negative average
        if (do_scale && hav.pfc.get_avg() <= 0) {
            do_scale = false;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] not host scaling - HAV PFC is zero\n",
                    r.id
                );
            }
        }
        if (do_scale) {
            // ratio of app version's average PFC to this host's; capped at 10
            host_scale = avp->pfc.get_avg() / hav.pfc.get_avg();
            if (host_scale > 10) host_scale = 10;
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] host scale: %.2f (%f/%f)\n",
                    r.id, host_scale, avp->pfc.get_avg(), hav.pfc.get_avg()
                );
            }
        }

        // apply app-version scale (if any) and host scale (if any).
        // mode becomes PFC_MODE_NORMAL only if both are applied
        //
        pfc = raw_pfc;
        if (avp->pfc_scale) {
            pfc *= avp->pfc_scale;
            if (host_scale) {
                pfc *= host_scale;
                mode = PFC_MODE_NORMAL;
            }
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] applying app version scale %.3f\n",
                    r.id, avp->pfc_scale
                );
            }
        } else {
            if (host_scale) {
                pfc *= host_scale;
            }
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] no app version scale\n",
                    r.id
                );
            }
        }
        if (config.debug_credit) {
            log_messages.printf(MSG_NORMAL,
                "[credit] [RESULT#%d] [AV#%d] PFC avgs with %g (%g/%g)\n",
                r.id, avp->id,
                raw_pfc/wu.rsc_fpops_est,
                raw_pfc, wu.rsc_fpops_est
            );
        }
        // queue a PFC sample (raw PFC per estimated FLOP) on the app version;
        // the averaging itself isn't done here
        //
        double x = raw_pfc / wu.rsc_fpops_est;
        if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) {
            avp->pfc_samples.push_back(x);
        }
    }

    if (config.debug_credit) {
        log_messages.printf(MSG_NORMAL,
            "[credit] [RESULT#%d] updating HAV PFC %.2f et %g turnaround %d\n",
            r.id, raw_pfc / wu.rsc_fpops_est,
            r.elapsed_time / wu.rsc_fpops_est,
            (r.received_time - r.sent_time)
        );
    }

    // update the host_app_version's PFC, elapsed time and turnaround
    // statistics (both the anon-platform and the normal case get here)
    //
    double x = raw_pfc / wu.rsc_fpops_est;
    if (!r.runtime_outlier && is_pfc_sane(x, wu, app)) {
        hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
    }
    if (!r.runtime_outlier) {
        hav.et.update_var(
            r.elapsed_time / wu.rsc_fpops_est,
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
        hav.turnaround.update_var(
            (r.received_time - r.sent_time),
            HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
        );
    }

    // keep track of credit per app version
    // (avp is null in the anon platform case)
    //
    if (avp) {
        avp->credit_samples.push_back(pfc*COBBLESTONE_SCALE);
        avp->credit_times.push_back(r.sent_time);
    }

    return 0;
}
Example #6
0
// Estimate FLOPS of anonymous-platform app versions.
// Called at start of send_work().
//
// On entry, cav.host_usage.projected_flops is
//      the <flops> specified in app_info.xml or,
//      if not specified there, a conservative estimate
//      (CPU speed * (ncpus + 10*ngpus)).
//      Either way the client uses this value (call it x)
//      to estimate job runtime and runtime limit:
//          est runtime = wu.rsc_fpops_est/x
//          runtime limit = wu.rsc_fpops_bound/x
//      x may be way off from the actual speed,
//      so to get an accurate runtime estimate
//      wu.rsc_fpops_est has to be adjusted.
//
// On return:
// cav.host_usage.projected_flops is
//      an estimate of the actual FLOPS the app will get,
//      based on elapsed time history (if possible).
//      The scheduler uses this to estimate runtime.
// cav.rsc_fpops_scale is
//      the factor by which wu.rsc_fpops_est and wu.rsc_fpops_bound
//      will be scaled.
//
// Versions with no app (cav.app is null) are skipped entirely.
//
void estimate_flops_anon_platform() {
    for (unsigned int idx=0; idx<g_request->client_app_versions.size(); idx++) {
        CLIENT_APP_VERSION& cav = g_request->client_app_versions[idx];
        if (!cav.app) {
            continue;
        }

        cav.rsc_fpops_scale = 1;

        // a CPU version that reports no CPU usage: assume one CPU
        //
        if (cav.host_usage.proc_type == PROC_TYPE_CPU
            && cav.host_usage.avg_ncpus == 0
        ) {
            cav.host_usage.avg_ncpus = 1;
        }

        // No projected FLOPS supplied: make a wild guess.
        // (6.12+ clients always supply one, even if the user didn't.)
        //
        if (cav.host_usage.projected_flops == 0) {
            cav.host_usage.projected_flops = g_reply->host.p_fpops;
        }

        // Use elapsed-time history if there's enough of it
        //
        DB_HOST_APP_VERSION* havp = gavid_to_havp(
            generalized_app_version_id(
                cav.host_usage.resource_type(), cav.app->id
            )
        );
        bool have_history = havp
            && (havp->et.n > MIN_HOST_SAMPLES)
            && (havp->et.get_avg() > 0);

        if (!have_history) {
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] (%s) using client-supplied flops %fG\n",
                    cav.plan_class, cav.host_usage.projected_flops/1e9
                );
            }
            continue;
        }

        double et_flops = 1./havp->et.get_avg();

        // Don't let the estimate exceed ET_RATIO_LIMIT times the
        // projected value, in case we've had a bunch of short jobs recently
        //
        double flops_cap = ET_RATIO_LIMIT*cav.host_usage.projected_flops;
        if (et_flops > flops_cap) {
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] (%s) capping new_flops; %.1fG > %.0f*%.1fG\n",
                    cav.plan_class, et_flops/1e9,
                    ET_RATIO_LIMIT,
                    cav.host_usage.projected_flops/1e9
                );
            }
            et_flops = flops_cap;
        }

        cav.rsc_fpops_scale = cav.host_usage.projected_flops/et_flops;
        cav.host_usage.projected_flops = et_flops;
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] (%s) setting projected flops to %fG based on ET\n",
                cav.plan_class, et_flops/1e9
            );
            log_messages.printf(MSG_NORMAL,
                "[version] setting rsc_fpops_scale to %g\n",
                cav.rsc_fpops_scale
            );
        }
    }
}
Example #7
0
// Penalize the host_app_version of a result that timed out:
// - add the WU's delay bound as a sample to its turnaround statistics
// - lower its daily job limit by one (never below 1)
// - reset its count of consecutive valid results
// and write these fields to the DB.
// Lookup and update failures are logged; the return value is always 0.
//
static int result_timed_out(
    TRANSITIONER_ITEM res_item, TRANSITIONER_ITEM& wu_item
) {
    DB_HOST_APP_VERSION hav;

    int gavid = generalized_app_version_id(
        res_item.res_app_version_id, wu_item.appid
    );
    int retval = hav_lookup(hav, res_item.res_hostid, gavid);
    if (retval) {
        log_messages.printf(MSG_NORMAL,
            "result_timed_out(): hav_lookup failed: %s\n", boincerror(retval)
        );
        return 0;
    }

    hav.turnaround.update_var(
        static_cast<double>(wu_item.delay_bound),
        HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
    );

    // A limit of zero, or one above the project quota,
    // is replaced by the quota before decrementing
    //
    int lowered = hav.max_jobs_per_day;
    if (lowered == 0 || lowered > config.daily_result_quota) {
        lowered = config.daily_result_quota;
    }
    --lowered;
    if (lowered < 1) {
        lowered = 1;
    }
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] max_jobs_per_day for %d; %d->%d\n",
            gavid, hav.max_jobs_per_day, lowered
        );
    }
    hav.max_jobs_per_day = lowered;
    hav.consecutive_valid = 0;

    // write back only the fields changed above
    //
    char query[512];
    sprintf(query,
        "turnaround_n=%.15e, turnaround_avg=%.15e, turnaround_var=%.15e, turnaround_q=%.15e, max_jobs_per_day=%d, consecutive_valid=%d",
        hav.turnaround.n,
        hav.turnaround.avg,
        hav.turnaround.var,
        hav.turnaround.q,
        hav.max_jobs_per_day,
        hav.consecutive_valid
    );
    char clause[512];
    sprintf(clause,
        "host_id=%d and app_version_id=%d",
        hav.host_id, hav.app_version_id
    );
    retval = hav.update_fields_noid(query, clause);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "CRITICAL result_timed_out(): hav updated failed: %s\n",
            boincerror(retval)
        );
    }
    return 0;
}
Example #8
0
// handle a workunit which has new results
//
// As written, this always takes the "no canonical result yet" path:
// - collect the WU's successful results and their host_app_versions
// - if there are enough for a quorum, run check_set() on them,
//   assign credit if a canonical result was found,
//   and update results, hosts and host_app_versions
// - if a canonical result was found, mark the WU ready for assimilation
//   and mark its unsent results as not needed;
//   otherwise possibly flag an error or ask for more results
// - set the WU's transition time, clear need_validate, and update the WU
//
// items: the validator items for this WU; items[0].wu is the workunit,
//      so items must not be empty
//
// Returns 0 on success; otherwise the error returned by check_set(),
// get_credit_from_wu() or validator.update_workunit().
// A failure of assign_credit_set() is logged and delays the transition,
// but is not returned.
//
int handle_wu(
    DB_VALIDATOR_ITEM_SET& validator, std::vector<VALIDATOR_ITEM>& items
) {
    bool update_result, retry;
    TRANSITION_TIME transition_time = NO_CHANGE;
    int retval = 0, canonicalid = 0, x;
    double credit = 0;
    unsigned int i;

    WORKUNIT& wu = items[0].wu;
    g_wup = &wu;
    vector<RESULT> results;
    vector<DB_HOST_APP_VERSION> host_app_versions, host_app_versions_orig;
    int nsuccess_results;

    // Here if WU doesn't have a canonical result yet.
    // Try to get one

    log_messages.printf(MSG_NORMAL,
        "[WU#%d %s] handle_wu(): No canonical result yet\n",
        wu.id, wu.name
    );
    ++log_messages;

    // make a vector of the successful results,
    // and a parallel vector of host_app_versions
    // (plus a copy of the latter, used below to update only what changed)
    //
    for (i=0; i<items.size(); i++) {
        RESULT& result = items[i].res;

        if ((result.server_state == RESULT_SERVER_STATE_OVER) &&
            (result.outcome == RESULT_OUTCOME_SUCCESS)
        ) {
            results.push_back(result);
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                hav.host_id=0;   // flag that it's missing
            }
            host_app_versions.push_back(hav);
            host_app_versions_orig.push_back(hav);
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] Found %d successful results\n",
        wu.id, wu.name, (int)results.size()
    );
    if (results.size() >= (unsigned int)wu.min_quorum) {
        log_messages.printf(MSG_DEBUG,
            "[WU#%d %s] Enough for quorum, checking set.\n",
            wu.id, wu.name
        );

        double dummy;
        retval = check_set(
            results, wu, canonicalid, dummy, retry
        );
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%d %s] check_set returned %d, exiting\n",
                wu.id, wu.name, retval
            );
            // balance the ++log_messages above;
            // this return doesn't go through "leave"
            --log_messages;
            return retval;
        }
        if (retry) transition_time = DELAYED;

        if (credit_from_wu) {
            retval = get_credit_from_wu(wu, results, credit);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] get_credit_from_wu returned %d\n",
                    wu.id, wu.name, retval
                );
                // balance the ++log_messages above;
                // this return doesn't go through "leave"
                --log_messages;
                return retval;
            }
        }

        if (canonicalid) {
            retval = assign_credit_set(
                wu, results, app, app_versions, host_app_versions,
                max_granted_credit, credit
            );
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] assign_credit_set() returned %d\n",
                    wu.id, wu.name, retval
                );
                transition_time = DELAYED;
                goto leave;
            }
        }

        if (max_granted_credit && credit>max_granted_credit) {
            credit = max_granted_credit;
        }

        // scan results.
        // update as needed, and count the # of results
        // that are still outcome=SUCCESS
        // (some may have changed to VALIDATE_ERROR)
        //
        nsuccess_results = 0;
        for (i=0; i<results.size(); i++) {
            RESULT& result = results[i];
            DB_HOST_APP_VERSION& hav = host_app_versions[i];
            DB_HOST_APP_VERSION& hav_orig = host_app_versions_orig[i];

            update_result = false;
            bool update_host = false;
            if (result.outcome == RESULT_OUTCOME_VALIDATE_ERROR) {
                transition_time = IMMEDIATE;
                update_result = true;
            } else {
                nsuccess_results++;
            }

            // for valid and invalid results we need the host record;
            // keep a copy of its initial state for the update below
            //
            DB_HOST host;
            HOST host_initial;
            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
            case VALIDATE_STATE_INVALID:
                retval = host.lookup_id(result.hostid);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d] lookup of host %d failed %d\n",
                        result.id, result.hostid, retval
                    );
                    continue;
                }
                host_initial = host;
            }

            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
                update_result = true;
                update_host = true;
                retval = is_valid(host, result, wu, host_app_versions[i]);
                if (retval) {
                    log_messages.printf(MSG_DEBUG,
                        "[RESULT#%d %s] is_valid() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
                grant_credit(
                    host, result.sent_time, result.cpu_time,
                    result.granted_credit
                );
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Valid; granted %f credit [HOST#%d]\n",
                    result.id, result.name, result.granted_credit,
                    result.hostid
                );
                break;
            case VALIDATE_STATE_INVALID:
                update_result = true;
                update_host = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Invalid [HOST#%d]\n",
                    result.id, result.name, result.hostid
                );
                is_invalid(host_app_versions[i]);
                break;
            case VALIDATE_STATE_INIT:
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%d %s] Inconclusive [HOST#%d]\n",
                    result.id, result.name, result.hostid
                );
                result.validate_state = VALIDATE_STATE_INCONCLUSIVE;
                update_result = true;
                break;
            }

            // Write back whatever changed.
            // Failures are logged but don't stop the scan.
            //
            if (hav.host_id) {
                retval = hav.update_validator(hav_orig);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] hav.update_validator() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
            if (update_host) {
                retval = host.update_diff_validator(host_initial);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] host.update_diff_validator() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
            if (update_result) {
                retval = validator.update_result(result);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] result.update() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
        }

        if (canonicalid) {
            // if we found a canonical result,
            // trigger the assimilator, but do NOT trigger
            // the transitioner - doing so creates a race condition
            //
            transition_time = NEVER;
            log_messages.printf(MSG_DEBUG,
                "[WU#%d %s] Found a canonical result: id=%d\n",
                wu.id, wu.name, canonicalid
            );
            wu.canonical_resultid = canonicalid;
            wu.canonical_credit = credit;
            wu.assimilate_state = ASSIMILATE_READY;

            // don't need to send any more results
            //
            for (i=0; i<items.size(); i++) {
                RESULT& result = items[i].res;

                if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
                    continue;
                }

                result.server_state = RESULT_SERVER_STATE_OVER;
                result.outcome = RESULT_OUTCOME_DIDNT_NEED;
                retval = validator.update_result(result);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d %s] result.update() failed: %d\n",
                        result.id, result.name, retval
                    );
                }
            }
        } else {
            // here if no consensus.

            // check if #success results is too large
            //
            if (nsuccess_results > wu.max_success_results) {
                wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
                transition_time = IMMEDIATE;
            }

            // if #success results >= target_nresults,
            // we need more results, so bump target_nresults
            // NOTE: nsuccess_results should never be > target_nresults,
            // but accommodate that if it should happen
            //
            if (nsuccess_results >= wu.target_nresults) {
                wu.target_nresults = nsuccess_results+1;
                transition_time = IMMEDIATE;
            }
        }
    }

leave:
    --log_messages;

    switch (transition_time) {
    case IMMEDIATE:
        wu.transition_time = time(0);
        break;
    case DELAYED:
        // 6 hours from now, unless a transition is already due sooner
        x = time(0) + 6*3600;
        if (x < wu.transition_time) wu.transition_time = x;
        break;
    case NEVER:
        wu.transition_time = INT_MAX;
        break;
    case NO_CHANGE:
        break;
    }

    wu.need_validate = 0;

    retval = validator.update_workunit(wu);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] update_workunit() failed: %d; exiting\n",
            wu.id, wu.name, retval
        );
        return retval;
    }
    return 0;
}
Example #9
0
// handle a workunit which has new results
//
// validator: DB item set used to write back results and the workunit
// items: the rows for this workunit; items[0].wu is the workunit and
//      items[i].res are its results
//
// Two cases:
// 1) the WU already has a canonical result:
//    compare each unchecked successful result against it (check_pair())
//    and mark it valid or invalid.
// 2) the WU has no canonical result yet:
//    collect the viable results and, if there are at least min_quorum,
//    try to find a canonical one (check_set()).
//
// In both cases credit, host and host_app_version records are updated,
// and finally the workunit itself (transition_time, need_validate, ...).
// If dry_run is set, nothing is written to the DB.
//
// Returns 0 on success (and also when the canonical result is missing
// from items); otherwise the error returned by check_set()
// or update_workunit().
//
int handle_wu(
    DB_VALIDATOR_ITEM_SET& validator, std::vector<VALIDATOR_ITEM>& items
) {
    int canonical_result_index = -1;
    bool update_result, retry;
    TRANSITION_TIME transition_time = NO_CHANGE;
    int retval = 0, x;
    DB_ID_TYPE canonicalid = 0;
    double credit = 0;
    unsigned int i;

    WORKUNIT& wu = items[0].wu;
    g_wup = &wu;

    if (wu.canonical_resultid) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%lu %s] Already has canonical result %lu\n",
            wu.id, wu.name, wu.canonical_resultid
        );
        ++log_messages;

        // Here if WU already has a canonical result.
        // Get unchecked results and see if they match the canonical result
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;

            if (result.id == wu.canonical_resultid) {
                canonical_result_index = i;
            }
        }
        if (canonical_result_index == -1) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu %s] Can't find canonical result %lu\n",
                wu.id, wu.name, wu.canonical_resultid
            );
            // undo the ++log_messages above;
            // all other exit paths do this at "leave:"
            //
            --log_messages;
            return 0;
        }

        RESULT& canonical_result = items[canonical_result_index].res;

        // scan this WU's results, and check the unchecked ones
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;

            if (result.server_state != RESULT_SERVER_STATE_OVER) continue;
            if (result.outcome !=  RESULT_OUTCOME_SUCCESS) continue;
            switch (result.validate_state) {
            case VALIDATE_STATE_INIT:
            case VALIDATE_STATE_INCONCLUSIVE:
                break;
            default:
                continue;
            }
            log_messages.printf(MSG_NORMAL,
                "[WU#%lu] handle_wu(): testing result %lu\n",
                wu.id, result.id
            );

            check_pair(result, canonical_result, retry);
            if (retry) {
                // this usually means an NFS mount has failed;
                // arrange to try again later.
                //
                transition_time = DELAYED;
                goto leave;
            }
            update_result = false;

            if (result.outcome == RESULT_OUTCOME_VALIDATE_ERROR) {
                update_result = true;
            }

            // this might be last result, so let transitioner
            // trigger file delete etc. if needed
            //
            transition_time = IMMEDIATE;

            DB_HOST host;
            retval = host.lookup_id(result.hostid);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%lu] lookup of host %lu failed: %s\n",
                    result.id, result.hostid, boincerror(retval)
                );
                continue;
            }
            // keep a copy so that only the changes are written back
            //
            HOST host_initial = host;

            bool update_hav = false;
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%lu %s] hav_lookup returned %d\n",
                    result.id, result.name, retval
                );
                hav.host_id = 0;    // flag that it's missing
            }
            DB_HOST_APP_VERSION hav_orig = hav;
            // is_valid(), is_invalid() and assign_credit_set() operate on
            // havv[0]; hav_orig keeps the values as looked up
            //
            vector<DB_HOST_APP_VERSION> havv;
            havv.push_back(hav);

            vector<RESULT> rv;
            switch (result.validate_state) {
            case VALIDATE_STATE_VALID:
                update_result = true;
                update_hav = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] pair_check() matched: setting result to valid\n",
                    result.id, result.name
                );
                retval = is_valid(host, result, wu, havv[0]);
                if (retval) {
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] is_valid() error: %s\n",
                        result.id, result.name, boincerror(retval)
                    );
                }
                // do credit computation, but grant credit of canonical result
                //
                rv.push_back(result);
                assign_credit_set(
                    wu, rv, app, app_versions, havv,
                    max_granted_credit, credit
                );
                if (!no_credit) {
                    result.granted_credit = canonical_result.granted_credit;
                    grant_credit(host, result.sent_time, result.granted_credit);
                    if (config.credit_by_app) {
                        grant_credit_by_app(result, result.granted_credit);
                    }
                }
                break;
            case VALIDATE_STATE_INVALID:
                update_result = true;
                update_hav = true;
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] pair_check() didn't match: setting result to invalid\n",
                    result.id, result.name
                );
                is_invalid(havv[0]);
            }
            if (hav.host_id && update_hav) {
                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    log_messages.printf(MSG_NORMAL,
                        "[HOST#%lu AV#%lu] [outlier=%d] Updating HAV in DB.  pfc.n=%f->%f\n",
                        havv[0].host_id, havv[0].app_version_id,
                        result.runtime_outlier, hav_orig.pfc.n, havv[0].pfc.n
                    );
                    retval=havv[0].update_validator(hav_orig);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[HOST#%lu AV%lu] hav.update_validator() failed: %s\n",
                            hav.host_id, hav.app_version_id, boincerror(retval)
                        );
                    }
                }
            }
            // write host changes, but not in a dry run
            // (same policy as the no-canonical-result case below)
            //
            if (!dry_run) {
                retval = host.update_diff_validator(host_initial);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[HOST#%lu] host.update_diff_validator() failed: %s\n",
                        host.id, boincerror(retval)
                    );
                }
            }
            if (update_result) {
                log_messages.printf(MSG_NORMAL,
                    "[RESULT#%lu %s] granted_credit %f\n",
                    result.id, result.name, result.granted_credit
                );
                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    retval = validator.update_result(result);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[RESULT#%lu %s] Can't update result: %s\n",
                            result.id, result.name, boincerror(retval)
                        );
                    }
                }
            }
        }
    } else {
        // Here if WU doesn't have a canonical result yet.
        // Try to get one

        vector<RESULT> viable_results;
        vector<DB_HOST_APP_VERSION> host_app_versions, host_app_versions_orig;

        log_messages.printf(MSG_NORMAL,
            "[WU#%lu %s] handle_wu(): No canonical result yet\n",
            wu.id, wu.name
        );
        ++log_messages;

        // make a vector of the "viable" (i.e. possibly canonical) results,
        // and a parallel vector of host_app_versions
        //
        for (i=0; i<items.size(); i++) {
            RESULT& result = items[i].res;

            if (result.server_state != RESULT_SERVER_STATE_OVER) continue;
            if (result.outcome != RESULT_OUTCOME_SUCCESS) continue;
            if (result.validate_state == VALIDATE_STATE_INVALID) continue;

            viable_results.push_back(result);
            DB_HOST_APP_VERSION hav;
            retval = hav_lookup(hav, result.hostid,
                generalized_app_version_id(result.app_version_id, result.appid)
            );
            if (retval) {
                hav.host_id=0;   // flag that it's missing
            }
            host_app_versions.push_back(hav);
            host_app_versions_orig.push_back(hav);
        }

        log_messages.printf(MSG_DEBUG,
            "[WU#%lu %s] Found %d viable results\n",
            wu.id, wu.name, (int)viable_results.size()
        );
        if (viable_results.size() >= (unsigned int)wu.min_quorum) {
            log_messages.printf(MSG_DEBUG,
                "[WU#%lu %s] Enough for quorum, checking set.\n",
                wu.id, wu.name
            );

            double dummy;
            retval = check_set(viable_results, wu, canonicalid, dummy, retry);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%lu %s] check_set() error: %s\n",
                    wu.id, wu.name, boincerror(retval)
                );
                // undo the ++log_messages above;
                // all other exit paths do this at "leave:"
                //
                --log_messages;
                return retval;
            }
            if (retry) transition_time = DELAYED;

            // if we found a canonical instance, decide on credit
            //
            if (canonicalid) {
                // always do the credit calculation, to update statistics,
                // even if we're granting credit a different way
                //
                retval = assign_credit_set(
                    wu, viable_results, app, app_versions, host_app_versions,
                    max_granted_credit, credit
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%lu %s] assign_credit_set(): %s\n",
                        wu.id, wu.name, boincerror(retval)
                    );
                    transition_time = DELAYED;
                    goto leave;
                }

                if (credit_from_wu) {
                    retval = get_credit_from_wu(wu, viable_results, credit);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%lu %s] get_credit_from_wu(): credit not specified in WU\n",
                            wu.id, wu.name
                        );
                        credit = 0;
                    }
                } else if (credit_from_runtime) {
                    // credit is based on the canonical result's
                    // elapsed time and FLOPS estimate
                    //
                    credit = 0;
                    for (i=0; i<viable_results.size(); i++) {
                        RESULT& result = viable_results[i];
                        if (result.id == canonicalid) {
                            DB_HOST host;
                            retval = host.lookup_id(result.hostid);
                            if (retval) {
                                log_messages.printf(MSG_CRITICAL,
                                    "[WU#%lu %s] host %lu lookup failed\n",
                                    wu.id, wu.name, result.hostid
                                );
                                break;
                            }
                            // out-of-range runtimes are replaced by max_runtime
                            //
                            double runtime = result.elapsed_time;
                            if (runtime <=0 || runtime > max_runtime) {
                                runtime = max_runtime;
                            }
                            credit = result.flops_estimate * runtime * COBBLESTONE_SCALE;
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%lu][RESULT#%lu] credit_from_runtime %.2f = %.0fs * %.2fGFLOPS\n",
                                wu.id, result.id,
                                credit, runtime, result.flops_estimate/1e9
                            );
                            break;
                        }
                    }
                } else if (no_credit) {
                    credit = 0;
                }
                // a max_granted_credit of zero means no cap
                //
                if (max_granted_credit && credit>max_granted_credit) {
                    credit = max_granted_credit;
                }
            }

            // scan the viable results.
            // update as needed,
            // and count the # of results that are still viable
            // (some may now have outcome VALIDATE_ERROR,
            // or validate_state INVALID)
            //
            int n_viable_results = 0;
            for (i=0; i<viable_results.size(); i++) {
                RESULT& result = viable_results[i];
                DB_HOST_APP_VERSION& hav = host_app_versions[i];
                DB_HOST_APP_VERSION& hav_orig = host_app_versions_orig[i];

                update_result = false;
                bool update_host = false;

                if (result.outcome != RESULT_OUTCOME_SUCCESS
                    || result.validate_state == VALIDATE_STATE_INVALID
                ) {
                    transition_time = IMMEDIATE;
                    update_result = true;
                } else {
                    n_viable_results++;
                }

                // the host record is needed only for valid/invalid results
                //
                DB_HOST host;
                HOST host_initial;
                switch (result.validate_state) {
                case VALIDATE_STATE_VALID:
                case VALIDATE_STATE_INVALID:
                    retval = host.lookup_id(result.hostid);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[RESULT#%lu] lookup of host %lu: %s\n",
                            result.id, result.hostid, boincerror(retval)
                        );
                        continue;
                    }
                    host_initial = host;
                }

                switch (result.validate_state) {
                case VALIDATE_STATE_VALID:
                    update_result = true;
                    update_host = true;
                    retval = is_valid(host, result, wu, host_app_versions[i]);
                    if (retval) {
                        log_messages.printf(MSG_DEBUG,
                            "[RESULT#%lu %s] is_valid() failed: %s\n",
                            result.id, result.name, boincerror(retval)
                        );
                    }
                    if (!no_credit) {
                        result.granted_credit = credit;
                        grant_credit(host, result.sent_time, credit);
                        log_messages.printf(MSG_NORMAL,
                            "[RESULT#%lu %s] Valid; granted %f credit [HOST#%lu]\n",
                            result.id, result.name, result.granted_credit,
                            result.hostid
                        );
                        if (config.credit_by_app) {
                            grant_credit_by_app(result, credit);
                        }
                    }
                    break;
                case VALIDATE_STATE_INVALID:
                    update_result = true;
                    update_host = true;
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] Invalid [HOST#%lu]\n",
                        result.id, result.name, result.hostid
                    );
                    is_invalid(host_app_versions[i]);
                    break;
                case VALIDATE_STATE_INIT:
                    log_messages.printf(MSG_NORMAL,
                        "[RESULT#%lu %s] Inconclusive [HOST#%lu]\n",
                        result.id, result.name, result.hostid
                    );
                    result.validate_state = VALIDATE_STATE_INCONCLUSIVE;
                    update_result = true;
                    break;
                }

                if (dry_run) {
                    log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                } else {
                    if (hav.host_id) {
                        log_messages.printf(MSG_NORMAL,
                            "[HOST#%lu AV#%lu] [outlier=%d] Updating HAV in DB.  pfc.n=%f->%f\n",
                            hav.host_id, hav.app_version_id,
                            result.runtime_outlier, hav_orig.pfc.n, hav.pfc.n
                        );
                        retval = hav.update_validator(hav_orig);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[HOST#%lu AV%lu] hav.update_validator() failed: %s\n",
                                hav.host_id, hav.app_version_id, boincerror(retval)
                            );
                        }
                    }
                    if (update_host) {
                        retval = host.update_diff_validator(host_initial);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[HOST#%lu] host.update_diff_validator() failed: %s\n",
                                host.id, boincerror(retval)
                            );
                        }
                    }
                    if (update_result) {
                        retval = validator.update_result(result);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[RESULT#%lu %s] result.update() failed: %s\n",
                                result.id, result.name, boincerror(retval)
                            );
                        }
                    }
                }
            }

            if (canonicalid) {
                // if we found a canonical result,
                // trigger the assimilator, but do NOT trigger
                // the transitioner - doing so creates a race condition
                //
                transition_time = NEVER;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%lu %s] Found a canonical result: id=%lu\n",
                    wu.id, wu.name, canonicalid
                );
                wu.canonical_resultid = canonicalid;
                wu.canonical_credit = credit;
                wu.assimilate_state = ASSIMILATE_READY;

                // don't need to send any more results
                //
                for (i=0; i<items.size(); i++) {
                    RESULT& result = items[i].res;

                    if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
                        continue;
                    }

                    result.server_state = RESULT_SERVER_STATE_OVER;
                    result.outcome = RESULT_OUTCOME_DIDNT_NEED;
                    if (dry_run) {
                        log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
                    } else {
                        retval = validator.update_result(result);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[RESULT#%lu %s] result.update() failed: %s\n",
                                result.id, result.name, boincerror(retval)
                            );
                        }
                    }
                }
            } else {
                // here if no consensus.

                // check if #viable results is too large
                //
                if (n_viable_results > wu.max_success_results) {
                    wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
                    transition_time = IMMEDIATE;
                }

                // if #viable results >= target_nresults,
                // we need more results, so bump target_nresults
                // NOTE: n_viable_results should never be > target_nresults,
                // but accommodate that if it should happen
                //
                if (n_viable_results >= wu.target_nresults) {
                    wu.target_nresults = n_viable_results+1;
                    transition_time = IMMEDIATE;
                }
            }
        }
    }

leave:
    --log_messages;

    // translate the decision made above into wu.transition_time
    //
    switch (transition_time) {
    case IMMEDIATE:
        wu.transition_time = time(0);
        break;
    case DELAYED:
        // retry in 6 hours, unless a transition is already due sooner
        //
        x = time(0) + 6*3600;
        if (x < wu.transition_time) wu.transition_time = x;
        break;
    case NEVER:
        wu.transition_time = INT_MAX;
        break;
    case NO_CHANGE:
        break;
    }

    wu.need_validate = 0;
    
    if (dry_run) {
        log_messages.printf(MSG_NORMAL, "DB not updated (dry run)\n");
    } else {
        retval = validator.update_workunit(wu);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%lu %s] update_workunit() failed: %s\n",
                wu.id, wu.name, boincerror(retval)
            );
            return retval;
        }
    }
    return 0;
}