Beispiel #1
0
// return the app version with greatest projected FLOPS
// for the given job and host, or NULL if none is available
//
// NOTE: the BEST_APP_VERSION structure returned by this
// must not be modified or reused;
// a pointer to it is stored in APP_VERSION.
//
// check_req: if set, return only app versions that use resources
//  for which the work request is nonzero.
//  This check is not done for:
//    - assigned jobs
//    - resent jobs
// reliable_only: use only versions for which this host is "reliable"
//
// We "memoize" the results, maintaining an array g_wreq->best_app_versions
// that maps app ID to the best app version (or NULL).
//
BEST_APP_VERSION* get_app_version(
    WORKUNIT& wu, bool check_req, bool reliable_only
) {
    unsigned int i;
    int j;
    BEST_APP_VERSION* bavp;
    char buf[256];
    bool job_needs_64b = (wu.rsc_memory_bound > max_32b_address_space());

    if (config.debug_version_select) {
        log_messages.printf(MSG_NORMAL,
            "[version] get_app_version(): getting app version for WU#%lu (%s) appid:%lu\n",
            wu.id, wu.name, wu.appid
        );
        if (job_needs_64b) {
            log_messages.printf(MSG_NORMAL,
                "[version] job needs 64-bit app version: mem bnd %f\n",
                wu.rsc_memory_bound
            );
        }
    }

    APP* app = ssp->lookup_app(wu.appid);
    if (!app) {
        log_messages.printf(MSG_CRITICAL,
            "WU refers to nonexistent app: %lu\n", wu.appid
        );
        return NULL;
    }

    // if the app uses homogeneous app version,
    // don't send to anonymous platform client.
    // Then check if the WU is already committed to an app version
    //
    if (app->homogeneous_app_version) {
        if (g_wreq->anonymous_platform) {
            return NULL;
        }
        if ( wu.app_version_id) {
            return check_homogeneous_app_version(wu, reliable_only);
        }
    }

    // see if app is already in memoized array
    //
    std::vector<BEST_APP_VERSION*>::iterator bavi;
    bavi = g_wreq->best_app_versions.begin();
    while (bavi != g_wreq->best_app_versions.end()) {
        bavp = *bavi;
        if (bavp->appid == wu.appid && (job_needs_64b == bavp->for_64b_jobs)) {
            if (!bavp->present) {
#if 0
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] returning cached NULL\n"
                    );
                }
#endif
                return NULL;
            }

            // if we're at the jobs-in-progress limit for this
            // app and resource type, fall through and find another version
            //
            if (config.max_jobs_in_progress.exceeded(
                app, bavp->host_usage.proc_type
            )) {
                if (config.debug_version_select) {
                    app_version_desc(*bavp, buf);
                    log_messages.printf(MSG_NORMAL,
                        "[version] %s: max jobs in progress exceeded\n", buf
                    );
                }
                g_wreq->best_app_versions.erase(bavi);
                break;
            }

            // if we previously chose an app version but don't need more work
            // for that processor type, fall through and find another version
            //
            if (check_req && g_wreq->rsc_spec_request) {
                int pt = bavp->host_usage.proc_type;
                if (!g_wreq->need_proc_type(pt)) {
                    if (config.debug_version_select) {
                        log_messages.printf(MSG_NORMAL,
                            "[version] have %s version but no more %s work needed\n",
                            proc_type_name(pt),
                            proc_type_name(pt)
                        );
                    }
                    g_wreq->best_app_versions.erase(bavi);
                    break;
                }
            }

            if (config.debug_version_select) {
                app_version_desc(*bavp, buf);
                log_messages.printf(MSG_NORMAL,
                    "[version] returning cached version: %s\n", buf
                );
            }
            return bavp;
        }
        ++bavi;
    }

    // here if app was not in memoized array,
    // or we couldn't use the app version there.

    if (config.debug_version_select) {
        log_messages.printf(MSG_NORMAL,
            "[version] looking for version of %s\n",
            app->name
        );
    }

    bavp = new BEST_APP_VERSION;
    bavp->appid = wu.appid;
    bavp->for_64b_jobs = job_needs_64b;
    if (g_wreq->anonymous_platform) {
        CLIENT_APP_VERSION* cavp = get_app_version_anonymous(
            *app, job_needs_64b, reliable_only
        );
        if (!cavp) {
            bavp->present = false;
        } else {
            bavp->present = true;
            bavp->host_usage = cavp->host_usage;
            bavp->cavp = cavp;
            int gavid = host_usage_to_gavid(cavp->host_usage, *app);
            bavp->reliable = app_version_is_reliable(gavid);
            bavp->trusted = app_version_is_trusted(gavid);
            if (config.debug_version_select) {
                app_version_desc(*bavp, buf);
                log_messages.printf(MSG_NORMAL, "[version] using %s\n", buf);
            }
        }
        g_wreq->best_app_versions.push_back(bavp);
        if (!bavp->present) return NULL;
        return bavp;
    }

    // Go through the client's platforms,
    // and scan the app versions for each platform.
    // Pick the one with highest expected FLOPS
    //
    // if config.prefer_primary_platform is set:
    // stop scanning platforms once we find a feasible version 

    bavp->host_usage.projected_flops = 0;
    bavp->avp = NULL;
    for (i=0; i<g_request->platforms.list.size(); i++) {
        bool found_feasible_version = false;
        PLATFORM* p = g_request->platforms.list[i];
        if (job_needs_64b && !is_64b_platform(p->name)) {
            continue;
        }
        for (j=0; j<ssp->napp_versions; j++) {
            HOST_USAGE host_usage;
            APP_VERSION& av = ssp->app_versions[j];
            if (av.appid != wu.appid) continue;
            if (av.platformid != p->id) continue;
            if (av.beta) {
                if (!g_wreq->allow_beta_work) {
                    continue;
                }
            }

            if (strlen(av.plan_class)) {
                if (!app_plan(*g_request, av.plan_class, host_usage)) {
                    if (config.debug_version_select) {
                        log_messages.printf(MSG_NORMAL,
                            "[version] [AV#%lu] app_plan() returned false\n",
                            av.id
                        );
                    }
                    continue;
                }
                if (!g_request->client_cap_plan_class) {
                    if (!host_usage.is_sequential_app()) {
                        if (config.debug_version_select) {
                            log_messages.printf(MSG_NORMAL,
                                "[version] [AV#%lu] client %d lacks plan class capability\n",
                                av.id, g_request->core_client_version
                            );
                        }
                        continue;
                    }
                }
            } else {
                host_usage.sequential_app(g_reply->host.p_fpops);
            }

            // skip versions that go against resource prefs
            //
            int pt = host_usage.proc_type;
            if (g_wreq->dont_use_proc_type[pt]) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] Skipping %s version - user prefs say no %s\n",
                        av.id,
                        proc_type_name(pt),
                        proc_type_name(pt)
                    );
                }
                continue;
            }

            if (reliable_only && !app_version_is_reliable(av.id)) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] not reliable\n", av.id
                    );
                }
                continue;
            }

            if (daily_quota_exceeded(av.id, host_usage)) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] daily quota exceeded\n", av.id
                    );
                }
                continue;
            }

            // skip versions for which we're at the jobs-in-progress limit
            //
            if (config.max_jobs_in_progress.exceeded(app, host_usage.proc_type)) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] jobs in progress limit exceeded\n",
                        av.id
                    );
                    config.max_jobs_in_progress.print_log();
                }
                continue;
            }

            // skip versions for resources we don't need
            //
            if (check_req && !need_this_resource(host_usage, &av, NULL)) {
                continue;
            }

            // skip versions which require a newer core client
            //
            if (g_request->core_client_version < av.min_core_version) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] client version %d < min core version %d\n",
                        av.id, g_request->core_client_version, av.min_core_version
                    );
                }
                // Do not tell the user he needs to update the client
                // just because the client is too old for a particular app version
                // g_wreq->outdated_client = true;
                continue;
            }
            if (av.max_core_version && g_request->core_client_version > av.max_core_version) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%lu] client version %d > max core version %d\n",
                        av.id, g_request->core_client_version, av.max_core_version
                    );
                }
                continue;
            }

            // at this point we know the version is feasible,
            // so if config.prefer_primary_platform is set
            // we won't look any further.
            //
            found_feasible_version = true;

            // pick the fastest version.
            // Throw in a random factor in case the estimates are off.
            //
            DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id);
            double r = 1;
            long n = 1;
            if (havp) {
                // slowly move from raw calc to measured performance as number
                // of results increases
                //
                n = std::max((long)havp->pfc.n, (long)n);
                double old_projected_flops = host_usage.projected_flops;
                estimate_flops(host_usage, av);
                host_usage.projected_flops = (host_usage.projected_flops*(n-1) + old_projected_flops)/n;

                // special case for versions that don't work on a given host.
                // This is defined as:
                // 1. pfc.n is 0
                // 2. The max_jobs_per_day is 1
                // 3. Consecutive valid is 0.
                // In that case, heavily penalize this app_version most of the
                // time.
                //
                if ((havp->pfc.n==0) && (havp->max_jobs_per_day==1) && (havp->consecutive_valid==0)) {
                    if (drand() > 0.01) {
                        host_usage.projected_flops *= 0.01;
                        if (config.debug_version_select) {
                            log_messages.printf(MSG_NORMAL,
                                "[version] App version AV#%lu is failing on HOST#%lu\n",
                                havp->app_version_id, havp->host_id
                            );
                        }
                   }
                }
            }
            if (config.version_select_random_factor) {
                r += config.version_select_random_factor*rand_normal()/n;
                if (r <= .1) {
                    r = .1;
                }
            }
            if (config.debug_version_select && bavp && bavp->avp) {
                log_messages.printf(MSG_NORMAL,
                    "[version] Comparing AV#%lu (%.2f GFLOP) against AV#%lu (%.2f GFLOP)\n",
                    av.id, host_usage.projected_flops/1e+9,
                    bavp->avp->id, bavp->host_usage.projected_flops/1e+9
                );
            }
            if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) {
                if (config.debug_version_select && (host_usage.projected_flops <= bavp->host_usage.projected_flops)) {
                      log_messages.printf(MSG_NORMAL,
                          "[version] [AV#%lu] Random factor wins.  r=%f n=%ld\n",
                          av.id, r, n
                    );
                }
                host_usage.projected_flops*=r;
                bavp->host_usage = host_usage;
                bavp->avp = &av;
                bavp->reliable = app_version_is_reliable(av.id);
                bavp->trusted = app_version_is_trusted(av.id);
                if (config.debug_version_select) {
                      log_messages.printf(MSG_NORMAL,
                          "[version] Best app version is now AV%lu (%.2f GFLOP)\n",
                          bavp->avp->id, bavp->host_usage.projected_flops/1e+9
                    );
                }
            } else {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                            "[version] Not selected, AV#%lu r*%.2f GFLOP <= Best AV %.2f GFLOP (r=%f, n=%ld)\n",
                            av.id, host_usage.projected_flops/1e+9,
                            bavp->host_usage.projected_flops/1e+9, r, n
                    );
                }
            }
        }   // loop over app versions

        if (config.prefer_primary_platform && found_feasible_version) {
            break;
        }
    }   // loop over client platforms

    if (bavp->avp) {
        estimate_flops(bavp->host_usage, *bavp->avp);
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] Best version of app %s is [AV#%lu] (%.2f GFLOPS)\n",
                app->name, bavp->avp->id, bavp->host_usage.projected_flops/1e9
            );
        }
        bavp->present = true;
        g_wreq->best_app_versions.push_back(bavp);
    } else {
        // Here if there's no app version we can use.
        //
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] returning NULL; platforms:\n"
            );
            for (i=0; i<g_request->platforms.list.size(); i++) {
                PLATFORM* p = g_request->platforms.list[i];
                log_messages.printf(MSG_NORMAL,
                    "[version] %s\n",
                    p->name
                );
            }
        }
        g_wreq->best_app_versions.push_back(bavp);
        return NULL;
    }
    return bavp;
}
Beispiel #2
0
// return BEST_APP_VERSION for the given job and host, or NULL if none
//
// check_req: check whether we still need work for the resource
// This check is not done for:
//    - assigned jobs
//    - resent jobs
// reliable_only: use only versions for which this host is "reliable"
//
// We "memoize" the results, maintaining an array g_wreq->best_app_versions
// that maps app ID to the best app version (or NULL).
//
BEST_APP_VERSION* get_app_version(
    WORKUNIT& wu, bool check_req, bool reliable_only
) {
    unsigned int i;
    int j;
    BEST_APP_VERSION* bavp;
    char buf[256];
    bool job_needs_64b = (wu.rsc_memory_bound > max_32b_address_space());

    if (config.debug_version_select) {
        if (job_needs_64b) {
            log_messages.printf(MSG_NORMAL,
                "[version] job needs 64-bit app version: mem bnd %f\n",
                wu.rsc_memory_bound
            );
        }
    }

    APP* app = ssp->lookup_app(wu.appid);
    if (!app) {
        log_messages.printf(MSG_CRITICAL,
            "WU refers to nonexistent app: %d\n", wu.appid
        );
        return NULL;
    }

    // handle the case where we're using homogeneous app version
    // and the WU is already committed to an app version
    //
    if (app->homogeneous_app_version && wu.app_version_id) {
        return check_homogeneous_app_version(wu, reliable_only);
    }

    // see if app is already in memoized array
    //
    std::vector<BEST_APP_VERSION*>::iterator bavi;
    bavi = g_wreq->best_app_versions.begin();
    while (bavi != g_wreq->best_app_versions.end()) {
        bavp = *bavi;
        if (bavp->appid == wu.appid && (job_needs_64b == bavp->for_64b_jobs)) {
            if (!bavp->present) {
#if 0
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] returning cached NULL\n"
                    );
                }
#endif
                return NULL;
            }

            // if we're at the jobs-in-progress limit for this
            // app and resource type, fall through and find another version
            //
            if (config.max_jobs_in_progress.exceeded(
                app, bavp->host_usage.uses_gpu())
            ) {
                if (config.debug_version_select) {
                    app_version_desc(*bavp, buf);
                    log_messages.printf(MSG_NORMAL,
                        "[version] %s: max jobs in progress exceeded\n", buf
                    );
                }
                g_wreq->best_app_versions.erase(bavi);
                break;
            }

            // if we previously chose a CUDA app but don't need more CUDA work,
            // fall through and find another version
            //
            if (check_req
                && g_wreq->rsc_spec_request
                && bavp->host_usage.ncudas > 0
                && !g_wreq->need_cuda()
            ) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] have CUDA version but no more CUDA work needed\n"
                    );
                }
                g_wreq->best_app_versions.erase(bavi);
                break;
            }

            // same, ATI
            //
            if (check_req
                && g_wreq->rsc_spec_request
                && bavp->host_usage.natis > 0
                && !g_wreq->need_ati()
            ) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] have ATI version but no more ATI work needed\n"
                    );
                }
                g_wreq->best_app_versions.erase(bavi);
                break;
            }

            // same, CPU
            //
            if (check_req
                && g_wreq->rsc_spec_request
                && !bavp->host_usage.ncudas
                && !bavp->host_usage.natis
                && !g_wreq->need_cpu()
            ) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] have CPU version but no more CPU work needed\n"
                    );
                }
                g_wreq->best_app_versions.erase(bavi);
                break;
            }

            if (config.debug_version_select) {
                app_version_desc(*bavp, buf);
                log_messages.printf(MSG_NORMAL,
                    "[version] returning cached version: %s\n", buf
                );
            }
            return bavp;
        }
        bavi++;
    }

    // here if app was not in memoized array,
    // or we couldn't use the app version there.

    if (config.debug_version_select) {
        log_messages.printf(MSG_NORMAL,
            "[version] looking for version of %s\n",
            app->name
        );
    }

    bavp = new BEST_APP_VERSION;
    bavp->appid = wu.appid;
    bavp->for_64b_jobs = job_needs_64b;
    if (g_wreq->anonymous_platform) {
        CLIENT_APP_VERSION* cavp = get_app_version_anonymous(
            *app, job_needs_64b, reliable_only
        );
        if (!cavp) {
            bavp->present = false;
        } else {
            bavp->present = true;
            bavp->host_usage = cavp->host_usage;
            bavp->cavp = cavp;
            int gavid = host_usage_to_gavid(cavp->host_usage, *app);
            bavp->reliable = app_version_is_reliable(gavid);
            bavp->trusted = app_version_is_trusted(gavid);
            if (config.debug_version_select) {
                app_version_desc(*bavp, buf);
                log_messages.printf(MSG_NORMAL, "[version] using %s\n", buf);
            }
        }
        g_wreq->best_app_versions.push_back(bavp);
        if (!bavp->present) return NULL;
        return bavp;
    }

    // Go through the client's platforms,
    // and scan the app versions for each platform.
    // Pick the one with highest expected FLOPS
    //
    // if config.prefer_primary_platform is set:
    // stop scanning platforms once we find a feasible version 

    bavp->host_usage.projected_flops = 0;
    bavp->avp = NULL;
    for (i=0; i<g_request->platforms.list.size(); i++) {
        bool found_feasible_version = false;
        PLATFORM* p = g_request->platforms.list[i];
        if (job_needs_64b && !is_64b_platform(p->name)) {
            continue;
        }
        for (j=0; j<ssp->napp_versions; j++) {
            HOST_USAGE host_usage;
            APP_VERSION& av = ssp->app_versions[j];
            if (av.appid != wu.appid) continue;
            if (av.platformid != p->id) continue;

            if (g_request->core_client_version < av.min_core_version) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] client version %d < min core version %d\n",
                        av.id, g_request->core_client_version, av.min_core_version
                    );
                }
                g_wreq->outdated_client = true;
                continue;
            }
            if (strlen(av.plan_class)) {
                if (!app_plan(*g_request, av.plan_class, host_usage)) {
                    if (config.debug_version_select) {
                        log_messages.printf(MSG_NORMAL,
                            "[version] [AV#%d] app_plan() returned false\n",
                            av.id
                        );
                    }
                    continue;
                }
                if (!g_request->client_cap_plan_class) {
                    if (!host_usage.is_sequential_app()) {
                        if (config.debug_version_select) {
                            log_messages.printf(MSG_NORMAL,
                                "[version] [AV#%d] client %d lacks plan class capability\n",
                                av.id, g_request->core_client_version
                            );
                        }
                        continue;
                    }
                }
            } else {
                host_usage.sequential_app(g_reply->host.p_fpops);
            }

            // skip versions that go against resource prefs
            //
            if (host_usage.ncudas && g_wreq->no_cuda) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] Skipping CUDA version - user prefs say no CUDA\n",
                        av.id
                    );
                    g_wreq->no_cuda_prefs = true;
                }
                continue;
            }
            if (host_usage.natis && g_wreq->no_ati) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] Skipping ATI version - user prefs say no ATI\n",
                        av.id
                    );
                    g_wreq->no_ati_prefs = true;
                }
                continue;
            }
            if (!(host_usage.uses_gpu()) && g_wreq->no_cpu) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] Skipping CPU version - user prefs say no CPUs\n",
                        av.id
                    );
                    g_wreq->no_cpu_prefs = true;
                }
                continue;
            }

            if (reliable_only && !app_version_is_reliable(av.id)) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] not reliable\n", av.id
                    );
                }
                continue;
            }

            if (daily_quota_exceeded(av.id, host_usage)) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] daily quota exceeded\n", av.id
                    );
                }
                continue;
            }

            // skip versions for which we're at the jobs-in-progress limit
            //
            if (config.max_jobs_in_progress.exceeded(app, host_usage.uses_gpu())) {
                if (config.debug_version_select) {
                    log_messages.printf(MSG_NORMAL,
                        "[version] [AV#%d] jobs in progress limit exceeded\n",
                        av.id
                    );
                    config.max_jobs_in_progress.print_log();
                }
                continue;
            }

            // skip versions for resources we don't need
            //
            if (!need_this_resource(host_usage, &av, NULL)) {
                continue;
            }

            // at this point we know the version is feasible,
            // so if config.prefer_primary_platform is set
            // we won't look any further.
            //
            found_feasible_version = true;

            // pick the fastest version.
            // Throw in a random factor in case the estimates are off.
            //
            double r = 1 + .1*rand_normal();
            if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) {
                bavp->host_usage = host_usage;
                bavp->avp = &av;
                bavp->reliable = app_version_is_reliable(av.id);
                bavp->trusted = app_version_is_trusted(av.id);
            }
        }   // loop over app versions

        if (config.prefer_primary_platform && found_feasible_version) {
            break;
        }
    }   // loop over client platforms

    if (bavp->avp) {
        estimate_flops(bavp->host_usage, *bavp->avp);
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] Best version of app %s is [AV#%d] (%.2f GFLOPS)\n",
                app->name, bavp->avp->id, bavp->host_usage.projected_flops/1e9
            );
        }
        bavp->present = true;
        g_wreq->best_app_versions.push_back(bavp);
    } else {
        // Here if there's no app version we can use.
        //
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] returning NULL; platforms:\n"
            );
            for (i=0; i<g_request->platforms.list.size(); i++) {
                PLATFORM* p = g_request->platforms.list[i];
                log_messages.printf(MSG_NORMAL,
                    "[version] %s\n",
                    p->name
                );
            }
        }
        g_wreq->best_app_versions.push_back(bavp);
        return NULL;
    }
    return bavp;
}