// return the app version with greatest projected FLOPS // for the given job and host, or NULL if none is available // // NOTE: the BEST_APP_VERSION structure returned by this // must not be modified or reused; // a pointer to it is stored in APP_VERSION. // // check_req: if set, return only app versions that use resources // for which the work request is nonzero. // This check is not done for: // - assigned jobs // - resent jobs // reliable_only: use only versions for which this host is "reliable" // // We "memoize" the results, maintaining an array g_wreq->best_app_versions // that maps app ID to the best app version (or NULL). // BEST_APP_VERSION* get_app_version( WORKUNIT& wu, bool check_req, bool reliable_only ) { unsigned int i; int j; BEST_APP_VERSION* bavp; char buf[256]; bool job_needs_64b = (wu.rsc_memory_bound > max_32b_address_space()); if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] get_app_version(): getting app version for WU#%lu (%s) appid:%lu\n", wu.id, wu.name, wu.appid ); if (job_needs_64b) { log_messages.printf(MSG_NORMAL, "[version] job needs 64-bit app version: mem bnd %f\n", wu.rsc_memory_bound ); } } APP* app = ssp->lookup_app(wu.appid); if (!app) { log_messages.printf(MSG_CRITICAL, "WU refers to nonexistent app: %lu\n", wu.appid ); return NULL; } // if the app uses homogeneous app version, // don't send to anonymous platform client. // Then check if the WU is already committed to an app version // if (app->homogeneous_app_version) { if (g_wreq->anonymous_platform) { return NULL; } if ( wu.app_version_id) { return check_homogeneous_app_version(wu, reliable_only); } } // see if app is already in memoized array // std::vector<BEST_APP_VERSION*>::iterator bavi; bavi = g_wreq->best_app_versions.begin(); while (bavi != g_wreq->best_app_versions.end()) { bavp = *bavi; if (bavp->appid == wu.appid && (job_needs_64b == bavp->for_64b_jobs)) { if (!bavp->present) { #if 0 if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] returning cached NULL\n" ); } #endif return NULL; } // if we're at the jobs-in-progress limit for this // app and resource type, fall through and find another version // if (config.max_jobs_in_progress.exceeded( app, bavp->host_usage.proc_type )) { if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] %s: max jobs in progress exceeded\n", buf ); } g_wreq->best_app_versions.erase(bavi); break; } // if we previously chose an app version but don't need more work // for that processor type, fall through and find another version // if (check_req && g_wreq->rsc_spec_request) { int pt = bavp->host_usage.proc_type; if (!g_wreq->need_proc_type(pt)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] have %s version but no more %s work needed\n", proc_type_name(pt), proc_type_name(pt) ); } g_wreq->best_app_versions.erase(bavi); break; } } if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] returning cached version: %s\n", buf ); } return bavp; } ++bavi; } // here if app was not in memoized array, // or we couldn't use the app version there. if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] looking for version of %s\n", app->name ); } bavp = new BEST_APP_VERSION; bavp->appid = wu.appid; bavp->for_64b_jobs = job_needs_64b; if (g_wreq->anonymous_platform) { CLIENT_APP_VERSION* cavp = get_app_version_anonymous( *app, job_needs_64b, reliable_only ); if (!cavp) { bavp->present = false; } else { bavp->present = true; bavp->host_usage = cavp->host_usage; bavp->cavp = cavp; int gavid = host_usage_to_gavid(cavp->host_usage, *app); bavp->reliable = app_version_is_reliable(gavid); bavp->trusted = app_version_is_trusted(gavid); if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] using %s\n", buf); } } g_wreq->best_app_versions.push_back(bavp); if (!bavp->present) return NULL; return bavp; } // Go through the client's platforms, // and scan the app versions for each platform. // Pick the one with highest expected FLOPS // // if config.prefer_primary_platform is set: // stop scanning platforms once we find a feasible version bavp->host_usage.projected_flops = 0; bavp->avp = NULL; for (i=0; i<g_request->platforms.list.size(); i++) { bool found_feasible_version = false; PLATFORM* p = g_request->platforms.list[i]; if (job_needs_64b && !is_64b_platform(p->name)) { continue; } for (j=0; j<ssp->napp_versions; j++) { HOST_USAGE host_usage; APP_VERSION& av = ssp->app_versions[j]; if (av.appid != wu.appid) continue; if (av.platformid != p->id) continue; if (av.beta) { if (!g_wreq->allow_beta_work) { continue; } } if (strlen(av.plan_class)) { if (!app_plan(*g_request, av.plan_class, host_usage)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] app_plan() returned false\n", av.id ); } continue; } if (!g_request->client_cap_plan_class) { if (!host_usage.is_sequential_app()) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] client %d lacks plan class capability\n", av.id, g_request->core_client_version ); } continue; } } } else { host_usage.sequential_app(g_reply->host.p_fpops); } // skip versions that go against resource prefs // int pt = host_usage.proc_type; if (g_wreq->dont_use_proc_type[pt]) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] Skipping %s version - user prefs say no %s\n", av.id, proc_type_name(pt), proc_type_name(pt) ); } continue; } if (reliable_only && !app_version_is_reliable(av.id)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] not reliable\n", av.id ); } continue; } if (daily_quota_exceeded(av.id, host_usage)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] daily quota exceeded\n", av.id ); } continue; } // skip versions for which we're at the jobs-in-progress limit // if (config.max_jobs_in_progress.exceeded(app, host_usage.proc_type)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] jobs in progress limit exceeded\n", av.id ); config.max_jobs_in_progress.print_log(); } continue; } // skip versions for resources we don't need // if (check_req && !need_this_resource(host_usage, &av, NULL)) { continue; } // skip versions which require a newer core client // if (g_request->core_client_version < av.min_core_version) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] client version %d < min core version %d\n", av.id, g_request->core_client_version, av.min_core_version ); } // Do not tell the user he needs to update the client // just because the client is too old for a particular app version // g_wreq->outdated_client = true; continue; } if (av.max_core_version && g_request->core_client_version > av.max_core_version) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] client version %d > max core version %d\n", av.id, g_request->core_client_version, av.max_core_version ); } continue; } // at this point we know the version is feasible, // so if config.prefer_primary_platform is set // we won't look any further. // found_feasible_version = true; // pick the fastest version. // Throw in a random factor in case the estimates are off. // DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id); double r = 1; long n = 1; if (havp) { // slowly move from raw calc to measured performance as number // of results increases // n = std::max((long)havp->pfc.n, (long)n); double old_projected_flops = host_usage.projected_flops; estimate_flops(host_usage, av); host_usage.projected_flops = (host_usage.projected_flops*(n-1) + old_projected_flops)/n; // special case for versions that don't work on a given host. // This is defined as: // 1. pfc.n is 0 // 2. The max_jobs_per_day is 1 // 3. Consecutive valid is 0. // In that case, heavily penalize this app_version most of the // time. // if ((havp->pfc.n==0) && (havp->max_jobs_per_day==1) && (havp->consecutive_valid==0)) { if (drand() > 0.01) { host_usage.projected_flops *= 0.01; if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] App version AV#%lu is failing on HOST#%lu\n", havp->app_version_id, havp->host_id ); } } } } if (config.version_select_random_factor) { r += config.version_select_random_factor*rand_normal()/n; if (r <= .1) { r = .1; } } if (config.debug_version_select && bavp && bavp->avp) { log_messages.printf(MSG_NORMAL, "[version] Comparing AV#%lu (%.2f GFLOP) against AV#%lu (%.2f GFLOP)\n", av.id, host_usage.projected_flops/1e+9, bavp->avp->id, bavp->host_usage.projected_flops/1e+9 ); } if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) { if (config.debug_version_select && (host_usage.projected_flops <= bavp->host_usage.projected_flops)) { log_messages.printf(MSG_NORMAL, "[version] [AV#%lu] Random factor wins. r=%f n=%ld\n", av.id, r, n ); } host_usage.projected_flops*=r; bavp->host_usage = host_usage; bavp->avp = &av; bavp->reliable = app_version_is_reliable(av.id); bavp->trusted = app_version_is_trusted(av.id); if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] Best app version is now AV%lu (%.2f GFLOP)\n", bavp->avp->id, bavp->host_usage.projected_flops/1e+9 ); } } else { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] Not selected, AV#%lu r*%.2f GFLOP <= Best AV %.2f GFLOP (r=%f, n=%ld)\n", av.id, host_usage.projected_flops/1e+9, bavp->host_usage.projected_flops/1e+9, r, n ); } } } // loop over app versions if (config.prefer_primary_platform && found_feasible_version) { break; } } // loop over client platforms if (bavp->avp) { estimate_flops(bavp->host_usage, *bavp->avp); if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] Best version of app %s is [AV#%lu] (%.2f GFLOPS)\n", app->name, bavp->avp->id, bavp->host_usage.projected_flops/1e9 ); } bavp->present = true; g_wreq->best_app_versions.push_back(bavp); } else { // Here if there's no app version we can use. // if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] returning NULL; platforms:\n" ); for (i=0; i<g_request->platforms.list.size(); i++) { PLATFORM* p = g_request->platforms.list[i]; log_messages.printf(MSG_NORMAL, "[version] %s\n", p->name ); } } g_wreq->best_app_versions.push_back(bavp); return NULL; } return bavp; }
// return BEST_APP_VERSION for the given job and host, or NULL if none // // check_req: check whether we still need work for the resource // This check is not done for: // - assigned jobs // - resent jobs // reliable_only: use only versions for which this host is "reliable" // // We "memoize" the results, maintaining an array g_wreq->best_app_versions // that maps app ID to the best app version (or NULL). // BEST_APP_VERSION* get_app_version( WORKUNIT& wu, bool check_req, bool reliable_only ) { unsigned int i; int j; BEST_APP_VERSION* bavp; char buf[256]; bool job_needs_64b = (wu.rsc_memory_bound > max_32b_address_space()); if (config.debug_version_select) { if (job_needs_64b) { log_messages.printf(MSG_NORMAL, "[version] job needs 64-bit app version: mem bnd %f\n", wu.rsc_memory_bound ); } } APP* app = ssp->lookup_app(wu.appid); if (!app) { log_messages.printf(MSG_CRITICAL, "WU refers to nonexistent app: %d\n", wu.appid ); return NULL; } // handle the case where we're using homogeneous app version // and the WU is already committed to an app version // if (app->homogeneous_app_version && wu.app_version_id) { return check_homogeneous_app_version(wu, reliable_only); } // see if app is already in memoized array // std::vector<BEST_APP_VERSION*>::iterator bavi; bavi = g_wreq->best_app_versions.begin(); while (bavi != g_wreq->best_app_versions.end()) { bavp = *bavi; if (bavp->appid == wu.appid && (job_needs_64b == bavp->for_64b_jobs)) { if (!bavp->present) { #if 0 if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] returning cached NULL\n" ); } #endif return NULL; } // if we're at the jobs-in-progress limit for this // app and resource type, fall through and find another version // if (config.max_jobs_in_progress.exceeded( app, bavp->host_usage.uses_gpu()) ) { if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] %s: max jobs in progress exceeded\n", buf ); } g_wreq->best_app_versions.erase(bavi); break; } // if we previously chose a CUDA app but don't need more CUDA work, // fall through and find another version // if (check_req && g_wreq->rsc_spec_request && bavp->host_usage.ncudas > 0 && !g_wreq->need_cuda() ) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] have CUDA version but no more CUDA work needed\n" ); } g_wreq->best_app_versions.erase(bavi); break; } // same, ATI // if (check_req && g_wreq->rsc_spec_request && bavp->host_usage.natis > 0 && !g_wreq->need_ati() ) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] have ATI version but no more ATI work needed\n" ); } g_wreq->best_app_versions.erase(bavi); break; } // same, CPU // if (check_req && g_wreq->rsc_spec_request && !bavp->host_usage.ncudas && !bavp->host_usage.natis && !g_wreq->need_cpu() ) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] have CPU version but no more CPU work needed\n" ); } g_wreq->best_app_versions.erase(bavi); break; } if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] returning cached version: %s\n", buf ); } return bavp; } bavi++; } // here if app was not in memoized array, // or we couldn't use the app version there. if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] looking for version of %s\n", app->name ); } bavp = new BEST_APP_VERSION; bavp->appid = wu.appid; bavp->for_64b_jobs = job_needs_64b; if (g_wreq->anonymous_platform) { CLIENT_APP_VERSION* cavp = get_app_version_anonymous( *app, job_needs_64b, reliable_only ); if (!cavp) { bavp->present = false; } else { bavp->present = true; bavp->host_usage = cavp->host_usage; bavp->cavp = cavp; int gavid = host_usage_to_gavid(cavp->host_usage, *app); bavp->reliable = app_version_is_reliable(gavid); bavp->trusted = app_version_is_trusted(gavid); if (config.debug_version_select) { app_version_desc(*bavp, buf); log_messages.printf(MSG_NORMAL, "[version] using %s\n", buf); } } g_wreq->best_app_versions.push_back(bavp); if (!bavp->present) return NULL; return bavp; } // Go through the client's platforms, // and scan the app versions for each platform. // Pick the one with highest expected FLOPS // // if config.prefer_primary_platform is set: // stop scanning platforms once we find a feasible version bavp->host_usage.projected_flops = 0; bavp->avp = NULL; for (i=0; i<g_request->platforms.list.size(); i++) { bool found_feasible_version = false; PLATFORM* p = g_request->platforms.list[i]; if (job_needs_64b && !is_64b_platform(p->name)) { continue; } for (j=0; j<ssp->napp_versions; j++) { HOST_USAGE host_usage; APP_VERSION& av = ssp->app_versions[j]; if (av.appid != wu.appid) continue; if (av.platformid != p->id) continue; if (g_request->core_client_version < av.min_core_version) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] client version %d < min core version %d\n", av.id, g_request->core_client_version, av.min_core_version ); } g_wreq->outdated_client = true; continue; } if (strlen(av.plan_class)) { if (!app_plan(*g_request, av.plan_class, host_usage)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] app_plan() returned false\n", av.id ); } continue; } if (!g_request->client_cap_plan_class) { if (!host_usage.is_sequential_app()) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] client %d lacks plan class capability\n", av.id, g_request->core_client_version ); } continue; } } } else { host_usage.sequential_app(g_reply->host.p_fpops); } // skip versions that go against resource prefs // if (host_usage.ncudas && g_wreq->no_cuda) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] Skipping CUDA version - user prefs say no CUDA\n", av.id ); g_wreq->no_cuda_prefs = true; } continue; } if (host_usage.natis && g_wreq->no_ati) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] Skipping ATI version - user prefs say no ATI\n", av.id ); g_wreq->no_ati_prefs = true; } continue; } if (!(host_usage.uses_gpu()) && g_wreq->no_cpu) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] Skipping CPU version - user prefs say no CPUs\n", av.id ); g_wreq->no_cpu_prefs = true; } continue; } if (reliable_only && !app_version_is_reliable(av.id)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] not reliable\n", av.id ); } continue; } if (daily_quota_exceeded(av.id, host_usage)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] daily quota exceeded\n", av.id ); } continue; } // skip versions for which we're at the jobs-in-progress limit // if (config.max_jobs_in_progress.exceeded(app, host_usage.uses_gpu())) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] [AV#%d] jobs in progress limit exceeded\n", av.id ); config.max_jobs_in_progress.print_log(); } continue; } // skip versions for resources we don't need // if (!need_this_resource(host_usage, &av, NULL)) { continue; } // at this point we know the version is feasible, // so if config.prefer_primary_platform is set // we won't look any further. // found_feasible_version = true; // pick the fastest version. // Throw in a random factor in case the estimates are off. // double r = 1 + .1*rand_normal(); if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) { bavp->host_usage = host_usage; bavp->avp = &av; bavp->reliable = app_version_is_reliable(av.id); bavp->trusted = app_version_is_trusted(av.id); } } // loop over app versions if (config.prefer_primary_platform && found_feasible_version) { break; } } // loop over client platforms if (bavp->avp) { estimate_flops(bavp->host_usage, *bavp->avp); if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] Best version of app %s is [AV#%d] (%.2f GFLOPS)\n", app->name, bavp->avp->id, bavp->host_usage.projected_flops/1e9 ); } bavp->present = true; g_wreq->best_app_versions.push_back(bavp); } else { // Here if there's no app version we can use. // if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] returning NULL; platforms:\n" ); for (i=0; i<g_request->platforms.list.size(); i++) { PLATFORM* p = g_request->platforms.list[i]; log_messages.printf(MSG_NORMAL, "[version] %s\n", p->name ); } } g_wreq->best_app_versions.push_back(bavp); return NULL; } return bavp; }