Example No. 1
// If this resource is below min buffer level,
// return the highest-priority project that may have jobs for it.
//
PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool enforce_hyst) {
    PROJECT* pbest = NULL;
    if (enforce_hyst) {
        if (saturated_time > gstate.work_buf_min()) return NULL;
    }
    if (saturated_time > gstate.work_buf_total()) return NULL;

    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        if (p->pwf.cant_fetch_work_reason) continue;
        if (!project_state(p).may_have_work) continue;

        // if project has zero resource share,
        // only fetch work if a device is idle
        //
        if (p->resource_share == 0 && nidle_now == 0) {
            continue;
        }

        // if project has excluded GPUs of this type,
        // and it has runnable jobs for this type,
        // don't fetch work for it.
        // TODO: THIS IS CRUDE. Making it smarter would require
        // computing shortfall etc. on a per-project basis
        //
        if (rsc_type) {
            if (p->ncoprocs_excluded[rsc_type] == ninstances) {
                continue;
            }
            if (p->ncoprocs_excluded[rsc_type]
                && p->rsc_pwf[rsc_type].has_runnable_jobs
            ){
                continue;
            }
        }

        RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
        if (rpwf.anon_skip) continue;
        if (pbest) {
            if (pbest->sched_priority > p->sched_priority) {
                continue;
            }
        }
        pbest = p;
    }
    if (!pbest) return NULL;
    work_fetch.clear_request();
    work_fetch.set_all_requests_hyst(pbest, rsc_type);
    return pbest;
}
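
This first version keeps the selection rule simple: subject to the buffer hysteresis check, the loop keeps whichever eligible project has the highest sched_priority. The standalone sketch below models just that core loop; the Proj struct, pick_best(), and the numeric values are illustrative stand-ins, not BOINC's API.

#include <cstdio>
#include <vector>

struct Proj {
    const char* name;
    double sched_priority;   // higher means more entitled to work
    bool may_have_work;
};

// Model of the core of choose_project_hyst(): honor the hysteresis
// threshold, then keep the eligible project with the highest priority.
Proj* pick_best(std::vector<Proj>& projects, double saturated_time, double buf_min) {
    if (saturated_time > buf_min) return nullptr;   // buffer not low: no fetch
    Proj* best = nullptr;
    for (auto& p : projects) {
        if (!p.may_have_work) continue;
        if (best && best->sched_priority > p.sched_priority) continue;
        best = &p;
    }
    return best;
}

int main() {
    std::vector<Proj> projects = {
        {"A", -1.5, true}, {"B", 0.2, true}, {"C", 0.7, false},
    };
    if (Proj* p = pick_best(projects, 100.0, 3600.0)) {
        std::printf("would fetch from %s\n", p->name);   // prints "B"
    }
}

Note that, like the original, ties go to the later project in the list: the loop only skips a candidate whose priority is strictly lower than the current best.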
Example No. 2
// we're going to contact this project for reasons other than work fetch;
// decide if we should piggyback a work fetch request.
//
void WORK_FETCH::compute_work_request(PROJECT* p) {
    clear_request();
    if (config.fetch_minimal_work && gstate.had_or_requested_work) return;
    if (p->dont_request_more_work) return;
    if (p->non_cpu_intensive) {
        if (!has_a_job(p)) {
            cpu_work_fetch.req_secs = 1;
        }
        return;
    }

    // See if this is the project we'd ask for work anyway.
    // Temporarily clear resource backoffs,
    // since we're going to contact this project in any case.
    //
    double cpu_save = p->cpu_pwf.backoff_time;
    double cuda_save = p->cuda_pwf.backoff_time;
    double ati_save = p->ati_pwf.backoff_time;
    p->cpu_pwf.backoff_time = 0;
    p->cuda_pwf.backoff_time = 0;
    p->ati_pwf.backoff_time = 0;
    PROJECT* pbest = choose_project();
    p->cpu_pwf.backoff_time = cpu_save;
    p->cuda_pwf.backoff_time = cuda_save;
    p->ati_pwf.backoff_time = ati_save;
    if (p == pbest) {
        // Ask for work for all devices w/ a shortfall.
        // Otherwise we can have a situation where a GPU is idle,
        // we ask only for GPU work, and the project never has any
        //
        work_fetch.set_all_requests(pbest);
        return;
    }

    // if not, don't request any work
    //
    clear_request();
}
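
The save/zero/restore sequence around choose_project() is six lines of manual bookkeeping per call. One way to express the same pattern, sketched below with invented names (BackoffClear is not a BOINC class), is an RAII guard that restores the saved value when it goes out of scope, which also keeps the restore correct if the guarded call returns early or throws.

#include <cstdio>

// Scope guard: zero a backoff field on entry, restore it on exit.
struct BackoffClear {
    double& slot;
    double saved;
    explicit BackoffClear(double& s) : slot(s), saved(s) { slot = 0; }
    ~BackoffClear() { slot = saved; }
};

double cpu_backoff = 42.0;   // stand-in for p->cpu_pwf.backoff_time

bool would_choose() {
    // stand-in for choose_project(); it observes the cleared backoff
    return cpu_backoff == 0;
}

int main() {
    {
        BackoffClear guard(cpu_backoff);
        std::printf("during call: %s\n", would_choose() ? "chosen" : "skipped");
    }
    std::printf("after call: backoff = %.1f\n", cpu_backoff);   // 42.0 again
}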
Example No. 3
// Choose the best project to ask for work for this resource,
// given the specified criterion
//
PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
    PROJECT* pbest = NULL;

    switch (criterion) {
    case FETCH_IF_IDLE_INSTANCE:
        if (nidle_now == 0) return NULL;
        break;
    case FETCH_IF_MAJOR_SHORTFALL:
        if (saturated_time > gstate.work_buf_min()) return NULL;
        break;
    case FETCH_IF_MINOR_SHORTFALL:
        if (saturated_time > gstate.work_buf_total()) return NULL;
        break;
    case FETCH_IF_PROJECT_STARVED:
        if (deadline_missed_instances >= ninstances) return NULL;
        break;
    }

    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        if (p->pwf.cant_fetch_work_reason) continue;
        if (!project_state(p).may_have_work) continue;
        RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
        if (rpwf.anon_skip) continue;
        switch (criterion) {
        case FETCH_IF_MAJOR_SHORTFALL:
        case FETCH_IF_MINOR_SHORTFALL:
            if (wacky_dcf(p)) continue;
            if (!p->resource_share) continue;
            break;
        case FETCH_IF_PROJECT_STARVED:
            if (p->sched_priority < 0) continue;
            if (rpwf.nused_total >= ninstances) continue;
            if (!p->resource_share) continue;
            break;
        }

        if (pbest) {
            if (!p->resource_share) {
                continue;
            }
            if (pbest->sched_priority > p->sched_priority) {
                continue;
            }
        }
        pbest = p;
    }
    if (!pbest) return NULL;

    // decide how much work to request from each resource
    //
    work_fetch.clear_request();
    switch (criterion) {
    case FETCH_IF_IDLE_INSTANCE:
    case FETCH_IF_MAJOR_SHORTFALL:
    case FETCH_IF_PROJECT_STARVED:
        set_request(pbest);
        break;
    case FETCH_IF_MINOR_SHORTFALL:
        // in this case, potentially request work for all resources
        //
        if (pbest->sched_priority < 0) {
            set_request(pbest);
        } else {
            work_fetch.set_all_requests(pbest);
        }
        break;
    }
    // in principle there should be a nonzero request.
    // check, just in case
    //
    if (!req_secs && !req_instances) {
        if (log_flags.work_fetch_debug) {
            msg_printf(pbest, MSG_INFO,
                "[work_fetch] error: project chosen but zero request"
            );
        }
        return NULL;
    }

    if (log_flags.work_fetch_debug) {
        msg_printf(pbest, MSG_INFO,
            "[work_fetch] chosen: %s %s: %.2f inst, %.2f sec",
            criterion_name(criterion), rsc_name(rsc_type),
            req_instances, req_secs
        );
    }

    return pbest;
}
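
A chooser parameterized by criterion like this one is naturally driven by a loop that tries criteria from most to least urgent and stops at the first hit. The driver below is a hypothetical sketch of such a call pattern (try_criterion() stands in for rsc_work_fetch.choose_project(c)); it is not the actual BOINC call site.

#include <cstdio>

enum Criterion {
    FETCH_IF_IDLE_INSTANCE,
    FETCH_IF_MAJOR_SHORTFALL,
    FETCH_IF_MINOR_SHORTFALL,
    FETCH_IF_PROJECT_STARVED,
};

const char* try_criterion(Criterion c) {
    // stand-in for the chooser; pretend only the
    // minor-shortfall pass finds an eligible project
    return c == FETCH_IF_MINOR_SHORTFALL ? "ProjectB" : nullptr;
}

int main() {
    const Criterion order[] = {
        FETCH_IF_IDLE_INSTANCE,      // an instance is idle right now
        FETCH_IF_MAJOR_SHORTFALL,    // buffer below min level
        FETCH_IF_MINOR_SHORTFALL,    // buffer below max level
        FETCH_IF_PROJECT_STARVED,    // a project has unused instances
    };
    for (Criterion c : order) {
        if (const char* p = try_criterion(c)) {
            std::printf("fetch from %s\n", p);
            break;      // one request per work-fetch pass
        }
    }
}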
Example No. 4
// If this resource is below min buffer level,
// return the highest-priority project that may have jobs for it.
//
// If strict is true, enforce hysteresis and backoff rules
// (which are there to limit rate of scheduler RPCs).
// Otherwise, we're going to do a scheduler RPC anyway
// and we're deciding whether to piggyback a work request,
// so there is no reason to enforce these rules.
//
PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
    PROJECT* pbest = NULL;
    if (strict) {
        if (saturated_time > gstate.work_buf_min()) return NULL;
    } else {
        if (saturated_time > gstate.work_buf_total()) return NULL;
    }

    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];

        // check whether we can fetch work of any type from this project
        //
        if (p->pwf.cant_fetch_work_reason) continue;

        // check whether we can fetch work of this type
        //
        if (dont_fetch(p, rsc_type)) continue;

        // if strict, check backoff
        //
        if (strict) {
            if (project_state(p).backoff_time > gstate.now) {
                continue;
            }
        }

        // if project has zero resource share,
        // only fetch work if a device is idle
        //
        if (p->resource_share == 0 && nidle_now == 0) {
            continue;
        }

        // if project has excluded GPUs of this type,
        // and it has more runnable jobs than non-excluded instances,
        // don't fetch work for it.
        // TODO: THIS IS CRUDE. Making it smarter would require
        // computing shortfall etc. on a per-project basis
        //
        if (rsc_type) {
            int n_not_excluded = ninstances - p->ncoprocs_excluded[rsc_type];
            if (n_not_excluded == 0) {
                continue;
            }
            if (p->ncoprocs_excluded[rsc_type]
                && p->rsc_pwf[rsc_type].n_runnable_jobs > n_not_excluded
            ) {
                continue;
            }
        }

        RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
        if (rpwf.anon_skip) continue;
        if (pbest) {
            if (pbest->sched_priority > p->sched_priority) {
                continue;
            }
        }
        pbest = p;
    }
    if (!pbest) return NULL;
    work_fetch.clear_request();
    work_fetch.set_all_requests_hyst(pbest, rsc_type);
    return pbest;
}
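
The exclusion test in this version boils down to small integer arithmetic: a project with excluded GPUs is skipped once it already has more runnable jobs than non-excluded instances. A self-contained model, with made-up counts:

#include <cstdio>

// Returns true if work fetch should be skipped for this project,
// mirroring the exclusion test above.
bool skip_for_exclusions(int ninstances, int nexcluded, int n_runnable_jobs) {
    int n_not_excluded = ninstances - nexcluded;
    if (n_not_excluded == 0) return true;   // project can't use any instance
    return nexcluded > 0 && n_runnable_jobs > n_not_excluded;
}

int main() {
    std::printf("%d\n", skip_for_exclusions(4, 3, 2));  // 1: 2 jobs > 1 usable GPU
    std::printf("%d\n", skip_for_exclusions(4, 3, 1));  // 0: can still fetch
    std::printf("%d\n", skip_for_exclusions(4, 4, 0));  // 1: all GPUs excluded
}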
Example No. 5
// If this resource is below min buffer level,
// return the highest-priority project that may have jobs for it.
//
// If the resource has instances starved because of exclusions,
// return the highest-priority project that may have jobs
// and doesn't exclude those instances.
//
// Only choose a project if the buffer is below min level;
// if strict_hyst is false, relax this to max level
//
// If backoff_exempt_project is non-NULL,
// don't enforce resource backoffs for that project;
// this is for when we're going to do a scheduler RPC anyway
// and we're deciding whether to piggyback a work request
//
PROJECT* RSC_WORK_FETCH::choose_project_hyst(
    bool strict_hyst,
    PROJECT* backoff_exempt_project
) {
    PROJECT* pbest = NULL;
    bool buffer_low = true;
    if (strict_hyst) {
        if (saturated_time > gstate.work_buf_min()) buffer_low = false;
    } else {
        if (saturated_time > gstate.work_buf_total()) buffer_low = false;
    }

    if (log_flags.work_fetch_debug) {
        msg_printf(0, MSG_INFO,
            "[work_fetch] choose_project() for %s: buffer_low: %s; sim_excluded_instances %d\n",
            rsc_name(rsc_type), buffer_low?"yes":"no", sim_excluded_instances
        );
    }

    if (!buffer_low && !sim_excluded_instances) return NULL;

    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];

        // check whether we can fetch work of any type from this project
        //
        if (p->pwf.cant_fetch_work_reason) {
            //msg_printf(p, MSG_INFO, "skip: cfwr %d", p->pwf.cant_fetch_work_reason);
            continue;
        }

        // see whether work fetch for this resource is banned
        // by prefs, config, project, or acct mgr
        //
        if (dont_fetch(p, rsc_type)) {
            //msg_printf(p, MSG_INFO, "skip: dont_fetch");
            continue;
        }

        // check backoff
        //
        if (p != backoff_exempt_project) {
            if (project_state(p).backoff_time > gstate.now) {
                //msg_printf(p, MSG_INFO, "skip: backoff");
                continue;
            }
        }

        // if project has zero resource share,
        // only fetch work if a device is idle
        //
        if (p->resource_share == 0 && nidle_now == 0) {
            //msg_printf(p, MSG_INFO, "skip: zero share");
            continue;
        }

        // if project has excluded GPUs of this type,
        // we need to avoid fetching work just because there's an idle instance
        // or a shortfall;
        // fetching work might not alleviate either of these,
        // and we'd end up fetching unbounded work.
        // At the same time, we want to respect work buf params if possible.
        //
        // Current policy:
        // don't fetch work if remaining time of this project's jobs
        // exceeds work_buf_min * (#usable instances / #instances)
        //
        // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require
        // computing shortfall etc. on a per-project basis
        //
        int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded;
        if (rsc_type && nexcl) {
            int n_not_excluded = ninstances - nexcl;
            if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) {
                //msg_printf(p, MSG_INFO, "skip: too much work");
                continue;
            }
        }

        RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
        if (rpwf.anon_skip) {
            //msg_printf(p, MSG_INFO, "skip: anon");
            continue;
        }

        // if we're sending work only because of exclusion starvation,
        // make sure this project can use the starved instances
        //
        if (!buffer_low) {
            if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) {
                //msg_printf(p, MSG_INFO, "skip: excl");
                continue;
            }
        }

        if (pbest) {
            if (pbest->sched_priority > p->sched_priority) {
                //msg_printf(p, MSG_INFO, "skip: prio");
                continue;
            }
        }
        pbest = p;
    }
    if (!pbest) {
        if (log_flags.work_fetch_debug) {
            msg_printf(0, MSG_INFO,
                "[work_fetch] no eligible project for %s",
                rsc_name(rsc_type)
            );
        }
        return NULL;
    }
    work_fetch.clear_request();
    if (buffer_low) {
        work_fetch.set_all_requests_hyst(pbest, rsc_type);
    } else {
        set_request_excluded(pbest);
    }
    return pbest;
}
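
Two details of this final version are easy to check with concrete numbers: the per-project buffer threshold is work_buf_min scaled by the fraction of usable instances, and exclusion starvation is decided by intersecting instance bitmasks. The sketch below uses invented values; the variable names mirror the fields above, but nothing here is BOINC API.

#include <cstdio>

int main() {
    // (1) scaled work-buffer threshold: with 4 instances and 3 excluded,
    // this project only gets work while its queued time is under
    // work_buf_min * 1/4
    double work_buf_min = 3600;          // seconds
    int ninstances = 4, nexcl = 3;
    double queue_est = 1200;             // this project's queued GPU time
    int n_not_excluded = ninstances - nexcl;
    bool skip = queue_est > (work_buf_min * n_not_excluded) / ninstances;
    std::printf("skip (1200s > 900s): %s\n", skip ? "yes" : "no");  // yes

    // (2) exclusion-starvation bitmask: bit i set means "instance i";
    // a project can relieve starved instances only if the masks intersect
    unsigned sim_excluded_instances = 0b0100;  // instance 2 starved
    unsigned non_excluded_A = 0b0011;          // project A: instances 0,1
    unsigned non_excluded_B = 0b1100;          // project B: instances 2,3
    std::printf("A can help: %s\n",
        (sim_excluded_instances & non_excluded_A) ? "yes" : "no");  // no
    std::printf("B can help: %s\n",
        (sim_excluded_instances & non_excluded_B) ? "yes" : "no");  // yes
}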