/// Do periodic checks on running apps: /// - get latest CPU time and % done info /// - check if any has exited, and clean up /// - see if any has exceeded its CPU or disk space limits, and abort it bool ACTIVE_TASK_SET::poll() { bool action; unsigned int i; static double last_time = 0; if (gstate.now - last_time < 1.0) return false; last_time = gstate.now; action = check_app_exited(); send_heartbeats(); send_trickle_downs(); graphics_poll(); process_control_poll(); get_memory_usage(); action |= check_rsc_limits_exceeded(); get_msgs(); for (i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() == PROCESS_ABORT_PENDING) { if (gstate.now > atp->abort_time + ABORT_TIMEOUT) { atp->kill_task(false); } } if (atp->task_state() == PROCESS_QUIT_PENDING) { if (gstate.now > atp->quit_time + QUIT_TIMEOUT) { atp->kill_task(true); } } } if (action) { gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll"); } return action; }
// suspend all currently running tasks // called only from CLIENT_STATE::suspend_tasks(), // e.g. because on batteries, time of day, benchmarking, CPU throttle, etc. // void ACTIVE_TASK_SET::suspend_all(int reason) { for (unsigned int i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; switch (reason) { case SUSPEND_REASON_CPU_THROTTLE: // if we're doing CPU throttling, don't bother suspending apps // that don't use a full CPU // if (atp->result->dont_throttle()) continue; if (atp->app_version->avg_ncpus < 1) continue; atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_BENCHMARKS: atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_CPU_USAGE: // If we're suspending because of non-BOINC CPU load, // don't remove from memory. // Some systems do a security check when apps are launched, // which uses a lot of CPU. // Avoid going into a preemption loop. // if (atp->result->non_cpu_intensive()) break; atp->preempt(REMOVE_NEVER); break; default: atp->preempt(REMOVE_MAYBE_USER); } } }
// check for msgs from active tasks, // and update their elapsed time and other info // void ACTIVE_TASK_SET::get_msgs() { //LOGD("app_control: ACTIVE_TASK::get_msgs"); unsigned int i; ACTIVE_TASK *atp; double old_time; static double last_time=0; double delta_t; if (last_time) { delta_t = gstate.now - last_time; // Normally this is called every second. // If delta_t is > 10, we'll assume that a period of hibernation // or suspension happened, and treat it as zero. // If negative, must be clock reset. Ignore. // if (delta_t > 10 || delta_t < 0) { delta_t = 0; } } else { delta_t = 0; } last_time = gstate.now; for (i=0; i<active_tasks.size(); i++) { atp = active_tasks[i]; if (!atp->process_exists()) continue; old_time = atp->checkpoint_cpu_time; if (atp->task_state() == PROCESS_EXECUTING) { atp->elapsed_time += delta_t; } if (atp->get_app_status_msg()) { if (old_time != atp->checkpoint_cpu_time) { char buf[256]; sprintf(buf, "%s checkpointed", atp->result->name); if (atp->overdue_checkpoint) { gstate.request_schedule_cpus(buf); } atp->checkpoint_wall_time = gstate.now; atp->premature_exit_count = 0; atp->checkpoint_elapsed_time = atp->elapsed_time; atp->checkpoint_fraction_done = atp->fraction_done; atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time; if (log_flags.checkpoint_debug) { msg_printf(atp->wup->project, MSG_INFO, "[checkpoint] result %s checkpointed", atp->result->name ); } else if (log_flags.task_debug) { msg_printf(atp->wup->project, MSG_INFO, "[task] result %s checkpointed", atp->result->name ); } atp->write_task_state_file(); } } atp->get_trickle_up_msg(); atp->get_graphics_msg(); } }
// Write the screensaver task list to fout:
// the current suspend reason, followed by the GUI description of each task
// that is executing, or that is suspended while the suspend reason
// is CPU throttling.
//
static void handle_get_screensaver_tasks(MIOFILE& fout) {
    fout.printf(
        "<handle_get_screensaver_tasks>\n"
        " <suspend_reason>%d</suspend_reason>\n",
        gstate.suspend_reason
    );

    for (unsigned int k = 0; k < gstate.active_tasks.active_tasks.size(); k++) {
        ACTIVE_TASK* task = gstate.active_tasks.active_tasks[k];

        bool show = (task->task_state() == PROCESS_EXECUTING);
        if (!show) {
            // throttled tasks count as running for the screensaver
            show = (task->task_state() == PROCESS_SUSPENDED)
                && (gstate.suspend_reason == SUSPEND_REASON_CPU_THROTTLE);
        }
        if (show) {
            task->result->write_gui(fout);
        }
    }

    fout.printf("</handle_get_screensaver_tasks>\n");
}
// resume all currently scheduled tasks // void ACTIVE_TASK_SET::unsuspend_all() { unsigned int i; ACTIVE_TASK* atp; for (i=0; i<active_tasks.size(); i++) { atp = active_tasks[i]; if (atp->scheduler_state != CPU_SCHED_SCHEDULED) continue; if (atp->task_state() == PROCESS_UNINITIALIZED) { if (atp->start(false)) { msg_printf(atp->wup->project, MSG_INTERNAL_ERROR, "Couldn't restart task %s", atp->result->name ); } } else if (atp->task_state() == PROCESS_SUSPENDED) { atp->unsuspend(); } } }
// Check if any of the active tasks have exceeded their // resource limits on disk, CPU time or memory // bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() { //LOGD("app_control: ACTIVE_TASK_SET::check_rsc_limits_exceeded"); unsigned int i; ACTIVE_TASK *atp; static double last_disk_check_time = 0; bool do_disk_check = false; bool did_anything = false; double ram_left = gstate.available_ram(); double max_ram = gstate.max_available_ram(); // Some slot dirs have lots of files, // so only check every min(disk_interval, 300) secs // double min_interval = gstate.global_prefs.disk_interval; if (min_interval < 300) min_interval = 300; if (gstate.now > last_disk_check_time + min_interval) { do_disk_check = true; } for (i=0; i<active_tasks.size(); i++) { atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; if (!atp->result->non_cpu_intensive() && (atp->elapsed_time > atp->max_elapsed_time)) { msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: exceeded elapsed time limit %.2f (%.2fG/%.2fG)", atp->result->name, atp->max_elapsed_time, atp->result->wup->rsc_fpops_bound/1e9, atp->result->avp->flops/1e9 ); atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded"); did_anything = true; continue; } if (atp->procinfo.working_set_size_smoothed > max_ram) { msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: exceeded memory limit %.2fMB > %.2fMB\n", atp->result->name, atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA ); atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum memory exceeded"); did_anything = true; continue; } if (do_disk_check && atp->check_max_disk_exceeded()) { did_anything = true; continue; } ram_left -= atp->procinfo.working_set_size_smoothed; } if (ram_left < 0) { gstate.request_schedule_cpus("RAM usage limit exceeded"); } if (do_disk_check) { last_disk_check_time = gstate.now; } return did_anything; }
// Check to see if any tasks are running // called if benchmarking and waiting for suspends to happen // or the system needs to suspend itself so we are suspending // the applications // bool ACTIVE_TASK_SET::is_task_executing() { unsigned int i; ACTIVE_TASK* atp; for (i=0; i<active_tasks.size(); i++) { atp = active_tasks[i]; if (atp->task_state() == PROCESS_EXECUTING) { return true; } } return false; }
// clean up after finished apps // bool CLIENT_STATE::handle_finished_apps() { ACTIVE_TASK* atp; bool action = false; static double last_time = 0; if (!clock_change && now - last_time < HANDLE_FINISHED_APPS_PERIOD) return false; last_time = now; vector<ACTIVE_TASK*>::iterator iter; iter = active_tasks.active_tasks.begin(); while (iter != active_tasks.active_tasks.end()) { atp = *iter; switch (atp->task_state()) { case PROCESS_EXITED: case PROCESS_WAS_SIGNALED: case PROCESS_EXIT_UNKNOWN: case PROCESS_COULDNT_START: case PROCESS_ABORTED: if (log_flags.task) { msg_printf(atp->wup->project, MSG_INFO, "Computation for task %s finished", atp->result->name ); } app_finished(*atp); if (!action) { adjust_rec(); // update REC before erasing ACTIVE_TASK } iter = active_tasks.active_tasks.erase(iter); delete atp; set_client_state_dirty("handle_finished_apps"); // the following is critical; otherwise the result is // still in the "scheduled" list and enforce_schedule() // will try to run it again. // request_schedule_cpus("handle_finished_apps"); action = true; break; default: ++iter; } } return action; }
// suspend all currently running tasks // called only from CLIENT_STATE::suspend_tasks(), // e.g. because on batteries, time of day, benchmarking, CPU throttle, etc. // void ACTIVE_TASK_SET::suspend_all(int reason) { for (unsigned int i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; switch (reason) { case SUSPEND_REASON_CPU_THROTTLE: // if we're doing CPU throttling, don't bother suspending apps // that don't use a full CPU // if (atp->result->project->non_cpu_intensive) continue; if (atp->app_version->avg_ncpus < 1) continue; atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_BENCHMARKS: atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_CPU_USAGE: if (atp->result->project->non_cpu_intensive) break; // fall through default: atp->preempt(REMOVE_MAYBE_USER); } } }
// scan the set of all processes to
// 1) get the working-set size of active tasks
// 2) see if exclusive apps are running
// 3) get CPU time of non-BOINC processes
//
// Does nothing if less than MEMORY_USAGE_PERIOD has passed since
// the previous scan (the first call always scans).
// Results are stored in each task's procinfo and peak_* fields,
// in exclusive_app_running / exclusive_gpu_app_running (timestamps),
// and in non_boinc_cpu_usage.
// Rates (page faults, non-BOINC CPU usage) need two samples,
// so they are only computed from the second scan on.
//
void ACTIVE_TASK_SET::get_memory_usage() {
    static double last_mem_time=0;
    unsigned int i;
    int retval;
    static bool first = true;
    static double last_cpu_time;
    double diff=0;

    if (!first) {
        diff = gstate.now - last_mem_time;
        if (diff < 0 || diff > MEMORY_USAGE_PERIOD + 10) {
            // user has changed system clock,
            // or there has been a long system sleep
            //
            // restart the measurement interval; rates over this gap
            // would be meaningless
            last_mem_time = gstate.now;
            return;
        }
        if (diff < MEMORY_USAGE_PERIOD) return;
    }

    last_mem_time = gstate.now;
    PROC_MAP pm;
    retval = procinfo_setup(pm);
    if (retval) {
        if (log_flags.mem_usage_debug) {
            msg_printf(NULL, MSG_INTERNAL_ERROR,
                "[mem_usage] procinfo_setup() returned %d", retval
            );
        }
        return;
    }

    // totals over all BOINC tasks; only maintained and reported
    // when mem_usage_debug logging is on
    PROCINFO boinc_total;
    if (log_flags.mem_usage_debug) {
        boinc_total.clear();
        boinc_total.working_set_size_smoothed = 0;
    }
    for (i=0; i<active_tasks.size(); i++) {
        ACTIVE_TASK* atp = active_tasks[i];
        if (atp->task_state() == PROCESS_UNINITIALIZED) continue;
        if (atp->pid ==0) continue;

        // scan all active tasks with a process, even if not scheduled, because
        // 1) we might have recently suspended a tasks,
        //    and we still need to count its time
        // 2) preempted tasks might not actually suspend themselves
        //    (and we'd count that as non-BOINC CPU usage
        //    and suspend everything).

        PROCINFO& pi = atp->procinfo;

        // save the previous count before clear() so the page fault rate
        // can be computed below
        unsigned long last_page_fault_count = pi.page_fault_count;
        pi.clear();
        pi.id = atp->pid;
        vector<int>* v = NULL;
        if (atp->other_pids.size()>0) {
            v = &(atp->other_pids);
        }
        procinfo_app(pi, v, pm, atp->app_version->graphics_exec_file);
        if (atp->app_version->is_vm_app) {
            // the memory of virtual machine apps is not reported correctly,
            // at least on Windows.  Use the VM size instead.
            //
            pi.working_set_size_smoothed = atp->wup->rsc_memory_bound;
        } else {
            // average the previous smoothed value with the new sample
            // NOTE(review): this assumes pi.clear() above leaves
            // working_set_size_smoothed intact - confirm in PROCINFO::clear()
            pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size);
        }

        // track per-task peaks
        if (pi.working_set_size > atp->peak_working_set_size) {
            atp->peak_working_set_size = pi.working_set_size;
        }
        if (pi.swap_size > atp->peak_swap_size) {
            atp->peak_swap_size = pi.swap_size;
        }

        if (!first) {
            // diff >= MEMORY_USAGE_PERIOD here (checked at the top)
            int pf = pi.page_fault_count - last_page_fault_count;
            pi.page_fault_rate = pf/diff;
            if (log_flags.mem_usage_debug) {
                msg_printf(atp->result->project, MSG_INFO,
                    "[mem_usage] %s%s: WS %.2fMB, smoothed %.2fMB, swap %.2fMB, %.2f page faults/sec, user CPU %.3f, kernel CPU %.3f",
                    atp->scheduler_state==CPU_SCHED_SCHEDULED?"":" (not running)",
                    atp->result->name,
                    pi.working_set_size/MEGA,
                    pi.working_set_size_smoothed/MEGA,
                    pi.swap_size/MEGA,
                    pi.page_fault_rate,
                    pi.user_time,
                    pi.kernel_time
                );
                boinc_total.working_set_size += pi.working_set_size;
                boinc_total.working_set_size_smoothed += pi.working_set_size_smoothed;
                boinc_total.swap_size += pi.swap_size;
                boinc_total.page_fault_rate += pi.page_fault_rate;
            }
        }
    }
    if (!first) {
        if (log_flags.mem_usage_debug) {
            msg_printf(0, MSG_INFO,
                "[mem_usage] BOINC totals: WS %.2fMB, smoothed %.2fMB, swap %.2fMB, %.2f page faults/sec",
                boinc_total.working_set_size/MEGA,
                boinc_total.working_set_size_smoothed/MEGA,
                boinc_total.swap_size/MEGA,
                boinc_total.page_fault_rate
            );
        }
    }

    // record the time at which an exclusive app was last seen running;
    // one match is enough, so stop at the first
    for (i=0; i<cc_config.exclusive_apps.size(); i++) {
        if (app_running(pm, cc_config.exclusive_apps[i].c_str())) {
            if (log_flags.mem_usage_debug) {
                msg_printf(NULL, MSG_INFO,
                    "[mem_usage] exclusive app %s is running",
                    cc_config.exclusive_apps[i].c_str()
                );
            }
            exclusive_app_running = gstate.now;
            break;
        }
    }

    // same for exclusive GPU apps
    for (i=0; i<cc_config.exclusive_gpu_apps.size(); i++) {
        if (app_running(pm, cc_config.exclusive_gpu_apps[i].c_str())) {
            if (log_flags.mem_usage_debug) {
                msg_printf(NULL, MSG_INFO,
                    "[mem_usage] exclusive GPU app %s is running",
                    cc_config.exclusive_gpu_apps[i].c_str()
                );
            }
            exclusive_gpu_app_running = gstate.now;
            break;
        }
    }

    // get info on non-BOINC processes.
    // mem usage info is not useful because most OSs don't
    // move idle processes out of RAM, so physical memory is always full.
    // Also (at least on Win) page faults are used for various things,
    // not all of them generate disk I/O,
    // so they're not useful for detecting paging/thrashing.
    //
    PROCINFO pi;
    procinfo_non_boinc(pi, pm);
    if (log_flags.mem_usage_debug) {
        //procinfo_show(pm);
        msg_printf(NULL, MSG_INFO,
            "[mem_usage] All others: WS %.2fMB, swap %.2fMB, user %.3fs, kernel %.3fs",
            pi.working_set_size/MEGA, pi.swap_size/MEGA,
            pi.user_time, pi.kernel_time
        );
    }
    double new_cpu_time = pi.user_time + pi.kernel_time;
    if (!first) {
        // fraction of total CPU capacity (all p_ncpus processors)
        // used by non-BOINC processes since the previous scan
        non_boinc_cpu_usage = (new_cpu_time - last_cpu_time)/(diff*gstate.host_info.p_ncpus);
        // processes might have exited in the last 10 sec,
        // causing this to be negative.
        if (non_boinc_cpu_usage < 0) non_boinc_cpu_usage = 0;
        if (log_flags.mem_usage_debug) {
            msg_printf(NULL, MSG_INFO,
                "[mem_usage] non-BOINC CPU usage: %.2f%%", non_boinc_cpu_usage*100
            );
        }
    }
    last_cpu_time = new_cpu_time;
    first = false;
}
void show_resource(int rsc_type) { unsigned int i; char buf[256]; fprintf(html_out, "<td width=%d valign=top>", WIDTH2); bool found = false; for (i=0; i<gstate.active_tasks.active_tasks.size(); i++) { ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i]; RESULT* rp = atp->result; if (atp->task_state() != PROCESS_EXECUTING) continue; double ninst=0; if (rsc_type) { if (rp->avp->gpu_usage.rsc_type != rsc_type) continue; ninst = rp->avp->gpu_usage.usage; } else { ninst = rp->avp->avg_ncpus; } PROJECT* p = rp->project; if (!found) { found = true; fprintf(html_out, "<table>\n" "<tr><th>#devs</th><th>Job name (* = high priority)</th><th>GFLOPs left</th>%s</tr>\n", rsc_type?"<th>GPU</th>":"" ); } if (rsc_type) { sprintf(buf, "<td>%d</td>", rp->coproc_indices[0]); } else { safe_strcpy(buf, ""); } fprintf(html_out, "<tr valign=top><td>%.2f</td><td bgcolor=%s><font color=#ffffff>%s%s</font></td><td>%.0f</td>%s</tr>\n", ninst, colors[p->index%NCOLORS], rp->edf_scheduled?"*":"", rp->name, rp->sim_flops_left/1e9, buf ); } if (found) { fprintf(html_out, "</table>\n"); } else { fprintf(html_out, "IDLE\n"); } fprintf(html_out, "<table><tr><td>Project</td><td>In progress</td><td>done</td><td>REC</td></tr>\n" ); found = false; for (i=0; i<gstate.projects.size(); i++) { PROJECT* p = gstate.projects[i]; int in_progress, done; job_count(p, rsc_type, in_progress, done); if (in_progress || done) { fprintf(html_out, "<td bgcolor=%s><font color=#ffffff>%s</font></td><td>%d</td><td>%d</td><td>%.3f</td></tr>\n", colors[p->index%NCOLORS], p->project_name, in_progress, done, p->pwf.rec ); found = true; } } //if (!found) fprintf(html_out, " ---\n"); fprintf(html_out, "</table></td>"); }
bool ACTIVE_TASK_SET::poll() { unsigned int i; char buf[256]; bool action = false; static double last_time = START_TIME; double diff = gstate.now - last_time; if (diff < 1.0) return false; last_time = gstate.now; if (diff > delta) { diff = 0; } PROJECT* p; for (i=0; i<gstate.projects.size(); i++) { p = gstate.projects[i]; p->idle = true; } // we do two kinds of FLOPs accounting: // 1) actual FLOPS (for job completion) // 2) peak FLOPS (for total and per-project resource usage) // // CPU may be overcommitted, in which case we compute // a "cpu_scale" factor that is < 1. // GPUs are never overcommitted. // // actual FLOPS is based on app_version.flops, scaled by cpu_scale for CPU jobs // peak FLOPS is based on device peak FLOPS, // with CPU component scaled by cpu_scale for all jobs // get CPU usage by GPU and CPU jobs // double cpu_usage_cpu=0; double cpu_usage_gpu=0; for (i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; RESULT* rp = atp->result; if (rp->uses_gpu()) { if (gpu_active) { cpu_usage_gpu += rp->avp->avg_ncpus; } } else { cpu_usage_cpu += rp->avp->avg_ncpus; } } double cpu_usage = cpu_usage_cpu + cpu_usage_gpu; // if CPU is overcommitted, compute cpu_scale // double cpu_scale = 1; if (cpu_usage > gstate.ncpus) { cpu_scale = (gstate.ncpus - cpu_usage_gpu) / (cpu_usage - cpu_usage_gpu); } double used = 0; for (i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; RESULT* rp = atp->result; if (!gpu_active && rp->uses_gpu()) { continue; } atp->elapsed_time += diff; double flops = rp->avp->flops; if (!rp->uses_gpu()) { flops *= cpu_scale; } rp->sim_flops_left -= diff*flops; atp->fraction_done = 1 - rp->sim_flops_left / rp->wup->rsc_fpops_est; atp->checkpoint_wall_time = gstate.now; if (rp->sim_flops_left <= 0) { atp->set_task_state(PROCESS_EXITED, "poll"); rp->exit_status = 0; rp->ready_to_report = true; 
gstate.request_schedule_cpus("job finished"); gstate.request_work_fetch("job finished"); sprintf(buf, "result %s finished<br>", rp->name); html_msg += buf; action = true; } double pf = diff * app_peak_flops(rp->avp, cpu_scale); rp->project->project_results.flops_used += pf; rp->peak_flop_count += pf; sim_results.flops_used += pf; used += pf; rp->project->idle = false; } for (i=0; i<gstate.projects.size(); i++) { p = gstate.projects[i]; if (p->idle) { p->idle_time += diff; p->idle_time_sumsq += diff*(p->idle_time*p->idle_time); } else { p->idle_time = 0; } } active_time += diff; if (gpu_active) { gpu_active_time += diff; } return action; }
void simulate() { bool action; double start = START_TIME; gstate.now = start; html_start(); fprintf(summary_file, "Hardware summary\n %d CPUs, %.1f GFLOPS\n", gstate.host_info.p_ncpus, gstate.host_info.p_fpops/1e9 ); for (int i=1; i<coprocs.n_rsc; i++) { fprintf(summary_file, " %d %s GPUs, %.1f GFLOPS\n", coprocs.coprocs[i].count, coprocs.coprocs[i].type, coprocs.coprocs[i].peak_flops/1e9 ); } fprintf(summary_file, "Preferences summary\n" " work buf min %f max %f\n" " Scheduling period %f\n" "Scheduling policies\n" " Round-robin only: %s\n" " Scheduler EDF simulation: %s\n" " REC half-life: %f\n", gstate.work_buf_min(), gstate.work_buf_total(), gstate.global_prefs.cpu_scheduling_period(), cpu_sched_rr_only?"yes":"no", server_uses_workload?"yes":"no", cc_config.rec_half_life ); fprintf(summary_file, "Jobs\n"); for (unsigned int i=0; i<gstate.results.size(); i++) { RESULT* rp = gstate.results[i]; fprintf(summary_file, " %s %s (%s)\n time left %s deadline %s\n", rp->project->project_name, rp->name, rsc_name_long(rp->avp->gpu_usage.rsc_type), timediff_format(rp->sim_flops_left/rp->avp->flops).c_str(), timediff_format(rp->report_deadline - START_TIME).c_str() ); } fprintf(summary_file, "Simulation parameters\n" " time step %f, duration %f\n" "-------------------\n", delta, duration ); write_inputs(); while (1) { on = on_proc.sample(delta); if (on) { active = active_proc.sample(delta); if (active) { gpu_active = gpu_active_proc.sample(delta); } else { gpu_active = false; } connected = connected_proc.sample(delta); } else { active = gpu_active = connected = false; } // do accounting for the period that just ended, // even if we're now in an "off" state. 
// // need both of the following, else crash // action |= gstate.active_tasks.poll(); action |= gstate.handle_finished_apps(); if (on) { while (1) { action = false; action |= gstate.schedule_cpus(); if (connected) { action |= gstate.scheduler_rpc_poll(); // this deletes completed results } action |= gstate.active_tasks.poll(); action |= gstate.handle_finished_apps(); gpu_suspend_reason = gpu_active?0:1; //msg_printf(0, MSG_INFO, action?"did action":"did no action"); if (!action) break; } } //msg_printf(0, MSG_INFO, "took time step"); for (unsigned int i=0; i<gstate.active_tasks.active_tasks.size(); i++) { ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i]; if (atp->task_state() == PROCESS_EXECUTING) { atp->elapsed_time += delta; } } html_rec(); write_recs(); gstate.now += delta; if (gstate.now > start + duration) break; } html_end(); }
// Do periodic checks on running apps: // - get latest CPU time and % done info // - check if any has exited, and clean up // - see if any has exceeded its CPU or disk space limits, and abort it // bool ACTIVE_TASK_SET::poll() { bool action; unsigned int i; static double last_time = 0; if (!gstate.clock_change && gstate.now - last_time < TASK_POLL_PERIOD) return false; last_time = gstate.now; action = check_app_exited(); send_heartbeats(); send_trickle_downs(); process_control_poll(); action |= check_rsc_limits_exceeded(); get_msgs(); for (i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() == PROCESS_ABORT_PENDING) { if (gstate.now > atp->abort_time + ABORT_TIMEOUT) { if (log_flags.task_debug) { msg_printf(atp->result->project, MSG_INFO, "[task] abort request timed out, killing task %s", atp->result->name ); } atp->kill_task(false); } } if (atp->task_state() == PROCESS_QUIT_PENDING) { if (gstate.now > atp->quit_time + QUIT_TIMEOUT) { if (log_flags.task_debug) { msg_printf(atp->result->project, MSG_INFO, "[task] quit request timed out, killing task %s", atp->result->name ); } atp->kill_task(true); } } } // Check for finish files every 10 sec. // If we already found a finish file, abort the app; // it must be hung somewhere in boinc_finish(); // static double last_finish_check_time = 0; if (gstate.clock_change || gstate.now - last_finish_check_time > 10) { last_finish_check_time = gstate.now; for (i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; if (atp->task_state() == PROCESS_UNINITIALIZED) continue; if (atp->finish_file_time) { // process is still there 10 sec after it wrote finish file. // abort the job atp->abort_task(EXIT_ABORTED_BY_CLIENT, "finish file present too long"); } else if (atp->finish_file_present()) { atp->finish_file_time = gstate.now; } } } if (action) { gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll"); } return action; }
// Check if any of the active tasks have exceeded their // resource limits on disk, CPU time or memory // // TODO: this gets called ever 1 sec, // but mem and disk usage are computed less often. // refactor. // bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() { unsigned int i; ACTIVE_TASK *atp; static double last_disk_check_time = 0; bool do_disk_check = false; bool did_anything = false; char buf[256]; double ram_left = gstate.available_ram(); double max_ram = gstate.max_available_ram(); // Some slot dirs have lots of files, // so only check every min(disk_interval, 300) secs // double min_interval = gstate.global_prefs.disk_interval; if (min_interval < 300) min_interval = 300; if (gstate.clock_change || gstate.now > last_disk_check_time + min_interval) { do_disk_check = true; } for (i=0; i<active_tasks.size(); i++) { atp = active_tasks[i]; if (atp->task_state() != PROCESS_EXECUTING) continue; if (!atp->result->non_cpu_intensive() && (atp->elapsed_time > atp->max_elapsed_time)) { sprintf(buf, "exceeded elapsed time limit %.2f (%.2fG/%.2fG)", atp->max_elapsed_time, atp->result->wup->rsc_fpops_bound/1e9, atp->result->avp->flops/1e9 ); msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: %s", atp->result->name, buf ); atp->abort_task(EXIT_TIME_LIMIT_EXCEEDED, buf); did_anything = true; continue; } #if 0 // removing this for now because most projects currently // have too-low values of workunit.rsc_memory_bound // (causing lots of aborts) // and I don't think we can expect projects to provide // accurate bounds. 
// if (atp->procinfo.working_set_size_smoothed > atp->max_mem_usage) { sprintf(buf, "working set size > workunit.rsc_memory_bound: %.2fMB > %.2fMB", atp->procinfo.working_set_size_smoothed/MEGA, atp->max_mem_usage/MEGA ); msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: %s", atp->result->name, buf ); atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf); did_anything = true; continue; } #endif if (atp->procinfo.working_set_size_smoothed > max_ram) { sprintf(buf, "working set size > client RAM limit: %.2fMB > %.2fMB", atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA ); msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: %s", atp->result->name, buf ); atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf); did_anything = true; continue; } if (do_disk_check || atp->peak_disk_usage == 0) { if (atp->check_max_disk_exceeded()) { did_anything = true; continue; } } // don't count RAM usage of non-CPU-intensive jobs // if (!atp->result->non_cpu_intensive()) { ram_left -= atp->procinfo.working_set_size_smoothed; } } if (ram_left < 0) { gstate.request_schedule_cpus("RAM usage limit exceeded"); } if (do_disk_check) { last_disk_check_time = gstate.now; } return did_anything; }
// suspend all currently running tasks // e.g. because on batteries, time of day, benchmarking, CPU throttle, etc. // void ACTIVE_TASK_SET::suspend_all(int reason) { for (unsigned int i=0; i<active_tasks.size(); i++) { ACTIVE_TASK* atp = active_tasks[i]; // don't suspend if process doesn't exist, // or if quit/abort is pending. // If process is currently suspended, proceed; // the new suspension may require it to be removed from memory. // E.g. a GPU job may currently be suspended due to CPU throttling, // and therefore left in memory, // but this suspension (say, a user request) // might require it to be removed from memory. // switch (atp->task_state()) { case PROCESS_EXECUTING: case PROCESS_SUSPENDED: break; default: continue; } // handle CPU throttling separately // if (reason == SUSPEND_REASON_CPU_THROTTLE) { if (atp->result->dont_throttle()) continue; atp->preempt(REMOVE_NEVER, reason); continue; } #ifdef ANDROID // On Android, remove apps from memory if on batteries // no matter what the reason for suspension. // The message polling in the BOINC runtime system // imposes an overhead which drains the battery // if (gstate.host_info.host_is_running_on_batteries()) { atp->preempt(REMOVE_ALWAYS); continue; } #endif switch (reason) { case SUSPEND_REASON_BENCHMARKS: atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_CPU_USAGE: // If we're suspending because of non-BOINC CPU load, // don't remove from memory. // Some systems do a security check when apps are launched, // which uses a lot of CPU. // Avoid going into a preemption loop. // if (atp->result->non_cpu_intensive()) break; atp->preempt(REMOVE_NEVER); break; case SUSPEND_REASON_BATTERY_OVERHEATED: case SUSPEND_REASON_BATTERY_CHARGING: // these conditions can oscillate, so leave apps in mem // atp->preempt(REMOVE_NEVER); break; default: atp->preempt(REMOVE_MAYBE_USER); break; } } }