bool do_pass() { int retval; DB_TRANSITIONER_ITEM_SET transitioner; std::vector<TRANSITIONER_ITEM> items; bool did_something = false; if (!one_pass) check_stop_daemons(); // loop over entries that are due to be checked // while (1) { if (wu_id) { // kludge to tell enumerate to return a given WU mod_n = 1; mod_i = wu_id; } retval = transitioner.enumerate( (int)time(0), SELECT_LIMIT, mod_n, mod_i, items ); if (retval) { if (retval != ERR_DB_NOT_FOUND) { log_messages.printf(MSG_CRITICAL, "WU enum error: %s; exiting\n", boincerror(retval) ); exit(1); } break; } did_something = true; TRANSITIONER_ITEM& wu_item = items[0]; retval = handle_wu(transitioner, items); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] handle_wu: %s; quitting\n", wu_item.id, wu_item.name, boincerror(retval) ); // probably better to exit here. // Whatever cause this WU to fail (and it could be temporary) // might cause ALL WUs to fail // exit(1); } if (!one_pass) check_stop_daemons(); if (wu_id) break; } return did_something; }
bool do_pass() { int retval; DB_TRANSITIONER_ITEM_SET transitioner; std::vector<TRANSITIONER_ITEM> items; bool did_something = false; if (!one_pass) check_stop_daemons(); // loop over entries that are due to be checked // while (1) { retval = transitioner.enumerate( (int)time(0), SELECT_LIMIT, mod_n, mod_i, items ); if (retval) { if (retval != ERR_DB_NOT_FOUND) { log_messages.printf(MSG_CRITICAL, "WU enum error: %s; exiting\n", boincerror(retval) ); exit(1); } break; } did_something = true; TRANSITIONER_ITEM& wu_item = items[0]; retval = handle_wu(transitioner, items); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%d %s] handle_wu: %s; quitting\n", wu_item.id, wu_item.name, boincerror(retval) ); exit(1); } if (!one_pass) check_stop_daemons(); } return did_something; }
// Process one workunit (WU) and its results:
// - classify the results and time out overdue in-progress ones
// - flag WU-level error conditions in error_mask
// - on WU error, retire unsent results; otherwise create new results
//   if more are needed and no canonical result exists
// - advance assimilation and file-deletion states
// - compute the WU's next transition time and write the WU back
//
// transitioner: used to write back modified results (update_result())
//   and the modified workunit (update_workunit()).
// items: items[0] supplies the workunit fields; every element with a
//   nonzero res_id is treated as one result of that workunit.
//   NOTE(review): presumably these are the rows of a workunit/result
//   join produced by enumerate() - confirm against the caller.
//
// Returns 0 on success, or the nonzero error code from
// create_result_ti(), insert_batch() or update_workunit().
// Failures of update_result() are only logged; processing continues.
// Calls exit(1) if result_timed_out() fails.
//
int handle_wu(
    DB_TRANSITIONER_ITEM_SET& transitioner,
    std::vector<TRANSITIONER_ITEM>& items
) {
    int ntotal, nerrors, retval, ninprogress, nsuccess;
    int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply;
    int canonical_result_index, j;
    char suffix[256];
    time_t now = time(0), x;
    bool all_over_and_validated, have_new_result_to_validate, do_delete;
    unsigned int i;

    // wu_item aliases items[0] and is modified in place below;
    // wu_item_original is a copy taken before any modification and is
    // passed to update_workunit() together with the modified item.
    //
    TRANSITIONER_ITEM& wu_item = items[0];
    TRANSITIONER_ITEM wu_item_original = wu_item;

    // count up the number of results in various states,
    // and check for timed-out results
    //
    ntotal = 0;
    nunsent = 0;    // including INACTIVE
    ninprogress = 0;
    nover = 0;
    nerrors = 0;
    nsuccess = 0;
        // not counting invalid results!!!!
    ncouldnt_send = 0;
    nno_reply = 0;
    ndidnt_need = 0;
    have_new_result_to_validate = false;
    int rs, max_result_suffix = -1;

    // Scan the WU's results, and find the canonical result if there is one
    // (canonical_result_index stays -1 if there is none or it isn't found)
    //
    canonical_result_index = -1;
    if (wu_item.canonical_resultid) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            if (res_item.res_id == wu_item.canonical_resultid) {
                canonical_result_index = i;
            }
        }
    }

    // a canonical result ID that matches no item is logged, not treated as fatal
    //
    if (wu_item.canonical_resultid && (canonical_result_index == -1)) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%u %s] can't find canonical result\n",
            wu_item.id, wu_item.name
        );
    }

    // if there is a canonical result, see if its files are deleted
    //
    bool canonical_result_files_deleted = false;
    if (canonical_result_index >= 0) {
        TRANSITIONER_ITEM& cr = items[canonical_result_index];
        if (cr.res_file_delete_state == FILE_DELETE_DONE) {
            canonical_result_files_deleted = true;
        }
    }

    // Scan this WU's results, and
    // 1) count those in various server states;
    // 2) identify timed-out results and update their server state and outcome
    // 3) find the max result suffix (in case need to generate new ones)
    // 4) see if we have a new result to validate
    //    (outcome SUCCESS and validate_state INIT)
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];

        if (!res_item.res_id) continue;
        ntotal++;

        rs = result_suffix(res_item.res_name);
        if (rs > max_result_suffix) max_result_suffix = rs;

        switch (res_item.res_server_state) {
        case RESULT_SERVER_STATE_INACTIVE:
        case RESULT_SERVER_STATE_UNSENT:
            nunsent++;
            break;
        case RESULT_SERVER_STATE_IN_PROGRESS:
            if (res_item.res_report_deadline < now) {
                // deadline has passed: mark the result OVER / NO_REPLY
                //
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n",
                    wu_item.id, wu_item.name, res_item.res_id,
                    res_item.res_name,
                    res_item.res_report_deadline, (int)now
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_NO_REPLY;
                retval = transitioner.update_result(res_item);
                if (retval) {
                    // logged only; the result is still counted as timed out
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] [RESULT#%u %s] update_result(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
                retval = result_timed_out(res_item, wu_item);
                if (retval) {
                    // unlike update_result(), a failure here is fatal
                    log_messages.printf(MSG_CRITICAL,
                        "result_timed_out() error: %s\n", boincerror(retval)
                    );
                    exit(1);
                }
                nover++;
                nno_reply++;
            } else {
                ninprogress++;
            }
            break;
        case RESULT_SERVER_STATE_OVER:
            nover++;
            switch (res_item.res_outcome) {
            case RESULT_OUTCOME_COULDNT_SEND:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] result couldn't be sent\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                ncouldnt_send++;
                break;
            case RESULT_OUTCOME_SUCCESS:
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    if (canonical_result_files_deleted) {
                        // the canonical result's files are gone,
                        // so mark this result TOO_LATE instead of
                        // requesting validation
                        //
                        res_item.res_validate_state = VALIDATE_STATE_TOO_LATE;
                        retval = transitioner.update_result(res_item);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%u %s] [RESULT#%u %s] update_result(): %s\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name, boincerror(retval)
                            );
                        } else {
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%u %s] [RESULT#%u %s] validate_state:INIT=>TOO_LATE\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name
                            );
                        }
                    } else {
                        have_new_result_to_validate = true;
                    }
                }
                // don't count invalid results as successful
                //
                if (res_item.res_validate_state != VALIDATE_STATE_INVALID) {
                    nsuccess++;
                }
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
                // if user aborted job, don't count it as an error
                //
                if (res_item.res_exit_status == EXIT_ABORTED_VIA_GUI) {
                    nno_reply++;
                } else {
                    nerrors++;
                }
                break;
            case RESULT_OUTCOME_VALIDATE_ERROR:
                nerrors++;
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                nno_reply++;
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                ndidnt_need++;
                break;
            }
            break;
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%u %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n",
        wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover,
        nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need
    );

    // if there's a new result to validate, trigger validation
    // (only once at least min_quorum results count as successful)
    //
    if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) {
        wu_item.need_validate = true;
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] need_validate:=>true\n", wu_item.id, wu_item.name
        );
    }

    // check for WU error conditions
    // NOTE: check on max # of success results is done in validater
    //
    if (ncouldnt_send > 0) {
        wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
    }

    // if WU has results with errors and there are no results that are
    // - successful
    // - in progress
    // - timed out (but could still be returned)
    // reset homogeneous redundancy class to give other platforms a try;
    // also reset app version ID if using HAV
    // (skipped when config.hr_class_static is set)
    //
    if (nerrors && !(nsuccess || ninprogress || nno_reply)) {
        if (!config.hr_class_static) {
            wu_item.hr_class = 0;
            wu_item.app_version_id = 0;
        }
    }

    if (nerrors > wu_item.max_error_results) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] WU has too many errors (%d errors for %d results)\n",
            wu_item.id, wu_item.name, nerrors, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
    }

    // see how many new results we need to make:
    // needed = target minus (unsent + in progress + success), floored at 0;
    // allowed = max_total_results minus results that already exist
    //
    int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess;
    if (n_new_results_needed < 0) n_new_results_needed = 0;
    int n_new_results_allowed = wu_item.max_total_results - ntotal;

    // if we're already at the limit and need more, error out the WU;
    // if we're under the limit, cap the number needed at the number allowed
    //
    bool too_many = false;
    if (n_new_results_allowed < 0) {
        too_many = true;
    } else if (n_new_results_allowed == 0) {
        if (n_new_results_needed > 0) {
            too_many = true;
        }
    } else {
        if (n_new_results_needed > n_new_results_allowed) {
            n_new_results_needed = n_new_results_allowed;
        }
    }
    if (too_many) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] WU has too many total results (%d)\n",
            wu_item.id, wu_item.name, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS;
    }

    // if this WU had an error, don't send any unsent results,
    // and trigger assimilation if needed
    //
    if (wu_item.error_mask) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            // set when this result was changed and must be written back
            bool update_result = false;
            switch(res_item.res_server_state) {
            case RESULT_SERVER_STATE_INACTIVE:
            case RESULT_SERVER_STATE_UNSENT:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] server_state:UNSENT=>OVER; outcome:=>DIDNT_NEED\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED;
                update_result = true;
                break;
            case RESULT_SERVER_STATE_OVER:
                // successful results still in INIT or INCONCLUSIVE
                // validate state are switched to NO_CHECK
                //
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_SUCCESS:
                    switch(res_item.res_validate_state) {
                    case VALIDATE_STATE_INIT:
                    case VALIDATE_STATE_INCONCLUSIVE:
                        res_item.res_validate_state = VALIDATE_STATE_NO_CHECK;
                        update_result = true;
                        break;
                    }
                }
            }
            if (update_result) {
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] [RESULT#%u %s] result.update(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval)
                    );
                }
            }
        }
        if (wu_item.assimilate_state == ASSIMILATE_INIT) {
            wu_item.assimilate_state = ASSIMILATE_READY;
            log_messages.printf(MSG_NORMAL,
                "[WU#%u %s] error_mask:%d assimilate_state:INIT=>READY\n",
                wu_item.id, wu_item.name, wu_item.error_mask
            );
        }
    } else if (wu_item.canonical_resultid == 0) {
        // Here if no WU-level error and no canonical result.
        // Generate new results if needed.
        //
        std::string values;
        char value_buf[MAX_QUERY_LEN];
        if (wu_item.transitioner_flags != TRANSITION_NO_NEW_RESULTS
            && n_new_results_needed > 0
        ) {
            log_messages.printf(
                MSG_NORMAL,
                "[WU#%u %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n",
                wu_item.id, wu_item.name, n_new_results_needed,
                wu_item.target_nresults, nunsent, ninprogress, nsuccess
            );
            for (j=0; j<n_new_results_needed; j++) {
                // new suffixes continue after the largest existing one
                sprintf(suffix, "%d", max_result_suffix+j+1);
                const char *rtfpath = config.project_path("%s", wu_item.result_template_file);
                // raise priority if some results are already over;
                // the "except_error" setting applies only when the first
                // setting is zero and there are no error results
                //
                int priority_increase = 0;
                if (nover && config.reliable_priority_on_over) {
                    priority_increase += config.reliable_priority_on_over;
                } else if (nover && !nerrors && config.reliable_priority_on_over_except_error) {
                    priority_increase += config.reliable_priority_on_over_except_error;
                }
                retval = create_result_ti(
                    wu_item, (char *)rtfpath, suffix, key, config, value_buf, priority_increase
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] create_result_ti(): %s\n",
                        wu_item.id, wu_item.name, boincerror(retval)
                    );
                    return retval;
                }
                // accumulate the per-result text as a comma-separated list
                // for a single insert_batch() call
                //
                if (j==0) {
                    values = value_buf;
                } else {
                    values += ",";
                    values += value_buf;
                }
            }
            DB_RESULT r;
            retval = r.insert_batch(values);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%u %s] insert_batch(): %s\n",
                    wu_item.id, wu_item.name, boincerror(retval)
                );
                return retval;
            }
        }
    }

    // scan results:
    // - see if all over and validated
    // - see if all over and ready to assimilate
    // - find the latest received time among results that are over
    //
    all_over_and_validated = true;
    bool all_over_and_ready_to_assimilate = true;
        // used for the defer assimilation
    double most_recently_returned = 0;
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_OVER) {
            if (res_item.res_received_time > most_recently_returned) {
                most_recently_returned = res_item.res_received_time;
            }
            if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) {
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    all_over_and_validated = false;
                    all_over_and_ready_to_assimilate = false;
                }
            } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) {
                // a NO_REPLY result whose deadline hasn't passed yet
                // blocks "validated" but not "ready to assimilate"
                //
                if (now < res_item.res_report_deadline) {
                    all_over_and_validated = false;
                }
            }
        } else {
            all_over_and_validated = false;
            all_over_and_ready_to_assimilate = false;
        }
    }

    // If we're deferring assimilation until all results are over and validated,
    // when that happens make sure that WU state is advanced to assimilate ready
    // the items.size is a kludge
    //
    if (all_over_and_ready_to_assimilate
        && wu_item.assimilate_state == ASSIMILATE_INIT
        && items.size() > 0
        && wu_item.canonical_resultid > 0
    ) {
        wu_item.assimilate_state = ASSIMILATE_READY;
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n",
            wu_item.id, wu_item.name
        );
    }

    // if WU is assimilated, trigger file deletion
    // (or defer it until config.delete_delay has elapsed since
    // the most recently returned result)
    //
    double deferred_file_delete_time = 0;
    if (wu_item.assimilate_state == ASSIMILATE_DONE) {
        if (now >= (most_recently_returned + config.delete_delay)) {
            // can delete input files if all results OVER
            //
            if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) {
                wu_item.file_delete_state = FILE_DELETE_READY;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%u %s] ASSIMILATE_DONE: file_delete_state:=>READY\n",
                    wu_item.id, wu_item.name
                );
            }

            // output of error results can be deleted immediately;
            // output of success results can be deleted if validated
            //
            for (i=0; i<items.size(); i++) {
                TRANSITIONER_ITEM& res_item = items[i];

                // can delete canonical result outputs only if all successful
                // results have been validated
                //
                if (((int)i == canonical_result_index) && !all_over_and_validated) {
                    continue;
                }

                if (!res_item.res_id) continue;
                do_delete = false;
                switch(res_item.res_outcome) {
                case RESULT_OUTCOME_CLIENT_ERROR:
                    do_delete = true;
                    break;
                case RESULT_OUTCOME_SUCCESS:
                    do_delete = (res_item.res_validate_state != VALIDATE_STATE_INIT);
                    break;
                }
                if (do_delete && res_item.res_file_delete_state == FILE_DELETE_INIT) {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%u %s] [RESULT#%u %s] file_delete_state:=>READY\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                    );
                    res_item.res_file_delete_state = FILE_DELETE_READY;

                    retval = transitioner.update_result(res_item);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%u %s] [RESULT#%u %s] result.update(): %s\n",
                            wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval)
                        );
                    }
                }
            }
        } else {
            // too early: remember when deletion becomes possible so it can
            // bound the next transition time below
            //
            deferred_file_delete_time = most_recently_returned + config.delete_delay;
            log_messages.printf(MSG_DEBUG,
                "[WU#%u %s] deferring file deletion for %.0f seconds\n",
                wu_item.id,
                wu_item.name,
                deferred_file_delete_time - now
            );
        }
    }

    // Compute next transition time.
    // This is the min of
    // - timeouts of in-progress results
    // - deferred file deletion time
    // - safety net
    //
    // It is then adjusted to deal with transitioner congestion
    //
    if (wu_item.canonical_resultid || wu_item.error_mask) {
        // no safety net needed: start from "never"
        wu_item.transition_time = INT_MAX;
    } else {
        // Safety net: if there is no canonical result and no WU-level error,
        // make sure that the transitioner will process this WU again.
        // In principle this is not needed,
        // but it makes the BOINC back-end more robust.
        // The delay is the larger of ten days and 1.5 * delay_bound.
        //
        const int ten_days = 10*86400;
        int long_delay = (int)(1.5*wu_item.delay_bound);
        wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days;
        wu_item.transition_time += time(0);
    }

    // handle timeout of in-progress results:
    // pull the transition time in to the earliest report deadline
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
            x = res_item.res_report_deadline;
            if (x < wu_item.transition_time) {
                wu_item.transition_time = x;
            }
        }
    }

    // handle deferred file deletion
    // (deferred_file_delete_time is 0 when no deletion was deferred)
    //
    if (deferred_file_delete_time
        && deferred_file_delete_time < wu_item.transition_time
    ) {
        wu_item.transition_time = (int)deferred_file_delete_time;
    }

    // Handle transitioner overload.
    // If transition time is in the past,
    // the system is bogged down and behind schedule.
    // Delay processing of the WU by an amount DOUBLE the amount we are behind,
    // but not less than 60 secs or more than one day.
    //
    if (wu_item.transition_time < now) {
        int extra_delay = 2*(now - wu_item.transition_time);
        if (extra_delay < 60) extra_delay = 60;
        if (extra_delay > 86400) extra_delay = 86400;
        log_messages.printf(MSG_DEBUG,
            "[WU#%u %s] transition time in past: adding extra delay %d sec\n",
            wu_item.id, wu_item.name, extra_delay
        );
        wu_item.transition_time = now + extra_delay;
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%u %s] setting transition_time to %d\n",
        wu_item.id, wu_item.name, wu_item.transition_time
    );

    // write the modified WU back, passing the unmodified copy alongside it
    //
    retval = transitioner.update_workunit(wu_item, wu_item_original);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%u %s] workunit.update(): %s\n",
            wu_item.id, wu_item.name, boincerror(retval)
        );
        return retval;
    }
    return 0;
}