// Process one workunit and its results: items[0] is the WU row, the
// remaining entries are its results (entries with res_id == 0 are skipped).
// Counts result states, times out overdue results, triggers validation /
// assimilation / file deletion, generates replacement results if needed,
// and computes the WU's next transition time.
// Returns 0 on success (including the "assigned WU" early-out), or a
// BOINC error code if result creation or the final WU update fails.
//
int handle_wu(
    DB_TRANSITIONER_ITEM_SET& transitioner,
    std::vector<TRANSITIONER_ITEM>& items
) {
    int ntotal, nerrors, retval, ninprogress, nsuccess;
    int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply;
    int canonical_result_index, j;
    char suffix[256];
    time_t now = time(0), x;
    bool all_over_and_validated, have_new_result_to_validate, do_delete;
    unsigned int i;

    TRANSITIONER_ITEM& wu_item = items[0];
    // keep a pristine copy so update_workunit() can diff old vs. new state
    TRANSITIONER_ITEM wu_item_original = wu_item;

    // "assigned" WUs aren't supposed to be handled by the transitioner.
    // If we get one, it's an error
    //
    if (config.enable_assignment && strstr(wu_item.name, ASSIGNED_WU_STR)) {
        DB_WORKUNIT wu;
        char buf[256];
        wu.id = wu_item.id;
        log_messages.printf(MSG_CRITICAL,
            "Assigned WU %d unexpectedly found by transitioner\n", wu.id
        );
        // push its transition time far into the future so we stop seeing it
        sprintf(buf, "transition_time=%d", INT_MAX);
        retval = wu.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "update_field failed: %s\n", boincerror(retval)
            );
        }
        return 0;
    }

    // count up the number of results in various states,
    // and check for timed-out results
    //
    ntotal = 0;
    nunsent = 0;
    ninprogress = 0;
    nover = 0;
    nerrors = 0;
    nsuccess = 0;
        // not counting invalid results!!!!
    ncouldnt_send = 0;
    nno_reply = 0;
    ndidnt_need = 0;
    have_new_result_to_validate = false;
    int rs, max_result_suffix = -1;

    // Scan the WU's results, and find the canonical result if there is one
    //
    canonical_result_index = -1;
    if (wu_item.canonical_resultid) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            if (res_item.res_id == wu_item.canonical_resultid) {
                canonical_result_index = i;
            }
        }
    }
    if (wu_item.canonical_resultid && (canonical_result_index == -1)) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't find canonical result\n",
            wu_item.id, wu_item.name
        );
    }

    // if there is a canonical result, see if its files are deleted
    //
    bool canonical_result_files_deleted = false;
    if (canonical_result_index >= 0) {
        TRANSITIONER_ITEM& cr = items[canonical_result_index];
        if (cr.res_file_delete_state == FILE_DELETE_DONE) {
            canonical_result_files_deleted = true;
        }
    }

    // Scan this WU's results, and
    // 1) count those in various server states;
    // 2) identify timed-out results and update their server state and outcome
    // 3) find the max result suffix (in case need to generate new ones)
    // 4) see if we have a new result to validate
    //    (outcome SUCCESS and validate_state INIT)
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        ntotal++;
        rs = result_suffix(res_item.res_name);
        if (rs > max_result_suffix) max_result_suffix = rs;
        switch (res_item.res_server_state) {
        case RESULT_SERVER_STATE_UNSENT:
            nunsent++;
            break;
        case RESULT_SERVER_STATE_IN_PROGRESS:
            if (res_item.res_report_deadline < now) {
                // deadline passed: mark the result OVER / NO_REPLY
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n",
                    wu_item.id, wu_item.name, res_item.res_id,
                    res_item.res_name, res_item.res_report_deadline,
                    (int)now
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_NO_REPLY;
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
                retval = result_timed_out(res_item, wu_item);
                if (retval) {
                    // a timeout we can't record is fatal for the daemon
                    log_messages.printf(MSG_CRITICAL,
                        "result_timed_out() error: %s\n", boincerror(retval)
                    );
                    exit(1);
                }
                nover++;
                nno_reply++;
            } else {
                ninprogress++;
            }
            break;
        case RESULT_SERVER_STATE_OVER:
            nover++;
            switch (res_item.res_outcome) {
            case RESULT_OUTCOME_COULDNT_SEND:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result couldn't be sent\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                ncouldnt_send++;
                break;
            case RESULT_OUTCOME_SUCCESS:
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    if (canonical_result_files_deleted) {
                        // canonical output is gone; too late to validate this one
                        res_item.res_validate_state = VALIDATE_STATE_TOO_LATE;
                        retval = transitioner.update_result(res_item);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name, boincerror(retval)
                            );
                        } else {
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%d %s] [RESULT#%d %s] validate_state:INIT=>TOO_LATE\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name
                            );
                        }
                    } else {
                        have_new_result_to_validate = true;
                    }
                }
                // don't count invalid results as successful
                //
                if (res_item.res_validate_state != VALIDATE_STATE_INVALID) {
                    nsuccess++;
                }
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
            case RESULT_OUTCOME_VALIDATE_ERROR:
                nerrors++;
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                nno_reply++;
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                ndidnt_need++;
                break;
            }
            break;
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n",
        wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover,
        nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need
    );

    // if there's a new result to validate, trigger validation
    //
    if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) {
        wu_item.need_validate = true;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] need_validate:=>true\n", wu_item.id, wu_item.name
        );
    }

    // check for WU error conditions
    // NOTE: check on max # of success results is done in validator
    //
    if (ncouldnt_send > 0) {
        wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
    }

    // if WU has results with errors and no success yet,
    // reset homogeneous redundancy class to give other platforms a try;
    // also reset app version ID if using HAV
    //
    if (nerrors && !(nsuccess || ninprogress)) {
        wu_item.hr_class = 0;
        wu_item.app_version_id = 0;
    }

    if (nerrors > wu_item.max_error_results) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many errors (%d errors for %d results)\n",
            wu_item.id, wu_item.name, nerrors, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
    }

    // see how many new results we need to make
    //
    int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess;
    if (n_new_results_needed < 0) n_new_results_needed = 0;
    int n_new_results_allowed = wu_item.max_total_results - ntotal;

    // if we're already at the limit and need more, error out the WU
    //
    bool too_many = false;
    if (n_new_results_allowed < 0) {
        too_many = true;
    } else if (n_new_results_allowed == 0) {
        if (n_new_results_needed > 0) {
            too_many = true;
        }
    } else {
        // clamp the number generated to what the limit still allows
        if (n_new_results_needed > n_new_results_allowed) {
            n_new_results_needed = n_new_results_allowed;
        }
    }
    if (too_many) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many total results (%d)\n",
            wu_item.id, wu_item.name, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS;
    }

    // if this WU had an error, don't send any unsent results,
    // and trigger assimilation if needed
    //
    if (wu_item.error_mask) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            bool update_result = false;
            switch(res_item.res_server_state) {
            case RESULT_SERVER_STATE_UNSENT:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] server_state:UNSENT=>OVER; outcome:=>DIDNT_NEED\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED;
                update_result = true;
                break;
            case RESULT_SERVER_STATE_OVER:
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_SUCCESS:
                    // pending validations are pointless now; skip the check
                    switch(res_item.res_validate_state) {
                    case VALIDATE_STATE_INIT:
                    case VALIDATE_STATE_INCONCLUSIVE:
                        res_item.res_validate_state = VALIDATE_STATE_NO_CHECK;
                        update_result = true;
                        break;
                    }
                }
            }
            if (update_result) {
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
            }
        }
        if (wu_item.assimilate_state == ASSIMILATE_INIT) {
            wu_item.assimilate_state = ASSIMILATE_READY;
            log_messages.printf(MSG_NORMAL,
                "[WU#%d %s] error_mask:%d assimilate_state:INIT=>READY\n",
                wu_item.id, wu_item.name, wu_item.error_mask
            );
        }
    } else if (wu_item.canonical_resultid == 0) {
        // Here if no WU-level error.
        // Generate new results if needed.
        //
        std::string values;
        char value_buf[MAX_QUERY_LEN];
        if (n_new_results_needed > 0) {
            log_messages.printf(
                MSG_NORMAL,
                "[WU#%d %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n",
                wu_item.id, wu_item.name, n_new_results_needed,
                wu_item.target_nresults, nunsent, ninprogress, nsuccess
            );
            for (j=0; j<n_new_results_needed; j++) {
                // new results get suffixes above the current maximum
                sprintf(suffix, "%d", max_result_suffix+j+1);
                const char *rtfpath = config.project_path("%s", wu_item.result_template_file);
                int priority_increase = 0;
                if (nover && config.reliable_priority_on_over) {
                    priority_increase += config.reliable_priority_on_over;
                } else if (nover && !nerrors && config.reliable_priority_on_over_except_error) {
                    priority_increase += config.reliable_priority_on_over_except_error;
                }
                retval = create_result_ti(
                    wu_item, (char *)rtfpath, suffix, key, config, value_buf, priority_increase
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] create_result_ti(): %s\n",
                        wu_item.id, wu_item.name, boincerror(retval)
                    );
                    return retval;
                }
                // accumulate comma-separated VALUES for one batched INSERT
                if (j==0) {
                    values = value_buf;
                } else {
                    values += ",";
                    values += value_buf;
                }
            }
            DB_RESULT r;
            retval = r.insert_batch(values);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] insert_batch(): %s\n",
                    wu_item.id, wu_item.name, boincerror(retval)
                );
                return retval;
            }
        }
    }

    // scan results:
    // - see if all over and validated
    //
    all_over_and_validated = true;
    bool all_over_and_ready_to_assimilate = true;
        // used for the defer assimilation
    double most_recently_returned = 0;
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_OVER) {
            if (res_item.res_received_time > most_recently_returned) {
                most_recently_returned = res_item.res_received_time;
            }
            if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) {
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    all_over_and_validated = false;
                    all_over_and_ready_to_assimilate = false;
                }
            } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) {
                if (now < res_item.res_report_deadline) {
                    all_over_and_validated = false;
                }
            }
        } else {
            all_over_and_validated = false;
            all_over_and_ready_to_assimilate = false;
        }
    }

    // If we are deferring assimilation until all results are over and validated,
    // when that happens make sure that WU state is advanced to assimilate ready
    // the items.size is a kludge
    //
    if (all_over_and_ready_to_assimilate
        && wu_item.assimilate_state == ASSIMILATE_INIT
        && items.size() > 0
        && wu_item.canonical_resultid > 0
    ) {
        wu_item.assimilate_state = ASSIMILATE_READY;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n",
            wu_item.id, wu_item.name
        );
    }

    // if WU is assimilated, trigger file deletion
    //
    double deferred_file_delete_time = 0;
    if (wu_item.assimilate_state == ASSIMILATE_DONE) {
        if (now >= (most_recently_returned + config.delete_delay)) {
            // can delete input files if all results OVER
            //
            if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) {
                wu_item.file_delete_state = FILE_DELETE_READY;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%d %s] ASSIMILATE_DONE: file_delete_state:=>READY\n",
                    wu_item.id, wu_item.name
                );
            }
            // output of error results can be deleted immediately;
            // output of success results can be deleted if validated
            //
            for (i=0; i<items.size(); i++) {
                TRANSITIONER_ITEM& res_item = items[i];

                // can delete canonical result outputs only if all successful
                // results have been validated
                //
                if (((int)i == canonical_result_index) && !all_over_and_validated) {
                    continue;
                }

                if (!res_item.res_id) continue;
                do_delete = false;
                switch(res_item.res_outcome) {
                case RESULT_OUTCOME_CLIENT_ERROR:
                    do_delete = true;
                    break;
                case RESULT_OUTCOME_SUCCESS:
                    do_delete = (res_item.res_validate_state != VALIDATE_STATE_INIT);
                    break;
                }
                if (do_delete && res_item.res_file_delete_state == FILE_DELETE_INIT) {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%d %s] [RESULT#%d %s] file_delete_state:=>READY\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                    );
                    res_item.res_file_delete_state = FILE_DELETE_READY;
                    retval = transitioner.update_result(res_item);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                            wu_item.id, wu_item.name, res_item.res_id,
                            res_item.res_name, boincerror(retval)
                        );
                    }
                }
            }
        } else {
            // too soon after the last return; revisit at this time
            deferred_file_delete_time = most_recently_returned + config.delete_delay;
            log_messages.printf(MSG_DEBUG,
                "[WU#%d %s] deferring file deletion for %.0f seconds\n",
                wu_item.id, wu_item.name, deferred_file_delete_time - now
            );
        }
    }

    // Compute next transition time.
    // This is the min of
    //  - timeouts of in-progress results
    //  - deferred file deletion time
    //  - safety net
    //
    // It is then adjusted to deal with transitioner congestion
    //
    if (wu_item.canonical_resultid || wu_item.error_mask) {
        wu_item.transition_time = INT_MAX;
    } else {
        // Safety net: if there is no canonical result and no WU-level error,
        // make sure that the transitioner will process this WU again.
        // In principle this is not needed,
        // but it makes the BOINC back-end more robust.
        //
        const int ten_days = 10*86400;
        int long_delay = (int)(1.5*wu_item.delay_bound);
        wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days;
        wu_item.transition_time += time(0);
    }

    // handle timeout of in-progress results
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
            x = res_item.res_report_deadline;
            if (x < wu_item.transition_time) {
                wu_item.transition_time = x;
            }
        }
    }

    // handle deferred file deletion
    //
    if (deferred_file_delete_time
        && deferred_file_delete_time < wu_item.transition_time
    ) {
        wu_item.transition_time = deferred_file_delete_time;
    }

    // Handle transitioner overload.
    // If transition time is in the past,
    // the system is bogged down and behind schedule.
    // Delay processing of the WU by an amount DOUBLE the amount we are behind,
    // but not less than 60 secs or more than one day.
    //
    if (wu_item.transition_time < now) {
        int extra_delay = 2*(now - wu_item.transition_time);
        if (extra_delay < 60) extra_delay = 60;
        if (extra_delay > 86400) extra_delay = 86400;
        log_messages.printf(MSG_DEBUG,
            "[WU#%d %s] transition time in past: adding extra delay %d sec\n",
            wu_item.id, wu_item.name, extra_delay
        );
        wu_item.transition_time = now + extra_delay;
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] setting transition_time to %d\n",
        wu_item.id, wu_item.name, wu_item.transition_time
    );

    // write back all accumulated WU changes in one update
    retval = transitioner.update_workunit(wu_item, wu_item_original);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] workunit.update(): %s\n",
            wu_item.id, wu_item.name, boincerror(retval)
        );
        return retval;
    }
    return 0;
}
// DAVID, this is missing a return value! Am I right that this will // also eventually move 'non locality' work through and out of the // system? // // This looks for work created in the range t_min < t < t_max. Use // t_min=INT_MIN if you wish to leave off the left constraint. // static int send_old_work(int t_min, int t_max) { char buf[1024], filename[256]; int retval, extract_retval, nsent; DB_RESULT result; int now=time(0); if (!work_needed(true)) { return 0; } boinc_db.start_transaction(); if (t_min != INT_MIN) { sprintf(buf, "where server_state=%d and %d<create_time and create_time<%d limit 1", RESULT_SERVER_STATE_UNSENT, t_min, t_max ); } else { sprintf(buf, "where server_state=%d and create_time<%d limit 1", RESULT_SERVER_STATE_UNSENT, t_max ); } retval = result.lookup(buf); if (!retval) { retval = possibly_send_result(result); boinc_db.commit_transaction(); if (!retval) { double age=(now-result.create_time)/3600.0; if (config.debug_locality) { log_messages.printf(MSG_NORMAL, "[locality] send_old_work(%s) sent result created %.1f hours ago [RESULT#%d]\n", result.name, age, result.id ); } extract_retval=extract_filename(result.name, filename); if (!extract_retval) { send_results_for_file(filename, nsent, false); } else { // David, is this right? Is this the only place in // the locality scheduler that non-locality work // // gets done? if (config.debug_locality) { log_messages.printf(MSG_NORMAL, "[locality] Note: sent NON-LOCALITY result %s\n", result.name ); } } } else if (retval == ERR_NO_APP_VERSION || retval==ERR_INSUFFICIENT_RESOURCE) { // if no app version found or no resources, give up completely! 
return retval; } } else { boinc_db.commit_transaction(); } if (retval) { double older=(now-t_max)/3600.0; if (t_min != INT_MIN) { double young=(now-t_min)/3600.0; if (config.debug_locality) { log_messages.printf(MSG_NORMAL, "[locality] send_old_work() no feasible result younger than %.1f hours and older than %.1f hours\n", young, older ); } } else { if (config.debug_locality) { log_messages.printf(MSG_NORMAL, "[locality] send_old_work() no feasible result older than %.1f hours\n", older ); } } } // DAVID, YOU CHANGED THIS FROM VOID TO INT. IS THIS THE RIGHT // RETURN VAL? You should probably use the return value from // sent_results_for_file as well. return retval; }