// cancel a particular job
//
int cancel_job(DB_WORKUNIT& wu) {
    DB_RESULT result;
    char set_clause[256], where_clause[256];
    int retval;

    // cancel unsent results
    //
    sprintf(set_clause, "server_state=%d, outcome=%d",
        RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_DIDNT_NEED
    );
    sprintf(where_clause, "server_state<=%d and workunitid=%lu",
        RESULT_SERVER_STATE_UNSENT, wu.id
    );
    retval = result.update_fields_noid(set_clause, where_clause);
    if (retval) return retval;

    // cancel the workunit
    //
    sprintf(set_clause, "error_mask=error_mask|%d, transition_time=%d",
        WU_ERROR_CANCELLED, (int)(time(0))
    );
    retval = wu.update_field(set_clause);
    if (retval) return retval;
    return 0;
}
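// Hypothetical convenience wrapper: cancel a job given its name rather than
// a DB_WORKUNIT object. A minimal sketch assuming the usual
// DB_WORKUNIT::lookup() clause interface; the helper name is invented here.
int cancel_job_by_name(const char* wu_name) {
    DB_WORKUNIT wu;
    char buf[256];
    sprintf(buf, "where name='%s'", wu_name);
    int retval = wu.lookup(buf);
    if (retval) return retval;      // not found or DB error
    return cancel_job(wu);
}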
// Called when there's evidence that the host has detached.
// Mark in-progress results for the given host
// as server state OVER, outcome CLIENT_DETACHED.
// This serves two purposes:
// 1) make sure we don't resend these results to the host
//    (they may be the reason the user detached)
// 2) trigger the generation of new results for these WUs
//
static void mark_results_over(DB_HOST& host) {
    char buf[256], buf2[256];
    DB_RESULT result;
    sprintf(buf, "where hostid=%d and server_state=%d",
        host.id,
        RESULT_SERVER_STATE_IN_PROGRESS
    );
    while (!result.enumerate(buf)) {
        sprintf(buf2,
            "server_state=%d, outcome=%d, received_time = %ld",
            RESULT_SERVER_STATE_OVER,
            RESULT_OUTCOME_CLIENT_DETACHED,
            time(0)
        );
        result.update_field(buf2);

        // and trigger WU transition
        //
        DB_WORKUNIT wu;
        wu.id = result.workunitid;
        sprintf(buf2, "transition_time=%d", (int)time(0));
        wu.update_field(buf2);

        log_messages.printf(MSG_CRITICAL,
            "[HOST#%d] [RESULT#%u] [WU#%u] marking in-progress result %s as CLIENT_DETACHED\n",
            host.id, result.id, result.workunitid, result.name
        );
    }
}
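// Hypothetical caller sketch: given a host ID (for example taken from a
// scheduler request), look up the host record and retire its in-progress
// results. The function name is invented for illustration.
void detach_host_results(int hostid) {
    DB_HOST host;
    if (!host.lookup_id(hostid)) {
        mark_results_over(host);
    }
}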
// We're purging this item because it's been in shared mem too long.
// In general it will get added again soon.
// But if it's committed to an HR class,
// it could be because it got sent to a rare host.
// Un-commit it by zeroing out the WU's hr class,
// and incrementing target_nresults
//
static void purge_stale(WU_RESULT& wu_result) {
    DB_WORKUNIT wu;
    wu.id = wu_result.workunit.id;
    if (wu_result.workunit.hr_class) {
        char buf[256];
        sprintf(buf,
            "hr_class=0, target_nresults=target_nresults+1, transition_time=%ld",
            time(0)
        );
        wu.update_field(buf);
    }
}
void JOB_DESC::create() {
    char buf[256];
    int retval = create_work2(
        wu,
        wu_template,
        result_template_file,
        result_template_path,
        infiles,
        config,
        command_line,
        additional_xml
    );
    if (retval) {
        fprintf(stderr, "create_work: %s\n", boincerror(retval));
        exit(1);
    }
    if (assign_flag) {
        DB_ASSIGNMENT assignment;
        assignment.clear();
        assignment.create_time = time(0);
        assignment.target_id = assign_id;
        assignment.target_type = assign_type;
        assignment.multi = assign_multi;
        assignment.workunitid = wu.id;
        retval = assignment.insert();
        if (retval) {
            fprintf(stderr, "assignment.insert() failed: %s\n",
                boincerror(retval)
            );
            exit(1);
        }
        sprintf(buf, "transitioner_flags=%d",
            assign_multi?TRANSITION_NONE:TRANSITION_NO_NEW_RESULTS
        );
        retval = wu.update_field(buf);
        if (retval) {
            fprintf(stderr, "wu.update() failed: %s\n", boincerror(retval));
            exit(1);
        }
    }
}
// Arrange that further results for this workunit
// will be sent only to hosts with the given user ID.
// This could be used, for example, so that late workunits
// are sent only to cloud or cluster resources
//
int restrict_wu_to_user(WORKUNIT& _wu, int userid) {
    DB_RESULT result;
    DB_ASSIGNMENT asg;
    DB_WORKUNIT wu;
    wu = _wu;
    char buf[256];
    int retval;

    // mark unsent results as DIDNT_NEED
    //
    sprintf(buf, "where workunitid=%d and server_state=%d",
        wu.id, RESULT_SERVER_STATE_UNSENT
    );
    while (!result.enumerate(buf)) {
        char buf2[256];
        sprintf(buf2, "server_state=%d, outcome=%d",
            RESULT_SERVER_STATE_OVER,
            RESULT_OUTCOME_DIDNT_NEED
        );
        result.update_field(buf2);
    }

    // mark the WU as TRANSITION_NO_NEW_RESULTS
    //
    sprintf(buf, "transitioner_flags=%d", TRANSITION_NO_NEW_RESULTS);
    retval = wu.update_field(buf);
    if (retval) return retval;

    // create an assignment record
    //
    asg.clear();
    asg.create_time = time(0);
    asg.target_id = userid;
    asg.target_type = ASSIGN_USER;
    asg.multi = 0;
    asg.workunitid = wu.id;
    retval = asg.insert();
    return retval;
}
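// Hypothetical usage: restrict a straggling workunit, looked up by name,
// to a particular user's hosts. The WU name and user ID are placeholders,
// and the wrapper itself is an illustration, not part of the original code.
int restrict_by_name(const char* wu_name, int userid) {
    DB_WORKUNIT wu;
    char buf[256];
    sprintf(buf, "where name='%s'", wu_name);
    int retval = wu.lookup(buf);
    if (retval) return retval;
    return restrict_wu_to_user(wu, userid);
}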
// return true if we changed the file_delete_state of a WU or a result
//
bool do_pass(bool retry_error) {
    DB_WORKUNIT wu;
    DB_RESULT result;
    bool did_something = false;
    char buf[256];
    char clause[256];
    int retval, new_state;

    check_stop_daemons();

    strcpy(clause, "");
    if (id_modulus) {
        sprintf(clause, " and id %% %d = %d ", id_modulus, id_remainder);
    }
    if (dont_delete_batches) {
        strcat(clause, " and batch <= 0 ");
    }
    if (appid) {
        sprintf(buf, " and appid = %d ", appid);
        strcat(clause, buf);
    }
    sprintf(buf,
        "where file_delete_state=%d %s limit %d",
        retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
        clause, WUS_PER_ENUM
    );

    while (do_input_files) {
        retval = wu.enumerate(buf);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
                exit(0);
            }
            break;
        }
        if (preserve_wu_files) {
            retval = 0;
        } else {
            retval = wu_delete_files(wu);
        }
        if (retval) {
            new_state = FILE_DELETE_ERROR;
            log_messages.printf(MSG_CRITICAL,
                "[WU#%d] file deletion failed: %s\n", wu.id, boincerror(retval)
            );
        } else {
            new_state = FILE_DELETE_DONE;
        }
        if (new_state != wu.file_delete_state) {
            sprintf(buf, "file_delete_state=%d", new_state);
            retval = wu.update_field(buf);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d] update failed: %s\n", wu.id, boincerror(retval)
                );
            } else {
                log_messages.printf(MSG_DEBUG,
                    "[WU#%d] file_delete_state updated\n", wu.id
                );
                did_something = true;
            }
        }
    }

    sprintf(buf,
        "where file_delete_state=%d %s limit %d",
        retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
        clause, RESULTS_PER_ENUM
    );

    while (do_output_files) {
        retval = result.enumerate(buf);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
                exit(0);
            }
            break;
        }
        if (preserve_result_files) {
            retval = 0;
        } else {
            retval = result_delete_files(result);
        }
        if (retval) {
            new_state = FILE_DELETE_ERROR;
            log_messages.printf(MSG_CRITICAL,
                "[RESULT#%d] file deletion failed: %s\n", result.id, boincerror(retval)
            );
        } else {
            new_state = FILE_DELETE_DONE;
        }
        if (new_state != result.file_delete_state) {
            sprintf(buf, "file_delete_state=%d", new_state);
            retval = result.update_field(buf);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%d] update failed: %s\n", result.id, boincerror(retval)
                );
            } else {
                log_messages.printf(MSG_DEBUG,
                    "[RESULT#%d] file_delete_state updated\n", result.id
                );
                did_something = true;
            }
        }
    }
    return did_something;
}
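// Minimal sketch of a driver loop around do_pass(). The 5-second idle-poll
// interval (via sleep() from <unistd.h>) and the choice to retry errored
// deletions on every pass are assumptions for illustration, not the
// daemon's actual policy.
void poll_file_delete() {
    while (1) {
        bool did_ready = do_pass(false);    // WUs/results marked FILE_DELETE_READY
        bool did_retry = do_pass(true);     // retry earlier FILE_DELETE_ERROR cases
        if (!did_ready && !did_retry) {
            sleep(5);                       // assumed idle-poll interval
        }
    }
}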
// send non-multi assigned jobs
//
bool send_assigned_jobs() {
    DB_ASSIGNMENT asg;
    DB_RESULT result;
    DB_WORKUNIT wu;
    bool sent_something = false;
    int retval;

    // for now, only look for user assignments
    //
    char buf[256];
    sprintf(buf, "where target_type=%d and target_id=%d and multi=0",
        ASSIGN_USER, g_reply->user.id
    );
    while (!asg.enumerate(buf)) {
        if (!work_needed(false)) continue;

        // if the WU doesn't exist, delete the assignment record.
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) {
            asg.delete_from_db();
            continue;
        }

        // don't send if WU is validation pending or completed,
        // or has transition pending
        //
        if (wu.need_validate) continue;
        if (wu.canonical_resultid) continue;
        if (wu.transition_time < time(0)) continue;

        // don't send if we already sent one to this host
        //
        sprintf(buf, "where workunitid=%d and hostid=%d",
            asg.workunitid, g_request->host.id
        );
        retval = result.lookup(buf);
        if (retval != ERR_DB_NOT_FOUND) continue;

        // don't send if there's already one in progress to this user
        //
        sprintf(buf, "where workunitid=%d and userid=%d and server_state=%d",
            asg.workunitid,
            g_reply->user.id,
            RESULT_SERVER_STATE_IN_PROGRESS
        );
        retval = result.lookup(buf);
        if (retval != ERR_DB_NOT_FOUND) continue;

        // OK, send the job
        //
        retval = send_assigned_job(asg);
        if (retval) continue;
        sent_something = true;

        // update the WU's transition time to time out this job
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) continue;
        int new_tt = time(0) + wu.delay_bound;
        if (new_tt < wu.transition_time) {
            char buf2[256];
            sprintf(buf2, "transition_time=%d", new_tt);
            wu.update_field(buf2);
        }
    }
    return sent_something;
}
// assimilate all WUs that need it
// return nonzero (true) if did anything
//
bool do_pass(APP& app) {
    DB_WORKUNIT wu;
    DB_RESULT canonical_result, result;
    bool did_something = false;
    char buf[256];
    char mod_clause[256];
    int retval;
    int num_assimilated=0;

    check_stop_daemons();

    if (wu_id_modulus) {
        sprintf(mod_clause, " and workunit.id %% %d = %d ",
            wu_id_modulus, wu_id_remainder
        );
    } else {
        strcpy(mod_clause, "");
    }

    sprintf(buf,
        "where appid=%d and assimilate_state=%d %s limit %d",
        app.id, ASSIMILATE_READY, mod_clause,
        one_pass_N_WU ? one_pass_N_WU : 1000
    );

    while (1) {
        retval = wu.enumerate(buf);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG,
                    "DB connection lost, exiting\n"
                );
                exit(0);
            }
            break;
        }
        vector<RESULT> results;     // must be inside while()!

        // for testing purposes, pretend we did nothing
        //
        if (update_db) {
            did_something = true;
        }

        log_messages.printf(MSG_DEBUG,
            "[%s] assimilating WU %d; state=%d\n",
            wu.name, wu.id, wu.assimilate_state
        );

        sprintf(buf, "where workunitid=%d", wu.id);
        canonical_result.clear();
        bool found = false;
        while (1) {
            retval = result.enumerate(buf);
            if (retval) {
                if (retval != ERR_DB_NOT_FOUND) {
                    log_messages.printf(MSG_DEBUG,
                        "DB connection lost, exiting\n"
                    );
                    exit(0);
                }
                break;
            }
            results.push_back(result);
            if (result.id == wu.canonical_resultid) {
                canonical_result = result;
                found = true;
            }
        }

        // If no canonical result found and WU had no other errors,
        // something is wrong, e.g. result records got deleted prematurely.
        // This is probably unrecoverable, so mark the WU as having
        // an assimilation error and keep going.
        //
        if (!found && !wu.error_mask) {
            log_messages.printf(MSG_CRITICAL,
                "[%s] no canonical result\n", wu.name
            );
            wu.error_mask = WU_ERROR_NO_CANONICAL_RESULT;
            sprintf(buf, "error_mask=%d", wu.error_mask);
            wu.update_field(buf);
        }

        retval = assimilate_handler(wu, results, canonical_result);
        if (retval && retval != DEFER_ASSIMILATION) {
            log_messages.printf(MSG_CRITICAL,
                "[%s] handler error: %s; exiting\n", wu.name, boincerror(retval)
            );
            exit(retval);
        }

        if (update_db) {
            // Defer assimilation until next result is returned
            //
            int assimilate_state = ASSIMILATE_DONE;
            if (retval == DEFER_ASSIMILATION) {
                assimilate_state = ASSIMILATE_INIT;
            }
            sprintf(buf,
                "assimilate_state=%d, transition_time=%d",
                assimilate_state, (int)time(0)
            );
            retval = wu.update_field(buf);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[%s] update failed: %s\n", wu.name, boincerror(retval)
                );
                exit(1);
            }
        }

        num_assimilated++;
    }

    if (did_something) {
        boinc_db.commit_transaction();
    }

    if (num_assimilated) {
        log_messages.printf(MSG_NORMAL,
            "Assimilated %d workunits.\n", num_assimilated
        );
    }
    return did_something;
}
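// Minimal sketch of a project-specific assimilate_handler() matching the
// call above. It only logs the canonical result; a real handler would copy
// output files or record results in project-specific tables. This is an
// illustration, not the handler any particular project uses.
int assimilate_handler(
    WORKUNIT& wu, std::vector<RESULT>& /*results*/, RESULT& canonical_result
) {
    if (wu.error_mask) {
        // WU-level error (e.g. too many error results); nothing to assimilate
        log_messages.printf(MSG_NORMAL,
            "[%s] assimilating failed WU; error_mask=%d\n",
            wu.name, wu.error_mask
        );
        return 0;
    }
    log_messages.printf(MSG_NORMAL,
        "[%s] assimilating canonical result %s\n",
        wu.name, canonical_result.name
    );
    return 0;
}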
int main(int argc, const char** argv) {
    DB_APP app;
    DB_WORKUNIT wu;
    int retval;
    char wu_template[BLOB_SIZE];
    char wu_template_file[256], result_template_file[256], result_template_path[MAXPATHLEN];
    const char* command_line = NULL;
    const char** infiles = NULL;
    int i, ninfiles;
    char download_dir[256], db_name[256], db_passwd[256];
    char db_user[256], db_host[256];
    char buf[256];
    char additional_xml[256];
    bool show_wu_name = true;
    bool assign_flag = false;
    bool assign_multi = false;
    int assign_id = 0;
    int assign_type = ASSIGN_NONE;

    strcpy(wu_template_file, "");
    strcpy(result_template_file, "");
    strcpy(app.name, "");
    strcpy(db_passwd, "");
    strcpy(additional_xml, "");
    const char* config_dir = 0;
    i = 1;
    ninfiles = 0;
    wu.clear();

    // defaults (in case they're not in WU template)
    //
    wu.id = 0;
    wu.min_quorum = 2;
    wu.target_nresults = 2;
    wu.max_error_results = 3;
    wu.max_total_results = 10;
    wu.max_success_results = 6;
    wu.rsc_fpops_est = 3600e9;
    wu.rsc_fpops_bound = 86400e9;
    wu.rsc_memory_bound = 5e8;
    wu.rsc_disk_bound = 1e9;
    wu.rsc_bandwidth_bound = 0.0;
    wu.delay_bound = 7*86400;

    while (i < argc) {
        if (arg(argv, i, "appname")) {
            strcpy(app.name, argv[++i]);
        } else if (arg(argv, i, "d")) {
            int dl = atoi(argv[++i]);
            log_messages.set_debug_level(dl);
            if (dl == 4) g_print_queries = true;
        } else if (arg(argv, i, "wu_name")) {
            show_wu_name = false;
            strcpy(wu.name, argv[++i]);
        } else if (arg(argv, i, "wu_template")) {
            strcpy(wu_template_file, argv[++i]);
        } else if (arg(argv, i, "result_template")) {
            strcpy(result_template_file, argv[++i]);
        } else if (arg(argv, i, "batch")) {
            wu.batch = atoi(argv[++i]);
        } else if (arg(argv, i, "config_dir")) {
            config_dir = argv[++i];
        } else if (arg(argv, i, "priority")) {
            wu.priority = atoi(argv[++i]);
        } else if (arg(argv, i, "rsc_fpops_est")) {
            wu.rsc_fpops_est = atof(argv[++i]);
        } else if (arg(argv, i, "rsc_fpops_bound")) {
            wu.rsc_fpops_bound = atof(argv[++i]);
        } else if (arg(argv, i, "rsc_memory_bound")) {
            wu.rsc_memory_bound = atof(argv[++i]);
        } else if (arg(argv, i, "rsc_disk_bound")) {
            wu.rsc_disk_bound = atof(argv[++i]);
        } else if (arg(argv, i, "delay_bound")) {
            wu.delay_bound = atoi(argv[++i]);
        } else if (arg(argv, i, "min_quorum")) {
            wu.min_quorum = atoi(argv[++i]);
        } else if (arg(argv, i, "target_nresults")) {
            wu.target_nresults = atoi(argv[++i]);
        } else if (arg(argv, i, "max_error_results")) {
            wu.max_error_results = atoi(argv[++i]);
        } else if (arg(argv, i, "max_total_results")) {
            wu.max_total_results = atoi(argv[++i]);
        } else if (arg(argv, i, "max_success_results")) {
            wu.max_success_results = atoi(argv[++i]);
        } else if (arg(argv, i, "opaque")) {
            wu.opaque = atoi(argv[++i]);
        } else if (arg(argv, i, "command_line")) {
            command_line = argv[++i];
        } else if (arg(argv, i, "additional_xml")) {
            strcpy(additional_xml, argv[++i]);
        } else if (arg(argv, i, "wu_id")) {
            wu.id = atoi(argv[++i]);
        } else if (arg(argv, i, "broadcast")) {
            assign_multi = true;
            assign_flag = true;
            assign_type = ASSIGN_NONE;
        } else if (arg(argv, i, "broadcast_user")) {
            assign_flag = true;
            assign_type = ASSIGN_USER;
            assign_multi = true;
            assign_id = atoi(argv[++i]);
        } else if (arg(argv, i, "broadcast_team")) {
            assign_flag = true;
            assign_type = ASSIGN_TEAM;
            assign_multi = true;
            assign_id = atoi(argv[++i]);
        } else if (arg(argv, i, "target_host")) {
            assign_flag = true;
            assign_type = ASSIGN_HOST;
            assign_id = atoi(argv[++i]);
        } else if (arg(argv, i, "target_user")) {
            assign_flag = true;
            assign_type = ASSIGN_USER;
            assign_id = atoi(argv[++i]);
        } else if (arg(argv, i, "target_team")) {
            assign_flag = true;
            assign_type = ASSIGN_TEAM;
            assign_id = atoi(argv[++i]);
        } else if (arg(argv, i, "help")) {
            usage();
            exit(0);
        } else {
            if (!strncmp("-", argv[i], 1)) {
                fprintf(stderr, "create_work: bad argument '%s'\n", argv[i]);
                exit(1);
            }
            infiles = argv+i;
            ninfiles = argc - i;
            break;
        }
        i++;
    }

    if (!strlen(app.name)) {
        usage();
    }
    if (!strlen(wu.name)) {
        sprintf(wu.name, "%s_%d_%f", app.name, getpid(), dtime());
    }
    if (!strlen(wu_template_file)) {
        sprintf(wu_template_file, "templates/%s_in", app.name);
    }
    if (!strlen(result_template_file)) {
        sprintf(result_template_file, "templates/%s_out", app.name);
    }

    retval = config.parse_file(config_dir);
    if (retval) {
        fprintf(stderr, "Can't parse config file: %s\n", boincerror(retval));
        exit(1);
    } else {
        strcpy(db_name, config.db_name);
        strcpy(db_passwd, config.db_passwd);
        strcpy(db_user, config.db_user);
        strcpy(db_host, config.db_host);
        strcpy(download_dir, config.download_dir);
    }

    retval = boinc_db.open(db_name, db_host, db_user, db_passwd);
    if (retval) {
        fprintf(stderr,
            "create_work: error opening database: %s\n", boincerror(retval)
        );
        exit(1);
    }
    sprintf(buf, "where name='%s'", app.name);
    retval = app.lookup(buf);
    if (retval) {
        fprintf(stderr, "create_work: app not found\n");
        exit(1);
    }

    retval = read_filename(wu_template_file, wu_template, sizeof(wu_template));
    if (retval) {
        fprintf(stderr,
            "create_work: can't open input template %s\n", wu_template_file
        );
        exit(1);
    }

    wu.appid = app.id;

    strcpy(result_template_path, "./");
    strcat(result_template_path, result_template_file);
    retval = create_work(
        wu,
        wu_template,
        result_template_file,
        result_template_path,
        const_cast<const char **>(infiles),
        ninfiles,
        config,
        command_line,
        additional_xml
    );
    if (retval) {
        fprintf(stderr, "create_work: %s\n", boincerror(retval));
        exit(1);
    } else {
        if (show_wu_name) {
            printf("workunit name: %s\n", wu.name);
        }
    }

    if (assign_flag) {
        DB_ASSIGNMENT assignment;
        assignment.clear();
        assignment.create_time = time(0);
        assignment.target_id = assign_id;
        assignment.target_type = assign_type;
        assignment.multi = assign_multi;
        assignment.workunitid = wu.id;
        retval = assignment.insert();
        if (retval) {
            fprintf(stderr,
                "assignment.insert() failed: %s\n", boincerror(retval)
            );
            exit(1);
        }
        sprintf(buf, "transitioner_flags=%d",
            assign_multi?TRANSITION_NONE:TRANSITION_NO_NEW_RESULTS
        );
        retval = wu.update_field(buf);
        if (retval) {
            fprintf(stderr, "wu.update() failed: %s\n", boincerror(retval));
            exit(1);
        }
    }
    boinc_db.close();
}
int handle_wu(
    DB_TRANSITIONER_ITEM_SET& transitioner, std::vector<TRANSITIONER_ITEM>& items
) {
    int ntotal, nerrors, retval, ninprogress, nsuccess;
    int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply;
    int canonical_result_index, j;
    char suffix[256];
    time_t now = time(0), x;
    bool all_over_and_validated, have_new_result_to_validate, do_delete;
    unsigned int i;

    TRANSITIONER_ITEM& wu_item = items[0];
    TRANSITIONER_ITEM wu_item_original = wu_item;

    // "assigned" WUs aren't supposed to pass through the transitioner.
    // If we get one, it's an error
    //
    if (config.enable_assignment && strstr(wu_item.name, ASSIGNED_WU_STR)) {
        DB_WORKUNIT wu;
        char buf[256];
        wu.id = wu_item.id;
        log_messages.printf(MSG_CRITICAL,
            "Assigned WU %d unexpectedly found by transitioner\n", wu.id
        );
        sprintf(buf, "transition_time=%d", INT_MAX);
        retval = wu.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "update_field failed: %s\n", boincerror(retval)
            );
        }
        return 0;
    }

    // count up the number of results in various states,
    // and check for timed-out results
    //
    ntotal = 0;
    nunsent = 0;
    ninprogress = 0;
    nover = 0;
    nerrors = 0;
    nsuccess = 0;   // not counting invalid results!!!!
    ncouldnt_send = 0;
    nno_reply = 0;
    ndidnt_need = 0;
    have_new_result_to_validate = false;
    int rs, max_result_suffix = -1;

    // Scan the WU's results, and find the canonical result if there is one
    //
    canonical_result_index = -1;
    if (wu_item.canonical_resultid) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            if (res_item.res_id == wu_item.canonical_resultid) {
                canonical_result_index = i;
            }
        }
    }
    if (wu_item.canonical_resultid && (canonical_result_index == -1)) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't find canonical result\n",
            wu_item.id, wu_item.name
        );
    }

    // if there is a canonical result, see if its files are deleted
    //
    bool canonical_result_files_deleted = false;
    if (canonical_result_index >= 0) {
        TRANSITIONER_ITEM& cr = items[canonical_result_index];
        if (cr.res_file_delete_state == FILE_DELETE_DONE) {
            canonical_result_files_deleted = true;
        }
    }

    // Scan this WU's results, and
    // 1) count those in various server states;
    // 2) identify timed-out results and update their server state and outcome
    // 3) find the max result suffix (in case need to generate new ones)
    // 4) see if we have a new result to validate
    //    (outcome SUCCESS and validate_state INIT)
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        ntotal++;

        rs = result_suffix(res_item.res_name);
        if (rs > max_result_suffix) max_result_suffix = rs;

        switch (res_item.res_server_state) {
        case RESULT_SERVER_STATE_UNSENT:
            nunsent++;
            break;
        case RESULT_SERVER_STATE_IN_PROGRESS:
            if (res_item.res_report_deadline < now) {
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n",
                    wu_item.id, wu_item.name, res_item.res_id,
                    res_item.res_name, res_item.res_report_deadline,
                    (int)now
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_NO_REPLY;
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
                retval = result_timed_out(res_item, wu_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "result_timed_out() error: %s\n", boincerror(retval)
                    );
                    exit(1);
                }
                nover++;
                nno_reply++;
            } else {
                ninprogress++;
            }
            break;
        case RESULT_SERVER_STATE_OVER:
            nover++;
            switch (res_item.res_outcome) {
            case RESULT_OUTCOME_COULDNT_SEND:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result couldn't be sent\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                ncouldnt_send++;
                break;
            case RESULT_OUTCOME_SUCCESS:
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    if (canonical_result_files_deleted) {
                        res_item.res_validate_state = VALIDATE_STATE_TOO_LATE;
                        retval = transitioner.update_result(res_item);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name, boincerror(retval)
                            );
                        } else {
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%d %s] [RESULT#%d %s] validate_state:INIT=>TOO_LATE\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name
                            );
                        }
                    } else {
                        have_new_result_to_validate = true;
                    }
                }
                // don't count invalid results as successful
                //
                if (res_item.res_validate_state != VALIDATE_STATE_INVALID) {
                    nsuccess++;
                }
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
            case RESULT_OUTCOME_VALIDATE_ERROR:
                nerrors++;
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                nno_reply++;
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                ndidnt_need++;
                break;
            }
            break;
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n",
        wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover,
        nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need
    );

    // if there's a new result to validate, trigger validation
    //
    if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) {
        wu_item.need_validate = true;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] need_validate:=>true\n", wu_item.id, wu_item.name
        );
    }

    // check for WU error conditions
    // NOTE: check on max # of success results is done in the validator
    //
    if (ncouldnt_send > 0) {
        wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
    }

    // if WU has results with errors and no success yet,
    // reset homogeneous redundancy class to give other platforms a try;
    // also reset app version ID if using HAV
    //
    if (nerrors && !(nsuccess || ninprogress)) {
        wu_item.hr_class = 0;
        wu_item.app_version_id = 0;
    }

    if (nerrors > wu_item.max_error_results) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many errors (%d errors for %d results)\n",
            wu_item.id, wu_item.name, nerrors, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
    }

    // see how many new results we need to make
    //
    int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess;
    if (n_new_results_needed < 0) n_new_results_needed = 0;
    int n_new_results_allowed = wu_item.max_total_results - ntotal;

    // if we're already at the limit and need more, error out the WU
    //
    bool too_many = false;
    if (n_new_results_allowed < 0) {
        too_many = true;
    } else if (n_new_results_allowed == 0) {
        if (n_new_results_needed > 0) {
            too_many = true;
        }
    } else {
        if (n_new_results_needed > n_new_results_allowed) {
            n_new_results_needed = n_new_results_allowed;
        }
    }
    if (too_many) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many total results (%d)\n",
            wu_item.id, wu_item.name, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS;
    }

    // if this WU had an error, don't send any unsent results,
    // and trigger assimilation if needed
    //
    if (wu_item.error_mask) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            bool update_result = false;
            switch (res_item.res_server_state) {
            case RESULT_SERVER_STATE_UNSENT:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] server_state:UNSENT=>OVER; outcome:=>DIDNT_NEED\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED;
                update_result = true;
                break;
            case RESULT_SERVER_STATE_OVER:
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_SUCCESS:
                    switch (res_item.res_validate_state) {
                    case VALIDATE_STATE_INIT:
                    case VALIDATE_STATE_INCONCLUSIVE:
                        res_item.res_validate_state = VALIDATE_STATE_NO_CHECK;
                        update_result = true;
                        break;
                    }
                }
            }
            if (update_result) {
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
            }
        }
        if (wu_item.assimilate_state == ASSIMILATE_INIT) {
            wu_item.assimilate_state = ASSIMILATE_READY;
            log_messages.printf(MSG_NORMAL,
                "[WU#%d %s] error_mask:%d assimilate_state:INIT=>READY\n",
                wu_item.id, wu_item.name, wu_item.error_mask
            );
        }
    } else if (wu_item.canonical_resultid == 0) {
        // Here if no WU-level error.
        // Generate new results if needed.
        //
        std::string values;
        char value_buf[MAX_QUERY_LEN];
        if (n_new_results_needed > 0) {
            log_messages.printf(MSG_NORMAL,
                "[WU#%d %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n",
                wu_item.id, wu_item.name, n_new_results_needed,
                wu_item.target_nresults, nunsent, ninprogress, nsuccess
            );
            for (j=0; j<n_new_results_needed; j++) {
                sprintf(suffix, "%d", max_result_suffix+j+1);
                const char *rtfpath = config.project_path("%s", wu_item.result_template_file);
                int priority_increase = 0;
                if (nover && config.reliable_priority_on_over) {
                    priority_increase += config.reliable_priority_on_over;
                } else if (nover && !nerrors && config.reliable_priority_on_over_except_error) {
                    priority_increase += config.reliable_priority_on_over_except_error;
                }
                retval = create_result_ti(
                    wu_item, (char *)rtfpath, suffix, key, config, value_buf,
                    priority_increase
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] create_result_ti(): %s\n",
                        wu_item.id, wu_item.name, boincerror(retval)
                    );
                    return retval;
                }
                if (j==0) {
                    values = value_buf;
                } else {
                    values += ",";
                    values += value_buf;
                }
            }
            DB_RESULT r;
            retval = r.insert_batch(values);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] insert_batch(): %s\n",
                    wu_item.id, wu_item.name, boincerror(retval)
                );
                return retval;
            }
        }
    }

    // scan results:
    // - see if all over and validated
    //
    all_over_and_validated = true;
    bool all_over_and_ready_to_assimilate = true;   // used for the defer assimilation
    double most_recently_returned = 0;
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_OVER) {
            if (res_item.res_received_time > most_recently_returned) {
                most_recently_returned = res_item.res_received_time;
            }
            if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) {
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    all_over_and_validated = false;
                    all_over_and_ready_to_assimilate = false;
                }
            } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) {
                if (now < res_item.res_report_deadline) {
                    all_over_and_validated = false;
                }
            }
        } else {
            all_over_and_validated = false;
            all_over_and_ready_to_assimilate = false;
        }
    }

    // If we are deferring assimilation until all results are over and validated,
    // when that happens make sure that WU state is advanced to assimilate ready
    // the items.size is a kludge
    //
    if (all_over_and_ready_to_assimilate
        && wu_item.assimilate_state == ASSIMILATE_INIT
        && items.size() > 0
        && wu_item.canonical_resultid > 0
    ) {
        wu_item.assimilate_state = ASSIMILATE_READY;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n",
            wu_item.id, wu_item.name
        );
    }

    // if WU is assimilated, trigger file deletion
    //
    double deferred_file_delete_time = 0;
    if (wu_item.assimilate_state == ASSIMILATE_DONE) {
        if (now >= (most_recently_returned + config.delete_delay)) {
            // can delete input files if all results OVER
            //
            if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) {
                wu_item.file_delete_state = FILE_DELETE_READY;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%d %s] ASSIMILATE_DONE: file_delete_state:=>READY\n",
                    wu_item.id, wu_item.name
                );
            }

            // output of error results can be deleted immediately;
            // output of success results can be deleted if validated
            //
            for (i=0; i<items.size(); i++) {
                TRANSITIONER_ITEM& res_item = items[i];

                // can delete canonical result outputs only if all successful
                // results have been validated
                //
                if (((int)i == canonical_result_index) && !all_over_and_validated) {
                    continue;
                }

                if (!res_item.res_id) continue;
                do_delete = false;
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_CLIENT_ERROR:
                    do_delete = true;
                    break;
                case RESULT_OUTCOME_SUCCESS:
                    do_delete = (res_item.res_validate_state != VALIDATE_STATE_INIT);
                    break;
                }
                if (do_delete && res_item.res_file_delete_state == FILE_DELETE_INIT) {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%d %s] [RESULT#%d %s] file_delete_state:=>READY\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                    );
                    res_item.res_file_delete_state = FILE_DELETE_READY;
                    retval = transitioner.update_result(res_item);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                            wu_item.id, wu_item.name, res_item.res_id,
                            res_item.res_name, boincerror(retval)
                        );
                    }
                }
            }
        } else {
            deferred_file_delete_time = most_recently_returned + config.delete_delay;
            log_messages.printf(MSG_DEBUG,
                "[WU#%d %s] deferring file deletion for %.0f seconds\n",
                wu_item.id, wu_item.name, deferred_file_delete_time - now
            );
        }
    }

    // Compute next transition time.
    // This is the min of
    // - timeouts of in-progress results
    // - deferred file deletion time
    // - safety net
    //
    // It is then adjusted to deal with transitioner congestion
    //
    if (wu_item.canonical_resultid || wu_item.error_mask) {
        wu_item.transition_time = INT_MAX;
    } else {
        // Safety net: if there is no canonical result and no WU-level error,
        // make sure that the transitioner will process this WU again.
        // In principle this is not needed,
        // but it makes the BOINC back-end more robust.
        //
        const int ten_days = 10*86400;
        int long_delay = (int)(1.5*wu_item.delay_bound);
        wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days;
        wu_item.transition_time += time(0);
    }

    // handle timeout of in-progress results
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
            x = res_item.res_report_deadline;
            if (x < wu_item.transition_time) {
                wu_item.transition_time = x;
            }
        }
    }

    // handle deferred file deletion
    //
    if (deferred_file_delete_time
        && deferred_file_delete_time < wu_item.transition_time
    ) {
        wu_item.transition_time = deferred_file_delete_time;
    }

    // Handle transitioner overload.
    // If transition time is in the past,
    // the system is bogged down and behind schedule.
    // Delay processing of the WU by an amount DOUBLE the amount we are behind,
    // but not less than 60 secs or more than one day.
    //
    if (wu_item.transition_time < now) {
        int extra_delay = 2*(now - wu_item.transition_time);
        if (extra_delay < 60) extra_delay = 60;
        if (extra_delay > 86400) extra_delay = 86400;
        log_messages.printf(MSG_DEBUG,
            "[WU#%d %s] transition time in past: adding extra delay %d sec\n",
            wu_item.id, wu_item.name, extra_delay
        );
        wu_item.transition_time = now + extra_delay;
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] setting transition_time to %d\n",
        wu_item.id, wu_item.name, wu_item.transition_time
    );

    retval = transitioner.update_workunit(wu_item, wu_item_original);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] workunit.update(): %s\n",
            wu_item.id, wu_item.name, boincerror(retval)
        );
        return retval;
    }
    return 0;
}
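// Illustrative helper (not part of the original source) that isolates the
// congestion-backoff rule applied above: if the computed transition time is
// already in the past, push it forward by twice the amount we are behind,
// clamped to the range [60 seconds, 1 day].
static int congestion_adjusted_transition_time(int transition_time, int now) {
    if (transition_time >= now) return transition_time;
    int extra_delay = 2*(now - transition_time);
    if (extra_delay < 60) extra_delay = 60;
    if (extra_delay > 86400) extra_delay = 86400;
    return now + extra_delay;
}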
// Send targeted jobs of a given type.
// NOTE: there may be an atomicity problem in the following.
// Ideally it should be in a transaction.
//
bool send_jobs(int assign_type) {
    DB_ASSIGNMENT asg;
    DB_RESULT result;
    DB_WORKUNIT wu;
    int retval;
    bool sent_something = false;
    char query[256];

    switch (assign_type) {
    case ASSIGN_USER:
        sprintf(query,
            "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_USER, g_reply->user.id
        );
        break;
    case ASSIGN_HOST:
        sprintf(query,
            "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_HOST, g_reply->host.id
        );
        break;
    case ASSIGN_TEAM:
        sprintf(query,
            "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_TEAM, g_reply->team.id
        );
        break;
    }

    while (!asg.enumerate(query)) {
        if (!work_needed(false)) {
            asg.end_enumerate();
            break;
        }

        // if the WU doesn't exist, delete the assignment record.
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) {
            asg.delete_from_db();
            continue;
        }

        if (!need_targeted_instance(wu, g_reply->host.id)) {
            continue;
        }

        // OK, send the job
        //
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "sending targeted job: %s\n", wu.name
            );
        }
        retval = send_assigned_job(asg);
        if (retval) {
            log_messages.printf(MSG_NORMAL,
                "failed to send targeted job: %s\n", boincerror(retval)
            );
            continue;
        }
        sent_something = true;

        // update the WU's transition time to time out this job
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) continue;
        int new_tt = time(0) + wu.delay_bound;
        if (new_tt < wu.transition_time) {
            char buf2[256];
            sprintf(buf2, "transition_time=%d", new_tt);
            wu.update_field(buf2);
        }
    }
    return sent_something;
}
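// Hypothetical driver showing how send_jobs() might be invoked for each
// target type, most specific first. The ordering and the early exits via
// work_needed() are assumptions for illustration, not scheduler policy
// taken from the original source.
bool send_targeted_jobs() {
    bool sent = false;
    if (send_jobs(ASSIGN_HOST)) sent = true;
    if (work_needed(false) && send_jobs(ASSIGN_USER)) sent = true;
    if (work_needed(false) && send_jobs(ASSIGN_TEAM)) sent = true;
    return sent;
}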