// send a job for the given assignment // static int send_assigned_job(ASSIGNMENT& asg) { int retval; DB_WORKUNIT wu; char suffix[256], path[MAXPATHLEN]; const char *rtfpath; static bool first=true; static int seqno=0; static R_RSA_PRIVATE_KEY key; BEST_APP_VERSION* bavp; if (first) { first = false; sprintf(path, "%s/upload_private", config.key_dir); retval = read_key_file(path, key); if (retval) { log_messages.printf(MSG_CRITICAL, "can't read key\n"); return -1; } } retval = wu.lookup_id(asg.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "assigned WU %d not found\n", asg.workunitid ); return retval; } bavp = get_app_version(wu, false, false); if (!bavp) { log_messages.printf(MSG_CRITICAL, "App version for assigned WU not found\n" ); return ERR_NOT_FOUND; } rtfpath = config.project_path("%s", wu.result_template_file); sprintf(suffix, "%d_%d_%d", getpid(), (int)time(0), seqno++); retval = create_result( wu, const_cast<char*>(rtfpath), suffix, key, config, 0, 0 ); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%d %s] create_result(): %s\n", wu.id, wu.name, boincerror(retval) ); return retval; } int result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); add_result_to_reply(result, wu, bavp, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n", wu.id, result_id, g_reply->host.id, asg.id ); } return 0; }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval; long count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; // This doesn't take into account g_wreq->allow_non_selected_apps, // however Einstein@Home, which is the only project that currently uses // this locality scheduler, doesn't support the respective project-specific // preference setting // if (app_not_selected(wu.appid)) return ERR_NO_APP_VERSION; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%lu and workunitid=%lu", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Check that the file named by get_file_path() for this result's
// workunit can be opened for reading.
//
// Returns 0 if the file can be opened.
// Returns 1 if the workunit can't be looked up or the file can't be opened.
// In the latter case, if the global "repair" flag is set and the result
// is still unsent, the result is marked as over / couldn't-send
// in the database.
//
int handle_result(DB_RESULT& result) {
    DB_WORKUNIT workunit;
    char file_path[256];
    char update_clause[256];

    if (workunit.lookup_id(result.workunitid)) {
        printf(
            "ERROR: can't find WU %d for result %d\n",
            result.workunitid, result.id
        );
        return 1;
    }

    get_file_path(workunit, file_path);

    // the file only needs to be openable; its contents are not read
    //
    FILE* probe = fopen(file_path, "r");
    if (probe) {
        fclose(probe);
        return 0;
    }

    printf("no file %s for result %d\n",
        file_path, result.id
    );

    bool should_repair =
        repair && result.server_state == RESULT_SERVER_STATE_UNSENT;
    if (should_repair) {
        result.server_state = RESULT_SERVER_STATE_OVER;
        result.outcome = RESULT_OUTCOME_COULDNT_SEND;
        sprintf(
            update_clause, "server_state=%d, outcome=%d",
            result.server_state, result.outcome
        );
        if (result.update_field(update_clause)) {
            printf(
                "ERROR: can't update result %d\n",
                result.id
            );
        }
    }

    // a missing file is always reported as a failure, repaired or not
    //
    return 1;
}
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(SCHED_DB_RESULT& result) { DB_WORKUNIT wu; SCHED_DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; g_wreq->no_jobs_available = false; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true, false); if (!config.locality_scheduler_fraction && !bavp && is_anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(help_msg_buf, "notice"); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); retval = wu_is_infeasible_fast( wu, result.server_state, result.report_deadline, result.priority, *app, *bavp ); if (retval) return retval; if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// Try to send the client this result // This can fail because: // - result needs more disk/mem/speed than host has // - already sent a result for this WU // - no app_version available // static int possibly_send_result(DB_RESULT& result) { DB_WORKUNIT wu; DB_RESULT result2; int retval, count; char buf[256]; BEST_APP_VERSION* bavp; retval = wu.lookup_id(result.workunitid); if (retval) return ERR_DB_NOT_FOUND; bavp = get_app_version(wu, true); if (!bavp && anonymous(g_request->platforms.list[0])) { char help_msg_buf[512]; sprintf(help_msg_buf, "To get more %s work, finish current work, stop BOINC, remove app_info.xml file, and restart.", config.long_name ); g_reply->insert_message(USER_MESSAGE(help_msg_buf, "high")); g_reply->set_delay(DELAY_ANONYMOUS); } if (!bavp) return ERR_NO_APP_VERSION; APP* app = ssp->lookup_app(wu.appid); if (wu_is_infeasible_fast(wu, *app, *bavp)) { return ERR_INSUFFICIENT_RESOURCE; } if (config.one_result_per_user_per_wu) { sprintf(buf, "where userid=%d and workunitid=%d", g_reply->user.id, wu.id); retval = result2.count(count, buf); if (retval) return ERR_DB_NOT_FOUND; if (count > 0) return ERR_WU_USER_RULE; } return add_result_to_reply(result, wu, bavp, true); }
// send non-multi assigned jobs // bool send_assigned_jobs() { DB_ASSIGNMENT asg; DB_RESULT result; DB_WORKUNIT wu; bool sent_something = false; int retval; // for now, only look for user assignments // char buf[256]; sprintf(buf, "where target_type=%d and target_id=%d and multi=0", ASSIGN_USER, g_reply->user.id ); while (!asg.enumerate(buf)) { if (!work_needed(false)) continue; // if the WU doesn't exist, delete the assignment record. // retval = wu.lookup_id(asg.workunitid); if (retval) { asg.delete_from_db(); continue; } // don't send if WU is validation pending or completed, // or has transition pending // if (wu.need_validate) continue; if (wu.canonical_resultid) continue; if (wu.transition_time < time(0)) continue; // don't send if we already sent one to this host // sprintf(buf, "where workunitid=%d and hostid=%d", asg.workunitid, g_request->host.id ); retval = result.lookup(buf); if (retval != ERR_DB_NOT_FOUND) continue; // don't send if there's already one in progress to this user // sprintf(buf, "where workunitid=%d and userid=%d and server_state=%d", asg.workunitid, g_reply->user.id, RESULT_SERVER_STATE_IN_PROGRESS ); retval = result.lookup(buf); if (retval != ERR_DB_NOT_FOUND) continue; // OK, send the job // retval = send_assigned_job(asg); if (retval) continue; sent_something = true; // update the WU's transition time to time out this job // retval = wu.lookup_id(asg.workunitid); if (retval) continue; int new_tt = time(0) + wu.delay_bound; if (new_tt < wu.transition_time) { char buf2[256]; sprintf(buf2, "transition_time=%d", new_tt); wu.update_field(buf2); } } return sent_something; }
// resend any jobs that: // 1) we already sent to this host; // 2) are still in progress (i.e. haven't timed out) and // 3) aren't present on the host // Return true if there were any such jobs // bool resend_lost_work() { SCHED_DB_RESULT result; std::vector<DB_RESULT>results; unsigned int i; char buf[256]; char warning_msg[256]; bool did_any = false; int num_eligible_to_resend=0; int num_resent=0; BEST_APP_VERSION* bavp = NULL; APP* app = NULL; int retval; sprintf(buf, " where hostid=%d and server_state=%d ", g_reply->host.id, RESULT_SERVER_STATE_IN_PROGRESS ); while (!result.enumerate(buf)) { if (!work_needed(false)) { result.end_enumerate(); break; } bool found = false; for (i=0; i<g_request->other_results.size(); i++) { OTHER_RESULT& orp = g_request->other_results[i]; if (!strcmp(orp.name, result.name)) { found = true; break; } } if (found) continue; num_eligible_to_resend++; if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] found lost [RESULT#%u]: %s\n", g_reply->host.id, result.id, result.name ); } DB_WORKUNIT wu; bool can_resend = true; retval = wu.lookup_id(result.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] can't resend - WU not found for [RESULT#%u]\n", g_reply->host.id, result.id ); can_resend = false; } if (can_resend) { app = ssp->lookup_app(wu.appid); bavp = get_app_version(wu, true, false); if (!bavp) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[HOST#%d] can't resend [RESULT#%u]: no app version for %s\n", g_reply->host.id, result.id, app->name ); } can_resend = false; } } if (can_resend && wu.error_mask) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: WU error mask %d\n", result.id, wu.error_mask ); } can_resend = false; } if (can_resend && wu.canonical_resultid) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: already have canonical result\n", result.id ); } can_resend = false; } if 
(can_resend && wu_is_infeasible_fast( wu, result.server_state, result.priority, result.report_deadline, *app, *bavp )) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: feasibility check failed\n", result.id ); } can_resend = false; } if (can_resend && possibly_give_result_new_deadline(result, wu, *bavp)) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] skipping [RESULT#%u]: deadline assignment failed\n", result.id ); } can_resend = false; } // If we can't resend this job for any of the above reasons, // make it time out so that the transitioner does the right thing. // if (!can_resend) { result.report_deadline = time(0)-1; retval = result.mark_as_sent(result.server_state, config.report_grace_period); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_work: can't update result deadline: %s\n", boincerror(retval) ); continue; } retval = update_wu_on_send( wu, result.report_deadline + config.report_grace_period, *app, *bavp ); if (retval) { log_messages.printf(MSG_CRITICAL, "resend_lost_result: can't update WU transition time: %s\n", boincerror(retval) ); continue; } sprintf(warning_msg, "Didn't resend lost task %s (expired)", result.name ); g_reply->insert_message(warning_msg, "low"); } else { retval = add_result_to_reply(result, wu, bavp, false); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%d] failed to send [RESULT#%u]\n", g_reply->host.id, result.id ); continue; } sprintf(warning_msg, "Resent lost task %s", result.name); g_reply->insert_message(warning_msg, "low"); num_resent++; did_any = true; if (g_wreq->njobs_sent >= config.max_wus_to_send) { result.end_enumerate(); break; } } } if (num_eligible_to_resend && config.debug_resend) { log_messages.printf(MSG_NORMAL, "[resend] [HOST#%d] %d lost results, resent %d\n", g_reply->host.id, num_eligible_to_resend, num_resent ); } return did_any; }
// Send a job for the given assignment:
// create a new result for the assignment's workunit,
// add it to the scheduler reply and, for one-job assignments,
// record the result ID in the assignment.
//
// Returns 0 on success, -1 if the upload private key can't be read,
// ERR_NOT_FOUND if get_app_version() finds no app version for the WU,
// otherwise the error code of the step that failed.
//
static int send_assigned_job(ASSIGNMENT& asg) {
    int retval;
    DB_WORKUNIT wu;
    char suffix[256], path[256], buf[256];
    const char *rtfpath;
    static bool key_loaded = false;
    static int seqno = 0;
    static R_RSA_PRIVATE_KEY key;
    BEST_APP_VERSION* bavp;

    // Read the upload private key on first use and cache it.
    // The flag is set only after a successful read, so a failed read
    // is retried on the next call instead of letting later calls
    // use a key that was never loaded.
    //
    if (!key_loaded) {
        snprintf(path, sizeof(path), "%s/upload_private", config.key_dir);
        retval = read_key_file(path, key);
        if (retval) {
            log_messages.printf(MSG_CRITICAL, "can't read key\n");
            return -1;
        }
        key_loaded = true;
    }

    retval = wu.lookup_id(asg.workunitid);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "assigned WU %d not found\n", asg.workunitid
        );
        return retval;
    }

    bavp = get_app_version(wu);
    if (!bavp) {
        log_messages.printf(MSG_CRITICAL,
            "App version for assigned WU not found\n"
        );
        return ERR_NOT_FOUND;
    }

    // the result name suffix is built from PID, time and a per-process counter
    //
    rtfpath = config.project_path("%s", wu.result_template_file);
    snprintf(suffix, sizeof(suffix),
        "%d_%d_%d", getpid(), (int)time(0), seqno++
    );
    retval = create_result(wu, (char *)rtfpath, suffix, key, config, 0, 0);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] create_result() %d\n", wu.id, wu.name, retval
        );
        return retval;
    }

    // read back the result we just created
    //
    int result_id = boinc_db.insert_id();
    DB_RESULT result;
    retval = result.lookup_id(result_id);
    if (retval) {
        // don't put an unpopulated result into the reply
        //
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't look up new result %d: %d\n",
            wu.id, wu.name, result_id, retval
        );
        return retval;
    }

    retval = add_result_to_reply(result, wu, bavp, false);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't add result %d to reply: %d\n",
            wu.id, wu.name, result_id, retval
        );
        return retval;
    }

    // if this is a one-job assignment, fill in assignment.resultid
    // so that it doesn't get sent again
    //
    if (!asg.multi && asg.target_type!=ASSIGN_NONE) {
        DB_ASSIGNMENT db_asg;
        db_asg.id = asg.id;
        snprintf(buf, sizeof(buf), "resultid=%d", result_id);
        retval = db_asg.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "assign update failed: %d\n", retval
            );
            return retval;
        }
        asg.resultid = result_id;
    }

    if (config.debug_assignment) {
        log_messages.printf(MSG_NORMAL,
            "[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n",
            wu.id, result_id, g_reply->host.id, asg.id
        );
    }
    return 0;
}
// Send targeted jobs of a given type. // NOTE: there may be an atomicity problem in the following. // Ideally it should be in a transaction. // bool send_jobs(int assign_type) { DB_ASSIGNMENT asg; DB_RESULT result; DB_WORKUNIT wu; int retval; bool sent_something = false; char query[256]; switch (assign_type) { case ASSIGN_USER: sprintf(query, "where target_type=%d and target_id=%lu and multi=0", ASSIGN_USER, g_reply->user.id ); break; case ASSIGN_HOST: sprintf(query, "where target_type=%d and target_id=%lu and multi=0", ASSIGN_HOST, g_reply->host.id ); break; case ASSIGN_TEAM: sprintf(query, "where target_type=%d and target_id=%lu and multi=0", ASSIGN_TEAM, g_reply->team.id ); break; } while (!asg.enumerate(query)) { if (!work_needed(false)) { asg.end_enumerate(); break; } // if the WU doesn't exist, delete the assignment record. // retval = wu.lookup_id(asg.workunitid); if (retval) { asg.delete_from_db(); continue; } if (!need_targeted_instance(wu, g_reply->host.id)) { continue; } // OK, send the job // if (config.debug_send) { log_messages.printf(MSG_NORMAL, "sending targeted job: %s\n", wu.name ); } retval = send_assigned_job(asg); if (retval) { log_messages.printf(MSG_NORMAL, "failed to send targeted job: %s\n", boincerror(retval) ); continue; } sent_something = true; // update the WU's transition time to time out this job // retval = wu.lookup_id(asg.workunitid); if (retval) continue; int new_tt = time(0) + wu.delay_bound; if (new_tt < wu.transition_time) { char buf2[256]; sprintf(buf2, "transition_time=%d", new_tt); wu.update_field(buf2); } } return sent_something; }