Пример #1
0
// Cancel a single job.
// Step 1 retires every result of the workunit whose server_state is
// at or below UNSENT, marking it OVER with outcome DIDNT_NEED.
// Step 2 ORs WU_ERROR_CANCELLED into the workunit's error_mask and
// sets its transition_time to the current time.
// Returns 0 on success, otherwise the nonzero code of the first
// DB update that failed (step 2 is skipped if step 1 fails).
//
int cancel_job(DB_WORKUNIT& wu) {
    char filter[256];
    char assignments[256];
    DB_RESULT results;

    // step 1: retire the results that were never sent
    //
    sprintf(assignments, "server_state=%d, outcome=%d",
        RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_DIDNT_NEED
    );
    sprintf(filter, "server_state<=%d and workunitid=%lu",
        RESULT_SERVER_STATE_UNSENT, wu.id
    );
    int err = results.update_fields_noid(assignments, filter);
    if (err) {
        return err;
    }

    // step 2: flag the workunit itself as cancelled
    //
    sprintf(assignments, "error_mask=error_mask|%d, transition_time=%d",
        WU_ERROR_CANCELLED, (int)(time(0))
    );
    err = wu.update_field(assignments);
    return err ? err : 0;
}
Пример #2
0
// Called when there's evidence that the host has detached.
// Mark in-progress results for the given host
// as server state OVER, outcome CLIENT_DETACHED.
// This serves two purposes:
// 1) make sure we don't resend these results to the host
//    (they may be the reason the user detached)
// 2) trigger the generation of new results for these WUs
//    (done by setting each WU's transition_time to the current time)
//
// DB update failures are logged and do not stop the scan:
// the remaining results of the host are still processed.
//
static void mark_results_over(DB_HOST& host) {
    char query[256], update_clause[256];
    DB_RESULT result;
    int retval;

    sprintf(query, "where hostid=%d and server_state=%d",
        host.id,
        RESULT_SERVER_STATE_IN_PROGRESS
    );
    while (!result.enumerate(query)) {
        // Sample the clock once per result so both updates agree,
        // and convert explicitly: time() returns a time_t, which must not
        // be passed to a %ld / %d conversion without a cast.
        //
        const long now = (long)time(0);

        sprintf(update_clause,
            "server_state=%d, outcome=%d, received_time = %ld",
            RESULT_SERVER_STATE_OVER,
            RESULT_OUTCOME_CLIENT_DETACHED,
            now
        );
        retval = result.update_field(update_clause);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%u] result update failed: %d\n",
                host.id, result.id, retval
            );
        }

        // and trigger WU transition
        //
        DB_WORKUNIT wu;
        wu.id = result.workunitid;
        sprintf(update_clause, "transition_time=%d", (int)now);
        retval = wu.update_field(update_clause);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [WU#%u] workunit update failed: %d\n",
                host.id, result.workunitid, retval
            );
        }

        log_messages.printf(MSG_CRITICAL,
            "[HOST#%d] [RESULT#%u] [WU#%u] changed CPID: marking in-progress result %s as client error!\n",
            host.id, result.id, result.workunitid, result.name
        );
    }
}
Пример #3
0
// We're purging this item because it's been in shared mem too long.
// In general it will get added again soon.
// But if it's committed to an HR class,
// it could be because it got sent to a rare host.
// Un-commit it by zeroing out the WU's hr class,
// and incrementing target_nresults
// (transition_time is set to the current time in the same update).
// Nothing is written if the WU has no HR class.
// The result of the DB update is not checked.
//
static void purge_stale(WU_RESULT& wu_result) {
    DB_WORKUNIT wu;
    wu.id = wu_result.workunit.id;
    if (wu_result.workunit.hr_class) {
        char buf[256];
        // time() returns a time_t; cast it so the argument really
        // has the type the %ld conversion expects
        //
        sprintf(buf,
            "hr_class=0, target_nresults=target_nresults+1, transition_time=%ld",
            (long)time(0)
        );
        wu.update_field(buf);
    }
}
Пример #4
0
void JOB_DESC::create() {
    char buf[256];
    int retval = create_work2(
        wu,
        wu_template,
        result_template_file,
        result_template_path,
        infiles,
        config,
        command_line,
        additional_xml
    );
    if (retval) {
        fprintf(stderr, "create_work: %s\n", boincerror(retval));
        exit(1);
    }
    if (assign_flag) {
        DB_ASSIGNMENT assignment;
        assignment.clear();
        assignment.create_time = time(0);
        assignment.target_id = assign_id;
        assignment.target_type = assign_type;
        assignment.multi = assign_multi;
        assignment.workunitid = wu.id;
        retval = assignment.insert();
        if (retval) {
            fprintf(stderr,
                "assignment.insert() failed: %s\n", boincerror(retval)
            );
            exit(1);
        }
        sprintf(buf, "transitioner_flags=%d",
            assign_multi?TRANSITION_NONE:TRANSITION_NO_NEW_RESULTS
        );
        retval = wu.update_field(buf);
        if (retval) {
            fprintf(stderr, "wu.update() failed: %s\n", boincerror(retval));
            exit(1);
        }
    }
}
Пример #5
0
// Arrange that further results for this workunit
// will be sent only to hosts with the given user ID.
// This could be used, for example, so that late workunits
// are sent only to cloud or cluster resources.
//
// Three steps:
// 1) every UNSENT result of the workunit becomes OVER / DIDNT_NEED
//    (failures of these per-result updates are not checked)
// 2) the workunit's transitioner_flags is set to TRANSITION_NO_NEW_RESULTS
// 3) a non-multi ASSIGN_USER assignment record is inserted
// Returns 0 on success, or the nonzero code from step 2 or step 3.
//
int restrict_wu_to_user(WORKUNIT& _wu, int userid) {
    DB_RESULT unsent;
    DB_ASSIGNMENT assignment;
    DB_WORKUNIT db_wu;
    db_wu = _wu;
    char clause[256];
    char retire_clause[256];

    // step 1: mark unsent results as DIDNT_NEED.
    // The set clause is the same for every row, so build it once.
    //
    sprintf(retire_clause, "server_state=%d, outcome=%d",
        RESULT_SERVER_STATE_OVER,
        RESULT_OUTCOME_DIDNT_NEED
    );
    sprintf(clause, "where workunitid=%d and server_state=%d",
        db_wu.id, RESULT_SERVER_STATE_UNSENT
    );
    for (;;) {
        if (unsent.enumerate(clause)) break;
        unsent.update_field(retire_clause);
    }

    // step 2: mark the WU as TRANSITION_NO_NEW_RESULTS
    //
    sprintf(clause, "transitioner_flags=%d", TRANSITION_NO_NEW_RESULTS);
    int err = db_wu.update_field(clause);
    if (err) {
        return err;
    }

    // step 3: create an assignment record
    //
    assignment.clear();
    assignment.create_time = time(0);
    assignment.target_id = userid;
    assignment.target_type = ASSIGN_USER;
    assignment.multi = 0;
    assignment.workunitid = db_wu.id;
    err = assignment.insert();
    return err;
}
// Make one pass over the workunits (if do_input_files) and results
// (if do_output_files) whose file_delete_state is READY,
// or ERROR when retry_error is set.
// For each one, delete its files (unless the matching preserve_* flag
// is set) and record the outcome as FILE_DELETE_DONE or FILE_DELETE_ERROR.
// Optional filters on id modulus, batch and appid are added to both queries.
//
// Return true if we changed the file_delete_state of a WU or a result.
// Exits the process if an enumeration fails with anything other than
// ERR_DB_NOT_FOUND.
//
bool do_pass(bool retry_error) {
    DB_WORKUNIT wu;
    DB_RESULT result;
    bool did_something = false;
    // The enumeration query and the per-row update clause live in
    // separate buffers: the query is handed to enumerate() again on every
    // iteration, so it must not be overwritten by the update clause.
    //
    char query[256];
    char update_clause[256];
    char clause[256];
    int retval, new_state;

    check_stop_daemons();

    // build the optional filters shared by both queries
    //
    strcpy(clause, "");
    if (id_modulus) {
        sprintf(clause, " and id %% %d = %d ", id_modulus, id_remainder);
    }
    if (dont_delete_batches) {
        strcat(clause, " and batch <= 0 ");
    }
    if (appid) {
        char appid_clause[64];
        sprintf(appid_clause, " and appid = %d ", appid);
        strcat(clause, appid_clause);
    }

    // input files: one row per workunit
    //
    sprintf(query,
        "where file_delete_state=%d %s limit %d",
        retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
        clause, WUS_PER_ENUM
    );

    while (do_input_files) {
        retval = wu.enumerate(query);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
                exit(0);
            }
            break;
        }

        if (preserve_wu_files) {
            retval = 0;
        } else {
            retval = wu_delete_files(wu);
        }
        if (retval) {
            new_state = FILE_DELETE_ERROR;
            log_messages.printf(MSG_CRITICAL,
                "[WU#%d] file deletion failed: %s\n", wu.id, boincerror(retval)
            );
        } else {
            new_state = FILE_DELETE_DONE;
        }
        // only touch the DB when the state actually changes
        // (e.g. a retry that fails again stays in ERROR)
        //
        if (new_state != wu.file_delete_state) {
            sprintf(update_clause, "file_delete_state=%d", new_state);
            retval = wu.update_field(update_clause);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d] update failed: %s\n", wu.id, boincerror(retval)
                );
            } else {
                log_messages.printf(MSG_DEBUG,
                    "[WU#%d] file_delete_state updated\n", wu.id
                );
                did_something = true;
            }
        }
    }

    // output files: one row per result
    //
    sprintf(query,
        "where file_delete_state=%d %s limit %d",
        retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
        clause, RESULTS_PER_ENUM
    );

    while (do_output_files) {
        retval = result.enumerate(query);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
                exit(0);
            }
            break;
        }

        if (preserve_result_files) {
            retval = 0;
        } else {
            retval = result_delete_files(result);
        }
        if (retval) {
            new_state = FILE_DELETE_ERROR;
            log_messages.printf(MSG_CRITICAL,
                "[RESULT#%d] file deletion failed: %s\n", result.id, boincerror(retval)
            );
        } else {
            new_state = FILE_DELETE_DONE;
        }
        if (new_state != result.file_delete_state) {
            sprintf(update_clause, "file_delete_state=%d", new_state);
            retval = result.update_field(update_clause);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%d] update failed: %s\n", result.id, boincerror(retval)
                );
            } else {
                log_messages.printf(MSG_DEBUG,
                    "[RESULT#%d] file_delete_state updated\n", result.id
                );
                did_something = true;
            }
        }
    }

    return did_something;
}
// Send non-multi assigned jobs.
// Enumerates the non-multi assignments targeted at the replying user
// and, for each one that passes the checks below, sends the job and
// pulls the WU's transition time forward so the job can time out.
// Returns true if at least one job was sent.
//
bool send_assigned_jobs() {
    DB_ASSIGNMENT asg;
    DB_RESULT result;
    DB_WORKUNIT wu;
    bool sent_something = false;
    int retval;

    // The enumeration clause is passed to enumerate() on every iteration,
    // so it gets its own buffer; the per-assignment lookups use
    // lookup_clause instead of overwriting it.
    //
    char query[256];
    char lookup_clause[256];

    // for now, only look for user assignments
    //
    sprintf(query, "where target_type=%d and target_id=%d and multi=0",
        ASSIGN_USER, g_reply->user.id
    );
    while (!asg.enumerate(query)) {
        if (!work_needed(false)) continue;

        // if the WU doesn't exist, delete the assignment record.
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) {
            asg.delete_from_db();
            continue;
        }
        // don't send if WU is validation pending or completed,
        // or has transition pending
        //
        if (wu.need_validate) continue;
        if (wu.canonical_resultid) continue;
        if (wu.transition_time < time(0)) continue;

        // don't send if we already sent one to this host
        // (any lookup outcome other than "not found" skips the job)
        //
        sprintf(lookup_clause, "where workunitid=%d and hostid=%d",
            asg.workunitid,
            g_request->host.id
        );
        retval = result.lookup(lookup_clause);
        if (retval != ERR_DB_NOT_FOUND) continue;

        // don't send if there's already one in progress to this user
        //
        sprintf(lookup_clause,
            "where workunitid=%d and userid=%d and server_state=%d",
            asg.workunitid,
            g_reply->user.id,
            RESULT_SERVER_STATE_IN_PROGRESS
        );
        retval = result.lookup(lookup_clause);
        if (retval != ERR_DB_NOT_FOUND) continue;

        // OK, send the job
        //
        retval = send_assigned_job(asg);
        if (retval) continue;

        sent_something = true;

        // update the WU's transition time to time out this job;
        // re-read the WU first, and only ever move the time earlier
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) continue;
        int new_tt = (int)(time(0) + wu.delay_bound);
        if (new_tt < wu.transition_time) {
            char update_clause[256];
            sprintf(update_clause, "transition_time=%d", new_tt);
            wu.update_field(update_clause);
        }
    }
    return sent_something;
}
Пример #8
0
// Assimilate all WUs of the given app that need it
// (assimilate_state == ASSIMILATE_READY, optionally restricted by
// WU id modulus, at most one_pass_N_WU or 1000 per pass).
// For each WU, collect its results, hand them to assimilate_handler(),
// and, if update_db is set, record the new assimilate state.
// Return nonzero (true) if did anything.
//
// Exits the process if an enumeration fails with anything other than
// ERR_DB_NOT_FOUND, if the handler returns an error other than
// DEFER_ASSIMILATION, or if the final WU update fails.
//
bool do_pass(APP& app) {
    DB_WORKUNIT wu;
    DB_RESULT canonical_result, result;
    bool did_something = false;
    // Each enumeration keeps its clause in a dedicated buffer, separate
    // from the update clauses: wu_query is passed to wu.enumerate() again
    // on every outer iteration, so nothing inside the loop may overwrite it.
    //
    char wu_query[256];
    char result_query[256];
    char update_clause[256];
    char mod_clause[256];
    int retval;
    int num_assimilated=0;

    check_stop_daemons();

    if (wu_id_modulus) {
        sprintf(mod_clause, " and workunit.id %% %d = %d ",
            wu_id_modulus, wu_id_remainder
        );
    } else {
        strcpy(mod_clause, "");
    }

    sprintf(wu_query,
        "where appid=%d and assimilate_state=%d %s limit %d",
        app.id, ASSIMILATE_READY, mod_clause,
        one_pass_N_WU ? one_pass_N_WU : 1000
    );
    while (1) {
        retval = wu.enumerate(wu_query);
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_DEBUG,
                    "DB connection lost, exiting\n"
                );
                exit(0);
            }
            break;
        }
        vector<RESULT> results;     // must be inside while()!

        // for testing purposes, pretend we did nothing
        //
        if (update_db) {
            did_something = true;
        }

        log_messages.printf(MSG_DEBUG,
            "[%s] assimilating WU %d; state=%d\n", wu.name, wu.id, wu.assimilate_state
        );

        // gather all results of this WU, remembering the canonical one
        //
        sprintf(result_query, "where workunitid=%d", wu.id);
        canonical_result.clear();
        bool found = false;
        while (1) {
            retval = result.enumerate(result_query);
            if (retval) {
                if (retval != ERR_DB_NOT_FOUND) {
                    log_messages.printf(MSG_DEBUG,
                        "DB connection lost, exiting\n"
                    );
                    exit(0);
                }
                break;
            }
            results.push_back(result);
            if (result.id == wu.canonical_resultid) {
                canonical_result = result;
                found = true;
            }
        }

        // If no canonical result found and WU had no other errors,
        // something is wrong, e.g. result records got deleted prematurely.
        // This is probably unrecoverable, so mark the WU as having
        // an assimilation error and keep going.
        //
        if (!found && !wu.error_mask) {
            log_messages.printf(MSG_CRITICAL,
                "[%s] no canonical result\n", wu.name
            );
            wu.error_mask = WU_ERROR_NO_CANONICAL_RESULT;
            sprintf(update_clause, "error_mask=%d", wu.error_mask);
            wu.update_field(update_clause);
        }

        retval = assimilate_handler(wu, results, canonical_result);
        if (retval && retval != DEFER_ASSIMILATION) {
            log_messages.printf(MSG_CRITICAL,
                "[%s] handler error: %s; exiting\n", wu.name, boincerror(retval)
            );
            exit(retval);
        }

        if (update_db) {
            // Defer assimilation until next result is returned
            int assimilate_state = ASSIMILATE_DONE;
            if (retval == DEFER_ASSIMILATION) {
                assimilate_state = ASSIMILATE_INIT;
            }
            sprintf(
                update_clause, "assimilate_state=%d, transition_time=%d",
                assimilate_state, (int)time(0)
            );
            retval = wu.update_field(update_clause);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[%s] update failed: %s\n", wu.name, boincerror(retval)
                );
                exit(1);
            }
        }

        num_assimilated++;

    }

    if (did_something) {
        boinc_db.commit_transaction();
    }

    if (num_assimilated)  {
        log_messages.printf(MSG_NORMAL,
            "Assimilated %d workunits.\n", num_assimilated
        );
    }

    return did_something;
}
Пример #9
0
// Return the value that follows option argv[i], advancing i onto it.
// Prints an error and exits if the option is the last argument,
// rather than reading past the end of argv.
//
static const char* cw_option_value(int argc, const char** argv, int& i) {
    if (i + 1 >= argc) {
        fprintf(stderr,
            "create_work: missing value for argument '%s'\n", argv[i]
        );
        exit(1);
    }
    return argv[++i];
}

// Copy src into the fixed-size buffer dst of dst_size bytes.
// Prints an error naming 'what' and exits if src doesn't fit.
//
static void cw_copy_checked(
    char* dst, size_t dst_size, const char* src, const char* what
) {
    if (strlen(src) >= dst_size) {
        fprintf(stderr, "create_work: %s is too long\n", what);
        exit(1);
    }
    strcpy(dst, src);
}

// Check the return value n of an snprintf() into a dst_size-byte buffer.
// Prints an error naming 'what' and exits on failure or truncation.
//
static void cw_check_formatted(int n, size_t dst_size, const char* what) {
    if (n < 0 || (size_t)n >= dst_size) {
        fprintf(stderr, "create_work: %s is too long\n", what);
        exit(1);
    }
}

// Command-line tool that creates one workunit.
// Parses the options, reads the project config and opens the DB,
// looks up the app by name, reads the input template and calls
// create_work(). The first non-option argument and everything after it
// are taken as the input file list.
// If an assignment option (broadcast*/target_*) was given, it also
// inserts an assignment record and sets the WU's transitioner_flags.
// Every failure is reported on stderr and exits with status 1.
//
int main(int argc, const char** argv) {
    DB_APP app;
    DB_WORKUNIT wu;
    int retval;
    char wu_template[BLOB_SIZE];
    char wu_template_file[256], result_template_file[256], result_template_path[MAXPATHLEN];
    const char* command_line=NULL;
    const char** infiles = NULL;
    int i, ninfiles;
    char db_name[256], db_passwd[256];
    char db_user[256],db_host[256];
    char buf[512];
    char additional_xml[256];
    bool show_wu_name = true;
    bool assign_flag = false;
    bool assign_multi = false;
    int assign_id = 0;
    int assign_type = ASSIGN_NONE;

    strcpy(wu_template_file, "");
    strcpy(result_template_file, "");
    strcpy(app.name, "");
    strcpy(db_passwd, "");
    strcpy(additional_xml, "");
    const char* config_dir = 0;
    i = 1;
    ninfiles = 0;
    wu.clear();

    // defaults (in case they're not in WU template)

    wu.id = 0;
    wu.min_quorum = 2;
    wu.target_nresults = 2;
    wu.max_error_results = 3;
    wu.max_total_results = 10;
    wu.max_success_results = 6;
    wu.rsc_fpops_est = 3600e9;
    wu.rsc_fpops_bound =  86400e9;
    wu.rsc_memory_bound = 5e8;
    wu.rsc_disk_bound = 1e9;
    wu.rsc_bandwidth_bound = 0.0;
    wu.delay_bound = 7*86400;

    // Options that take a value fetch it through cw_option_value(), so a
    // trailing option without a value is reported instead of dereferenced.
    // String values are length-checked before being copied into
    // fixed-size buffers (app.name and wu.name are used as char arrays here).
    //
    while (i < argc) {
        if (arg(argv, i, "appname")) {
            cw_copy_checked(
                app.name, sizeof(app.name),
                cw_option_value(argc, argv, i), "appname"
            );
        } else if (arg(argv, i, "d")) {
            int dl = atoi(cw_option_value(argc, argv, i));
            log_messages.set_debug_level(dl);
            if (dl ==4) g_print_queries = true;
        } else if (arg(argv, i, "wu_name")) {
            show_wu_name = false;
            cw_copy_checked(
                wu.name, sizeof(wu.name),
                cw_option_value(argc, argv, i), "wu_name"
            );
        } else if (arg(argv, i, "wu_template")) {
            cw_copy_checked(
                wu_template_file, sizeof(wu_template_file),
                cw_option_value(argc, argv, i), "wu_template"
            );
        } else if (arg(argv, i, "result_template")) {
            cw_copy_checked(
                result_template_file, sizeof(result_template_file),
                cw_option_value(argc, argv, i), "result_template"
            );
        } else if (arg(argv, i, "batch")) {
            wu.batch = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "config_dir")) {
            config_dir = cw_option_value(argc, argv, i);
        } else if (arg(argv, i, "priority")) {
            wu.priority = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "rsc_fpops_est")) {
            wu.rsc_fpops_est = atof(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "rsc_fpops_bound")) {
            wu.rsc_fpops_bound = atof(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "rsc_memory_bound")) {
            wu.rsc_memory_bound = atof(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "rsc_disk_bound")) {
            wu.rsc_disk_bound = atof(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "delay_bound")) {
            wu.delay_bound = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "min_quorum")) {
            wu.min_quorum = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "target_nresults")) {
            wu.target_nresults = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "max_error_results")) {
            wu.max_error_results = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "max_total_results")) {
            wu.max_total_results = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "max_success_results")) {
            wu.max_success_results = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "opaque")) {
            wu.opaque = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "command_line")) {
            command_line = cw_option_value(argc, argv, i);
        } else if (arg(argv, i, "additional_xml")) {
            cw_copy_checked(
                additional_xml, sizeof(additional_xml),
                cw_option_value(argc, argv, i), "additional_xml"
            );
        } else if (arg(argv, i, "wu_id")) {
            wu.id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "broadcast")) {
            assign_multi = true;
            assign_flag = true;
            assign_type = ASSIGN_NONE;
        } else if (arg(argv, i, "broadcast_user")) {
            assign_flag = true;
            assign_type = ASSIGN_USER;
            assign_multi = true;
            assign_id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "broadcast_team")) {
            assign_flag = true;
            assign_type = ASSIGN_TEAM;
            assign_multi = true;
            assign_id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "target_host")) {
            assign_flag = true;
            assign_type = ASSIGN_HOST;
            assign_id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "target_user")) {
            assign_flag = true;
            assign_type = ASSIGN_USER;
            assign_id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "target_team")) {
            assign_flag = true;
            assign_type = ASSIGN_TEAM;
            assign_id = atoi(cw_option_value(argc, argv, i));
        } else if (arg(argv, i, "help")) {
            usage();
            exit(0);
        } else {
            if (!strncmp("-", argv[i], 1)) {
                fprintf(stderr, "create_work: bad argument '%s'\n", argv[i]);
                exit(1);
            }
            // first non-option argument: the rest are input files
            //
            infiles = argv+i;
            ninfiles = argc - i;
            break;
        }
        i++;
    }

    // an app name is mandatory; don't carry on with an empty one
    //
    if (!strlen(app.name)) {
        usage();
        exit(1);
    }

    // fill in names that weren't given, derived from the app name
    //
    if (!strlen(wu.name)) {
        cw_check_formatted(
            snprintf(wu.name, sizeof(wu.name),
                "%s_%d_%f", app.name, getpid(), dtime()
            ),
            sizeof(wu.name), "generated workunit name"
        );
    }
    if (!strlen(wu_template_file)) {
        cw_check_formatted(
            snprintf(wu_template_file, sizeof(wu_template_file),
                "templates/%s_in", app.name
            ),
            sizeof(wu_template_file), "default input template path"
        );
    }
    if (!strlen(result_template_file)) {
        cw_check_formatted(
            snprintf(result_template_file, sizeof(result_template_file),
                "templates/%s_out", app.name
            ),
            sizeof(result_template_file), "default result template path"
        );
    }

    retval = config.parse_file(config_dir);
    if (retval) {
        fprintf(stderr, "Can't parse config file: %s\n", boincerror(retval));
        exit(1);
    }
    cw_copy_checked(db_name, sizeof(db_name), config.db_name, "config db_name");
    cw_copy_checked(db_passwd, sizeof(db_passwd), config.db_passwd, "config db_passwd");
    cw_copy_checked(db_user, sizeof(db_user), config.db_user, "config db_user");
    cw_copy_checked(db_host, sizeof(db_host), config.db_host, "config db_host");

    retval = boinc_db.open(db_name, db_host, db_user, db_passwd);
    if (retval) {
        fprintf(stderr,
            "create_work: error opening database: %s\n", boincerror(retval)
        );
        exit(1);
    }

    // NOTE(review): app.name comes straight from the command line and is
    // placed in the query unescaped; a name containing a quote yields a
    // malformed or altered query. Escape it if this tool can ever be
    // driven by untrusted input.
    //
    cw_check_formatted(
        snprintf(buf, sizeof(buf), "where name='%s'", app.name),
        sizeof(buf), "app lookup query"
    );
    retval = app.lookup(buf);
    if (retval) {
        fprintf(stderr, "create_work: app not found\n");
        exit(1);
    }

    retval = read_filename(wu_template_file, wu_template, sizeof(wu_template));
    if (retval) {
        fprintf(stderr,
            "create_work: can't open input template %s\n", wu_template_file
        );
        exit(1);
    }

    wu.appid = app.id;

    cw_check_formatted(
        snprintf(result_template_path, sizeof(result_template_path),
            "./%s", result_template_file
        ),
        sizeof(result_template_path), "result template path"
    );
    retval = create_work(
        wu,
        wu_template,
        result_template_file,
        result_template_path,
        const_cast<const char **>(infiles),
        ninfiles,
        config,
        command_line,
        additional_xml
    );
    if (retval) {
        fprintf(stderr, "create_work: %s\n", boincerror(retval));
        exit(1);
    } else {
        // the name is echoed only when it was generated here,
        // i.e. not supplied with the wu_name option
        //
        if (show_wu_name) {
            printf("workunit name: %s\n", wu.name);
        }
    }
    if (assign_flag) {
        DB_ASSIGNMENT assignment;
        assignment.clear();
        assignment.create_time = time(0);
        assignment.target_id = assign_id;
        assignment.target_type = assign_type;
        assignment.multi = assign_multi;
        assignment.workunitid = wu.id;
        retval = assignment.insert();
        if (retval) {
            fprintf(stderr,
                "assignment.insert() failed: %s\n", boincerror(retval)
            );
            exit(1);
        }
        sprintf(buf, "transitioner_flags=%d",
            assign_multi?TRANSITION_NONE:TRANSITION_NO_NEW_RESULTS
        );
        retval = wu.update_field(buf);
        if (retval) {
            fprintf(stderr, "wu.update() failed: %s\n", boincerror(retval));
            exit(1);
        }
    }
    boinc_db.close();
    return 0;
}
Пример #10
0
int handle_wu(
    DB_TRANSITIONER_ITEM_SET& transitioner,
    std::vector<TRANSITIONER_ITEM>& items
) {
    int ntotal, nerrors, retval, ninprogress, nsuccess;
    int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply;
    int canonical_result_index, j;
    char suffix[256];
    time_t now = time(0), x;
    bool all_over_and_validated, have_new_result_to_validate, do_delete;
    unsigned int i;

    TRANSITIONER_ITEM& wu_item = items[0];
    TRANSITIONER_ITEM wu_item_original = wu_item;

    // "assigned" WUs aren't supposed to pass through the transitioner.
    // If we get one, it's an error
    //
    if (config.enable_assignment && strstr(wu_item.name, ASSIGNED_WU_STR)) {
        DB_WORKUNIT wu;
        char buf[256];

        wu.id = wu_item.id;
        log_messages.printf(MSG_CRITICAL,
            "Assigned WU %d unexpectedly found by transitioner\n", wu.id
        );
        sprintf(buf, "transition_time=%d", INT_MAX);
        retval = wu.update_field(buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "update_field failed: %s\n", boincerror(retval)
            );
        }
        return 0;
    }

    // count up the number of results in various states,
    // and check for timed-out results
    //
    ntotal = 0;
    nunsent = 0;
    ninprogress = 0;
    nover = 0;
    nerrors = 0;
    nsuccess = 0;
        // not counting invalid results!!!!
    ncouldnt_send = 0;
    nno_reply = 0;
    ndidnt_need = 0;
    have_new_result_to_validate = false;
    int rs, max_result_suffix = -1;

    // Scan the WU's results, and find the canonical result if there is one
    //
    canonical_result_index = -1;
    if (wu_item.canonical_resultid) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            if (res_item.res_id == wu_item.canonical_resultid) {
                canonical_result_index = i;
            }
        }
    }

    if (wu_item.canonical_resultid && (canonical_result_index == -1)) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] can't find canonical result\n",
            wu_item.id, wu_item.name
        );
    }

    // if there is a canonical result, see if its file are deleted
    //
    bool canonical_result_files_deleted = false;
    if (canonical_result_index >= 0) {
        TRANSITIONER_ITEM& cr = items[canonical_result_index];
        if (cr.res_file_delete_state == FILE_DELETE_DONE) {
            canonical_result_files_deleted = true;
        }
    }

    // Scan this WU's results, and
    // 1) count those in various server states;
    // 2) identify timed-out results and update their server state and outcome
    // 3) find the max result suffix (in case need to generate new ones)
    // 4) see if we have a new result to validate
    //    (outcome SUCCESS and validate_state INIT)
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];

        if (!res_item.res_id) continue;
        ntotal++;

        rs = result_suffix(res_item.res_name);
        if (rs > max_result_suffix) max_result_suffix = rs;

        switch (res_item.res_server_state) {
        case RESULT_SERVER_STATE_UNSENT:
            nunsent++;
            break;
        case RESULT_SERVER_STATE_IN_PROGRESS:
            if (res_item.res_report_deadline < now) {
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n",
                    wu_item.id, wu_item.name, res_item.res_id,
                    res_item.res_name,
                    res_item.res_report_deadline, (int)now
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_NO_REPLY;
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
                retval = result_timed_out(res_item, wu_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "result_timed_out() error: %s\n", boincerror(retval)
                    );
                    exit(1);
                }
                nover++;
                nno_reply++;
            } else {
                ninprogress++;
            }
            break;
        case RESULT_SERVER_STATE_OVER:
            nover++;
            switch (res_item.res_outcome) {
            case RESULT_OUTCOME_COULDNT_SEND:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] result couldn't be sent\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                ncouldnt_send++;
                break;
            case RESULT_OUTCOME_SUCCESS:
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    if (canonical_result_files_deleted) {
                        res_item.res_validate_state = VALIDATE_STATE_TOO_LATE;
                        retval = transitioner.update_result(res_item);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%d %s] [RESULT#%d %s] update_result(): %s\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name, boincerror(retval)
                            );
                        } else {
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%d %s] [RESULT#%d %s] validate_state:INIT=>TOO_LATE\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name
                            );
                        }
                    } else {
                        have_new_result_to_validate = true;
                    }
                }
                // don't count invalid results as successful
                //
                if (res_item.res_validate_state != VALIDATE_STATE_INVALID) {
                    nsuccess++;
                }
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
            case RESULT_OUTCOME_VALIDATE_ERROR:
                nerrors++;
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                nno_reply++;
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                ndidnt_need++;
                break;
            }
            break;
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n",
        wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover,
        nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need
    );

    // if there's a new result to validate, trigger validation
    //
    if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) {
        wu_item.need_validate = true;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] need_validate:=>true\n", wu_item.id, wu_item.name
        );
    }

    // check for WU error conditions
    // NOTE: check on max # of success results is done in validater
    //
    if (ncouldnt_send > 0) {
        wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
    }

    // if WU has results with errors and no success yet,
    // reset homogeneous redundancy class to give other platforms a try;
    // also reset app version ID if using HAV
    //
    if (nerrors && !(nsuccess || ninprogress)) {
        wu_item.hr_class = 0;
        wu_item.app_version_id = 0;
    }

    if (nerrors > wu_item.max_error_results) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many errors (%d errors for %d results)\n",
            wu_item.id, wu_item.name, nerrors, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
    }

    // see how many new results we need to make
    //
    int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess;
    if (n_new_results_needed < 0) n_new_results_needed = 0;
    int n_new_results_allowed = wu_item.max_total_results - ntotal;

    // if we're already at the limit and need more, error out the WU
    //
    bool too_many = false;
    if (n_new_results_allowed < 0) {
        too_many = true;
    } else if (n_new_results_allowed == 0) {
        if (n_new_results_needed > 0) {
            too_many = true;
        }
    } else {
        if (n_new_results_needed > n_new_results_allowed) {
            n_new_results_needed = n_new_results_allowed;
        }
    }
    if (too_many) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] WU has too many total results (%d)\n",
            wu_item.id, wu_item.name, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS;
    }

    // if this WU had an error, don't send any unsent results,
    // and trigger assimilation if needed
    //
    if (wu_item.error_mask) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            bool update_result = false;
            switch(res_item.res_server_state) {
            case RESULT_SERVER_STATE_UNSENT:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%d %s] [RESULT#%d %s] server_state:UNSENT=>OVER; outcome:=>DIDNT_NEED\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED;
                update_result = true;
                break;
            case RESULT_SERVER_STATE_OVER:
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_SUCCESS:
                    switch(res_item.res_validate_state) {
                    case VALIDATE_STATE_INIT:
                    case VALIDATE_STATE_INCONCLUSIVE:
                        res_item.res_validate_state = VALIDATE_STATE_NO_CHECK;
                        update_result = true;
                        break;
                    }
                }
            }
            if (update_result) {
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
            }
        }
        if (wu_item.assimilate_state == ASSIMILATE_INIT) {
            wu_item.assimilate_state = ASSIMILATE_READY;
            log_messages.printf(MSG_NORMAL,
                "[WU#%d %s] error_mask:%d assimilate_state:INIT=>READY\n",
                wu_item.id, wu_item.name, wu_item.error_mask
            );
        }
    } else if (wu_item.canonical_resultid == 0) {
        // Here if no WU-level error.
        // Generate new results if needed.
        //
        std::string values;
        char value_buf[MAX_QUERY_LEN];
        if (n_new_results_needed > 0) {
            log_messages.printf(
                MSG_NORMAL,
                "[WU#%d %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n",
                wu_item.id, wu_item.name, n_new_results_needed,
                wu_item.target_nresults, nunsent, ninprogress, nsuccess
            );
            for (j=0; j<n_new_results_needed; j++) {
                sprintf(suffix, "%d", max_result_suffix+j+1);
                const char *rtfpath = config.project_path("%s", wu_item.result_template_file);
                int priority_increase = 0;
                if (nover && config.reliable_priority_on_over) {
                    priority_increase += config.reliable_priority_on_over;
                } else if (nover && !nerrors && config.reliable_priority_on_over_except_error) {
                    priority_increase += config.reliable_priority_on_over_except_error;
                }
                retval = create_result_ti(
                    wu_item, (char *)rtfpath, suffix, key, config, value_buf, priority_increase
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d %s] create_result_ti(): %s\n",
                        wu_item.id, wu_item.name, boincerror(retval)
                    );
                    return retval;
                }
                if (j==0) {
                    values = value_buf;
                } else {
                    values += ",";
                    values += value_buf;
                }
            }
            DB_RESULT r;
            retval = r.insert_batch(values);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%d %s] insert_batch(): %s\n",
                    wu_item.id, wu_item.name, boincerror(retval)
                );
                return retval;
            }
        }
    }

    // scan results:
    //  - see if all over and validated
    //
    all_over_and_validated = true;
    bool all_over_and_ready_to_assimilate = true;
        // used for the defer assimilation
    double most_recently_returned = 0;
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_OVER) {
            if (res_item.res_received_time > most_recently_returned) {
                most_recently_returned = res_item.res_received_time;
            }
            if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) {
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    all_over_and_validated = false;
                    all_over_and_ready_to_assimilate = false;
                }
            } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) {
                if (now < res_item.res_report_deadline) {
                    all_over_and_validated = false;
                }
            }
        } else {
            all_over_and_validated = false;
            all_over_and_ready_to_assimilate = false;
        }
    }

    // If we are deferring assimilation until all results are over and validated,
    // when that happens make sure that WU state is advanced to assimilate ready
    // the items.size is a kludge
    //
    if (all_over_and_ready_to_assimilate
        && wu_item.assimilate_state == ASSIMILATE_INIT
        && items.size() > 0
        && wu_item.canonical_resultid > 0
    ) {
        wu_item.assimilate_state = ASSIMILATE_READY;
        log_messages.printf(MSG_NORMAL,
            "[WU#%d %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n",
            wu_item.id, wu_item.name
        );
    }

    // if WU is assimilated, trigger file deletion
    //
    double deferred_file_delete_time = 0;
    if (wu_item.assimilate_state == ASSIMILATE_DONE) {
        if (now >= (most_recently_returned + config.delete_delay)) {
            // can delete input files if all results OVER
            //
            if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) {
                wu_item.file_delete_state = FILE_DELETE_READY;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%d %s] ASSIMILATE_DONE: file_delete_state:=>READY\n",
                    wu_item.id, wu_item.name
                );
            }

            // output of error results can be deleted immediately;
            // output of success results can be deleted if validated
            //
            for (i=0; i<items.size(); i++) {
                TRANSITIONER_ITEM& res_item = items[i];

                // can delete canonical result outputs only if all successful
                // results have been validated
                //
                if (((int)i == canonical_result_index) && !all_over_and_validated) {
                    continue;
                }

                if (!res_item.res_id) continue;
                do_delete = false;
                switch(res_item.res_outcome) {
                case RESULT_OUTCOME_CLIENT_ERROR:
                    do_delete = true;
                    break;
                case RESULT_OUTCOME_SUCCESS:
                    do_delete = (res_item.res_validate_state != VALIDATE_STATE_INIT);
                    break;
                }
                if (do_delete && res_item.res_file_delete_state == FILE_DELETE_INIT) {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%d %s] [RESULT#%d %s] file_delete_state:=>READY\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                    );
                    res_item.res_file_delete_state = FILE_DELETE_READY;

                    retval = transitioner.update_result(res_item);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%d %s] [RESULT#%d %s] result.update(): %s\n",
                            wu_item.id, wu_item.name, res_item.res_id,
                            res_item.res_name, boincerror(retval)
                        );
                    }
                }
            }
        } else {
            deferred_file_delete_time = most_recently_returned + config.delete_delay;
            log_messages.printf(MSG_DEBUG,
                "[WU#%d %s] deferring file deletion for %.0f seconds\n",
                wu_item.id,
                wu_item.name,
                deferred_file_delete_time - now
            );
        }
    }

    // Compute next transition time.
    // This is the min of
    // - timeouts of in-progress results
    // - deferred file deletion time
    // - safety net
    //
    // It is then adjusted to deal with transitioner congestion
    //
    if (wu_item.canonical_resultid || wu_item.error_mask) {
        wu_item.transition_time = INT_MAX;
    } else {
        // Safety net: if there is no canonical result and no WU-level error,
        // make sure that the transitioner will process this WU again.
        // In principle this is not needed,
        // but it makes the BOINC back-end more robust.
        //
        const int ten_days = 10*86400;
        int long_delay = (int)(1.5*wu_item.delay_bound);
        wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days;
        wu_item.transition_time += time(0);
    }

    // handle timeout of in-progress results
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
            x = res_item.res_report_deadline;
            if (x < wu_item.transition_time) {
                wu_item.transition_time = x;
            }
        }
    }

    // handle deferred file deletion
    //
    if (deferred_file_delete_time
        && deferred_file_delete_time < wu_item.transition_time
    ) {
        wu_item.transition_time = deferred_file_delete_time;
    }

    // Handle transitioner overload.
    // If transition time is in the past,
    // the system is bogged down and behind schedule.
    // Delay processing of the WU by an amount DOUBLE the amount we are behind,
    // but not less than 60 secs or more than one day.
    //
    if (wu_item.transition_time < now) {
        int extra_delay = 2*(now - wu_item.transition_time);
        if (extra_delay < 60) extra_delay = 60;
        if (extra_delay > 86400) extra_delay = 86400;
        log_messages.printf(MSG_DEBUG,
            "[WU#%d %s] transition time in past: adding extra delay %d sec\n",
            wu_item.id, wu_item.name, extra_delay
        );
        wu_item.transition_time = now + extra_delay;
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%d %s] setting transition_time to %d\n",
        wu_item.id, wu_item.name, wu_item.transition_time
    );

    retval = transitioner.update_workunit(wu_item, wu_item_original);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%d %s] workunit.update(): %s\n",
            wu_item.id, wu_item.name, boincerror(retval)
        );
        return retval;
    }
    return 0;
}
Пример #11
0
// Send targeted jobs of a given type.
// NOTE: there may be an atomicity problem in the following.
// Ideally it should be in a transaction.
//
bool send_jobs(int assign_type) {
    DB_ASSIGNMENT asg;
    DB_RESULT result;
    DB_WORKUNIT wu;
    int retval;
    bool sent_something = false;
    char query[256];

    switch (assign_type) {
    case ASSIGN_USER:
        sprintf(query, "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_USER, g_reply->user.id
        );
        break;
    case ASSIGN_HOST:
        sprintf(query, "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_HOST, g_reply->host.id
        );
        break;
    case ASSIGN_TEAM:
        sprintf(query, "where target_type=%d and target_id=%lu and multi=0",
            ASSIGN_TEAM, g_reply->team.id
        );
        break;
    }

    while (!asg.enumerate(query)) {
        if (!work_needed(false)) {
            asg.end_enumerate();
            break;
        }

        // if the WU doesn't exist, delete the assignment record.
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) {
            asg.delete_from_db();
            continue;
        }

        if (!need_targeted_instance(wu, g_reply->host.id)) {
            continue;
        }

        // OK, send the job
        //
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "sending targeted job: %s\n", wu.name
            );
        }
        retval = send_assigned_job(asg);
        if (retval) {
            log_messages.printf(MSG_NORMAL,
                "failed to send targeted job: %s\n", boincerror(retval)
            );
            continue;
        }

        sent_something = true;

        // update the WU's transition time to time out this job
        //
        retval = wu.lookup_id(asg.workunitid);
        if (retval) continue;
        int new_tt = time(0) + wu.delay_bound;
        if (new_tt < wu.transition_time) {
            char buf2[256];
            sprintf(buf2, "transition_time=%d", new_tt);
            wu.update_field(buf2);
        }
    }
    return sent_something;
}