Beispiel #1
0
/*
 * Block until the global failure counter `cur_epoch` reaches `signal_num`.
 *
 * comm        Communicator to watch for process failures.
 * signal_num  Value `cur_epoch` must reach before this function returns.
 *
 * Returns 0 once `cur_epoch >= signal_num`.
 *
 * `cur_epoch` is not modified here; the loop relies on the communicator's
 * error handler advancing it while MPI_Probe is blocked.
 */
int wait_for_signal(MPI_Comm comm, int signal_num) {
    int ret;
    MPI_Status status;

    while(cur_epoch < signal_num) {
        /*
         * MPI_Probe is a blocking call that will unblock when it encounters an
         * error on the indicated peer. By passing MPI_ANY_SOURCE, it will cause
         * MPI_Probe to unblock on the first un-recognized failed process.
         *
         * The error handler will be called before MPI_Probe returns.
         *
         * In this example, MPI_Probe will only return when there is an error
         * since we do not send messages in this example. But we check the
         * return code anyway, just for good form.
         */
        ret = MPI_Probe(MPI_ANY_SOURCE, TAG_FT_DETECT, comm, &status);

        if( MPI_SUCCESS != ret ) {
            /*
             * Report the function's return code. status.MPI_ERROR is only
             * filled in by the multiple-completion calls, so after a failed
             * MPI_Probe it holds an indeterminate value.
             */
            printf("%2d of %2d) MPI_Probe() Error: Some rank failed (error = %3d)\n",
                   mpi_mcw_rank, mpi_mcw_size,
                   ret);

            /* Recognize the failure and move on */
            OMPI_Comm_failure_ack(comm);
        }
        sleep(1);
    }

    return 0;
}
/*
 * Probe communicator `mcw` for failed processes.
 *
 * Installs mpiErrorHandler on `mcw`, then attempts MPI_Comm_dup as the
 * operation that exposes failures. If the duplicate fails, the failures are
 * acknowledged and the size of the acknowledged group is taken as the count.
 *
 * mcw  Communicator to test. Its error handler is replaced by this call.
 *
 * Returns the number of acknowledged failed processes, or 0 when the
 * duplication succeeded.
 */
int numProcsFails(MPI_Comm mcw){
        int ret, numFailures = 0;
        int haveFailedGroup = 0;   // fGroup holds a handle that must be freed
        int flag = 1;              // in/out for OMPI_Comm_agree: needs a defined value on entry
        MPI_Group fGroup;
        MPI_Errhandler newEh;
        MPI_Comm dupComm;

        // Error handler
        MPI_Comm_create_errhandler(mpiErrorHandler, &newEh);

        // Set error handler for communicator
        MPI_Comm_set_errhandler(mcw, newEh);

        // Target function (MPI_Comm_dup; MPI_Barrier was the alternative tried)
        ret = MPI_Comm_dup(mcw, &dupComm);
        if(MPI_SUCCESS != ret) {
           OMPI_Comm_failure_ack(mcw);
           OMPI_Comm_failure_get_acked(mcw, &fGroup);
           haveFailedGroup = 1;
           // Get the number of failures
           MPI_Group_size(fGroup, &numFailures);
        }// end of "MPI_Comm_dup failure"

        OMPI_Comm_agree(mcw, &flag);

        // Memory release
        // The group is released whenever it was obtained, even if it is empty.
        if(haveFailedGroup)
           MPI_Group_free(&fGroup);
        // The duplicate only served as a probe; release it so repeated calls
        // do not accumulate communicators.
        if(MPI_SUCCESS == ret)
           MPI_Comm_free(&dupComm);
        MPI_Errhandler_free(&newEh);

        return numFailures;
}//numProcsFails()
/*
 * Communicator error handler: acknowledges the failures known on *comm,
 * pauses briefly, and releases the acknowledged-failure group again.
 *
 * comm       Communicator on which the error was raised.
 * errorCode  Error code supplied by MPI; not inspected here
 *            (MPI_ERR_PROC_FAILED would denote a process failure).
 */
void mpiErrorHandler(MPI_Comm * comm, int *errorCode, ...){
    MPI_Group ackedFailures;

    /* Acknowledge the failures, then fetch the group of acknowledged ranks. */
    OMPI_Comm_failure_ack(*comm);
    OMPI_Comm_failure_get_acked(*comm, &ackedFailures);

    /*
     * 10 ms pause. Per the original author's notes, failed processes do not
     * get synchronized without it, it has to happen inside the error handler,
     * and MPI_Comm_revoke fails for large process counts when it is missing.
     */
    usleep(10 * 1000);

    MPI_Group_free(&ackedFailures);
}//mpiErrorHandler()
Beispiel #4
0
/*
 * Communicator error handler that reports failures and advances `cur_epoch`.
 *
 * comm        Communicator on which the error was raised.
 * error_code  Error code supplied by MPI. MPI_ERR_PROC_FAILED triggers the
 *             failure report; any other value is logged as "other error".
 *
 * For a process failure, `cur_epoch` is set to the number of acknowledged
 * failed processes; otherwise it is simply incremented by one.
 */
void mpi_error_handler(MPI_Comm *comm, int *error_code, ...)
{
    MPI_Group f_group;
    int num_failures;
    int loc_size;
    char * ranks_failed = NULL;

    MPI_Comm_size(*comm, &loc_size);

    /*
     * Will cause normal termination by unblocking the wait_for_signal() function
     * when all process failures have been reported.
     */
    cur_epoch++;
    printf("Handler !!\n");
    fflush(NULL);

    if(*error_code == MPI_ERR_PROC_FAILED ) {
        /* Access the local list of failures */
        OMPI_Comm_failure_ack(*comm);
        OMPI_Comm_failure_get_acked(*comm, &f_group);

        /* Get the number of failures */
        MPI_Group_size(f_group, &num_failures);

        /* Overrides the increment above with the absolute failure count. */
        cur_epoch = num_failures;

        ranks_failed = get_str_failed_procs(*comm, f_group);

        printf("%2d of %2d) Error Handler: (Comm = %s) %3d Failed Ranks: %s\n",
               mpi_mcw_rank, mpi_mcw_size,
               (mpi_mcw_size == loc_size ? "MCW" : "Subcomm"),
               num_failures, ranks_failed);

        free(ranks_failed);

        /* The acknowledged-failure group is a new handle on every call; release it. */
        MPI_Group_free(&f_group);
    } else {
        printf("%2d of %2d) Error Handler: Some Other error has occurred. (Comm = %s) [Count = %2d / %2d]\n",
               mpi_mcw_rank, mpi_mcw_size,
               (mpi_mcw_size == loc_size ? "MCW" : "Subcomm"), cur_epoch, max_signals );
    }

    /* Introduce a small delay to aid debugging */
    fflush(NULL);
    sleep(1);

    return;
}
/*
 * Communicator error handler: acknowledges the failures known on *comm,
 * flushes all output streams, pauses briefly, and releases the
 * acknowledged-failure group again.
 *
 * comm       Communicator on which the error was raised.
 * errorCode  Error code supplied by MPI; not inspected here.
 */
void mpiErrorHandler(MPI_Comm * comm, int *errorCode, ...) {
    MPI_Group ackedFailures;

    /* Acknowledge the failures, then fetch the group of acknowledged ranks. */
    OMPI_Comm_failure_ack(*comm);
    OMPI_Comm_failure_get_acked(*comm, &ackedFailures);

    /* Push out any pending output before pausing. */
    fflush(NULL);

    /*
     * 10 ms pause. The original author's observations:
     *  - without a delay the failed processes are not synchronized;
     *  - the delay only works when issued from the error handler;
     *  - 10 ms was the smallest value that worked for 200 processes on a
     *    dual-core machine (a 1 s sleep was the earlier choice);
     *  - without it MPI_Comm_revoke fails for a large number of processes and
     *    mostly hangs for more than one process failure, although a single
     *    process failure still works.
     */
    usleep(10 * 1000);

    MPI_Group_free(&ackedFailures);
}//mpiErrorHandler()
/*
 * Allocate room for `count` elements of `size` bytes for repairComm().
 * Terminates the process when the request overflows or cannot be satisfied.
 * A zero-sized request may legitimately yield NULL.
 */
static void * repairCommAlloc(size_t count, size_t size) {
    void * mem;

    if(size != 0 && count > ((size_t) -1) / size) {
        fprintf(stderr, "repairComm: allocation size overflow\n");
        exit(1);
    }
    mem = malloc(count * size);
    if(mem == NULL && count != 0 && size != 0) {
        fprintf(stderr, "repairComm: out of memory\n");
        exit(1);
    }
    return mem;
}

/*
 * Rebuild a full-size communicator after process failures.
 *
 * Steps: determine the failed ranks of *broken, shrink *broken to the
 * survivors, spawn one replacement process per failed rank, merge survivors
 * and replacements, tell each replacement which rank it replaces, and finally
 * split the merged communicator so that the ranks are ordered by the key
 * computed below. If MPI_ERR_PROC_FAILED or MPI_ERR_REVOKED is reported during
 * spawn, merge, send or split, the affected communicator is revoked and the
 * whole repair is started again.
 *
 * broken               Communicator containing failed processes.
 * repaired             Out: communicator produced by the final MPI_Comm_split.
 * iteration            Only used in log messages.
 * listFails            Out: receives the failed ranks (one entry per failure);
 *                      the caller must provide enough room.
 * numFails             Out: number of failed processes.
 * numNodeFails         Out: number of failed nodes; only written when
 *                      RECOV_ON_SPARE_NODES is defined.
 * sumPrevNumNodeFails  Node failures from earlier repairs; used when choosing
 *                      spare hostfile lines (RECOV_ON_SPARE_NODES).
 * argc, argv           argv[0] is the executable that is spawned and argv is
 *                      passed on as its argument vector; argv[argc] is set to NULL.
 * verbosity            > 0 prints a summary on rank 0, > 1 prints per-call status.
 *
 * Compile-time switches read here: GLOBAL_DETECTION, RECOV_ON_SPARE_NODES,
 * RUN_ON_COMPUTE_NODES, HANG_ON_REMOVE. SLOTS, MERGE_TAG and the helpers
 * getHostfileLastLineIndex(), getHostToLaunch() and rankIsNotOnFailedList()
 * are defined elsewhere.
 */
void repairComm(MPI_Comm * broken, MPI_Comm * repaired, int iteration, int * listFails, int * numFails,
                int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    MPI_Group oldGroup, failedGroup;
    int i, j, ret, rank, newRank, nprocs, oldGroupSize, procsNeeded = 0, rankKey = 0;
    int retry = 0; // set when a new failure interrupts the repair and it has to start over
    int * tempRanks = NULL, * failedRanks = NULL, * child = NULL, * shrinkMergeList = NULL;
    int * procsNeededToLaunch = NULL;
    char ** appToLaunch = NULL;
    char *** argvToLaunch = NULL;
    MPI_Info * hostInfoToLaunch = NULL;
    char ** hostNameToLaunch = NULL;
    double startTime = 0.0, shrinkTime, spawnTime, mergeTime, agreeTime;
    char hostName[128];
#ifdef GLOBAL_DETECTION
    MPI_Group shrinkGroup;
#endif
#ifdef HANG_ON_REMOVE
    int flag; // in/out argument of OMPI_Comm_agree; given a defined value before each call
#endif
#ifdef RECOV_ON_SPARE_NODES
    int hostfileLastLineIndex, tempLineIndex, totNodeFailed = 0, failedNodeCounter = 0;
    int * failedNodeList = NULL, * nodeList = NULL;
#endif

    gethostname(hostName, sizeof(hostName));

    MPI_Comm_rank(*broken, &rank);
    if(rank == 0)
        startTime = MPI_Wtime();

#ifndef GLOBAL_DETECTION
    // Local detection: the failed ranks are the ones acknowledged on *broken
    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    OMPI_Comm_failure_ack(*broken);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int *) repairCommAlloc((size_t) oldGroupSize, sizeof(int));
    failedRanks = (int *) repairCommAlloc((size_t) oldGroupSize, sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);
#endif

    shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove failed procs
    // NOTE(review): a failed shrink is only reported; the code below still uses tempShrink.
    if(MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else if(verbosity > 1 )
        printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

#ifdef GLOBAL_DETECTION
    // Global detection: the failed ranks are those missing from the shrunken communicator
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    // Computed unconditionally so failedGroup is always a valid handle
    // (an empty group when nothing is missing).
    MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int *) repairCommAlloc((size_t) oldGroupSize, sizeof(int));
    failedRanks = (int *) repairCommAlloc((size_t) oldGroupSize, sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);

    MPI_Group_free(&shrinkGroup);
#endif

    // Assign number of failed processes
    *numFails = procsNeeded;

    hostNameToLaunch = (char **) repairCommAlloc((size_t) procsNeeded, sizeof(char *));

    if(verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the corresponding "
               "communicator is %d ***\n", iteration, procsNeeded);

    if(rank == 0) {
        double endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);
    }

#ifdef RECOV_ON_SPARE_NODES
    // Determining total number of node failed, and a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); //started from 0
    nodeList = (int *) repairCommAlloc((size_t) (hostfileLastLineIndex+1), sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex+1)*sizeof(int)); // initialize nodeList with 0's

    // A node counts as failed when at least one failed rank maps to its hostfile line
    for(int failIdx = 0; failIdx < procsNeeded; ++failIdx) {
        tempLineIndex = failedRanks[failIdx]/SLOTS; //started from 0
        nodeList[tempLineIndex] = 1;
    }

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check if there is sufficient spare node available for recovery
    if((hostfileLastLineIndex - totNodeFailed -sumPrevNumNodeFails) < (oldGroupSize-1)/SLOTS) {
        if(rank == 0)
            printf("[%d] There is no sufficient spare node available for recovery.\n", rank);
        exit(0);
    }

    failedNodeList = (int *) repairCommAlloc((size_t) totNodeFailed, sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter) {
        if(nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;
    }
#endif

    // Serial on purpose: the loop calls helpers and prints, and every scratch
    // value is kept loop-local so nothing is shared between iterations.
    for(i = 0; i < procsNeeded; ++i) {
        char * hostNameFailed;
#ifdef RUN_ON_COMPUTE_NODES
        int failedLineIndex = failedRanks[i]/SLOTS; //started from 0
        // Default: recovery on the same node (no node failure, only process failure)
        int launchLineIndex = failedLineIndex;
#ifdef RECOV_ON_SPARE_NODES
        // The k-th failed node is replaced by the k-th unused line counted from the end of the hostfile
        for(int k = 0; k < totNodeFailed; ++k) {
            if(failedNodeList[k] == failedLineIndex)
                launchLineIndex = hostfileLastLineIndex - k - sumPrevNumNodeFails;
        }
#endif
        hostNameToLaunch[i] = getHostToLaunch(launchLineIndex);
        hostNameFailed = getHostToLaunch(failedLineIndex);
#else // Run on head node or personal machine
        hostNameToLaunch[i] = (char *)hostName;
        hostNameFailed = (char *)hostName;
#endif

        // Assign list of processes failed
        listFails[i] = failedRanks[i];

        if(verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s is failed! ---\n", iteration, failedRanks[i], hostNameFailed);

#ifdef RUN_ON_COMPUTE_NODES
        // Release this iteration's lookup. Assumes getHostToLaunch() returns
        // heap memory owned by the caller (the original code freed it as well)
        // - TODO confirm. In the other build the name is the local hostName
        // buffer and must never be passed to free().
        free(hostNameFailed);
#endif
    }

    appToLaunch = (char **) repairCommAlloc((size_t) procsNeeded, sizeof(char *));
    argvToLaunch = (char ***) repairCommAlloc((size_t) procsNeeded, sizeof(char **));
    procsNeededToLaunch = (int *) repairCommAlloc((size_t) procsNeeded, sizeof(int));
    hostInfoToLaunch = (MPI_Info *) repairCommAlloc((size_t) procsNeeded, sizeof(MPI_Info));
    argv[argc] = NULL;
    // Serial on purpose: MPI calls are made inside the loop.
    for(i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *)argv[0];
        argvToLaunch[i] = (char **)argv;
        procsNeededToLaunch[i] = 1;
        // Host information where to spawn the processes
        MPI_Info_create(&hostInfoToLaunch[i]);
        MPI_Info_set(hostInfoToLaunch[i], (char *)"host", hostNameToLaunch[i]);
    }

    spawnTime = MPI_Wtime();
#ifdef HANG_ON_REMOVE
    flag = 1;
    OMPI_Comm_agree(tempShrink, &flag);
#endif
    // Spawn the new process(es)
    ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch, argvToLaunch, procsNeededToLaunch,
                                  hostInfoToLaunch, 0, tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE);
    if(MPI_SUCCESS != ret) {
        if(MPI_ERR_COMM  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n", iteration);
        if(MPI_ERR_ARG  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n", iteration);
        if(MPI_ERR_INFO  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n", iteration);

        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            OMPI_Comm_revoke(tempShrink);
            retry = 1;
            goto cleanup;
        }
        fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n", iteration, ret);
        exit(1);
    }
    if(verbosity > 0 && rank == 0) {
        for(i = 0; i < procsNeeded; i++)
            printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed process %d on "
                   "node %s]: SUCCESS\n", iteration, failedRanks[i], hostNameToLaunch[i]);
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    mergeTime = MPI_Wtime();
    // Merge the new processes into a new communicator (high = 0 for the parents)
    ret = MPI_Intercomm_merge(tempIntercomm, 0, &unorderIntracomm);
    if(MPI_SUCCESS != ret) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(tempIntercomm);
            retry = 1;
            goto cleanup;
        }
        else if(MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n", iteration, ret);
            exit(1);
        }
        else if(MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Acquaring memory error in MPI_Intercomm_merge ()%d\n", iteration, ret);
            exit(1);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n", iteration, ret);
            exit(1);
        }
    }
    if(verbosity > 1 )
        printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    if (verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    agreeTime = MPI_Wtime();
    // Synchronize. sometimes hangs in without this
    // position of code and intercommunicator (not intra) is important
#ifdef HANG_ON_REMOVE
    flag = 1;
    OMPI_Comm_agree(tempIntercomm, &flag);// since some of the times MPI_Barrier hangs
#endif
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Sending failed ranks and number of processes failed to the the newly created ranks.
    // oldGroupSize is the size of communicator before failure.
    // procsNeeded is the number of processes that are failed
    // The replacements are addressed at the last procsNeeded ranks of the merged communicator.
    child = (int *) repairCommAlloc((size_t) procsNeeded, sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);
    if(newRank == 0) {
        int send_val[2];
        for(i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            ret = MPI_Send(send_val, 2, MPI_INT, child[i], MERGE_TAG, unorderIntracomm);
            if(MPI_SUCCESS != ret) {
                if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is a failure
                    OMPI_Comm_revoke(unorderIntracomm);
                    retry = 1;
                    goto cleanup;
                }
                fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n", iteration, ret);
                exit(1);
            }
            if(verbosity > 1 )
                printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);
        }
    }

    // Split the current world (splitted from original) to order the ranks.
    // For one or more process failure (ordering):
    // shrinkMergeList = [ranks not on the failed list, ascending] followed by [failed ranks].
    shrinkMergeList = (int *) repairCommAlloc((size_t) nprocs, sizeof(int));

    j = 0;
    for(i = 0; i < nprocs; i++) {
        if(rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;
    }

    for(i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i-j];

    // Only the first (nprocs - procsNeeded) merged ranks take their key from the list
    if(newRank >= 0 && newRank < (nprocs - procsNeeded))
        rankKey = shrinkMergeList[newRank];

    // The result must be stored in ret: the error classification below reads it.
    ret = MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired);
    if(MPI_SUCCESS != ret) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(unorderIntracomm);
            retry = 1;
            goto cleanup;
        }
        fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n", iteration, ret);
        exit(1);
    }
    if(verbosity > 1 )
        printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);

cleanup:
    // Release memory. Every jump to this label happens after the launch
    // arrays were completely filled, so all procsNeeded entries are valid.
    if(hostInfoToLaunch != NULL) {
        for(i = 0; i < procsNeeded; i++)
            MPI_Info_free(&hostInfoToLaunch[i]);
    }
#ifdef RUN_ON_COMPUTE_NODES
    // Same ownership assumption as for the per-iteration lookup above - TODO confirm.
    if(hostNameToLaunch != NULL) {
        for(i = 0; i < procsNeeded; i++)
            free(hostNameToLaunch[i]);
    }
#endif
    free(appToLaunch);
    free(argvToLaunch);
    free(procsNeededToLaunch);
    free(hostInfoToLaunch);
    free(hostNameToLaunch);
    free(shrinkMergeList);
    free(tempRanks);
    free(failedRanks);
    free(child);
#ifdef RECOV_ON_SPARE_NODES
    free(failedNodeList);
    free(nodeList);
#endif
    MPI_Group_free(&failedGroup);
    MPI_Group_free(&oldGroup);

    if(retry) {
        // The intermediate communicators are left untouched on this path, as
        // before: depending on where the failure hit, some of them were never created.
        repairComm(broken, repaired, iteration, listFails, numFails, numNodeFails,
                   sumPrevNumNodeFails, argc, argv, verbosity);
        return;
    }

    MPI_Comm_free(&tempShrink);
    MPI_Comm_free(&tempIntercomm);
    MPI_Comm_free(&unorderIntracomm);
}//repairComm()