Пример #1
/*
 * iterative_solver - fault-aware iteration loop over `comm`.
 *
 * Each pass of the unbounded for-loop queries rank and size on `comm`,
 * reduces lnorm into gnorm with MPI_Allreduce (MPI_MAX) and, apparently
 * only when the reduction reports MPI_ERR_PROC_FAILED, runs
 * OMPI_Comm_agree and replaces `comm` with the communicator produced by
 * OMPI_Comm_shrink.
 *
 * comm: communicator to iterate on; the local copy is freed and replaced
 *       by the shrunken communicator after a failed agreement.
 *
 * mpi_mcw_size and mpi_mcw_rank are not declared in this block.
 *
 * NOTE(review): this excerpt has no opening brace after the signature and
 * its braces do not balance, so lines appear to be missing; restore them
 * from the original source before building.
 */
void iterative_solver(MPI_Comm comm)
    /* Only referenced by the commented-out convergence test below. */
    int epsilon = 1;
    int gnorm = mpi_mcw_size, lnorm =  mpi_mcw_rank;
    int allsucceeded;
    int rc;
    int rank, size;
    MPI_Comm comm2;
    int i;

    /* No loop condition; no break or return is visible in this excerpt. */
    for (i = 0; ; i++) {
        /* Re-queried every pass because `comm` may have been replaced. */
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        /* Add a computation iteration to converge and
         * compute local norm in lnorm */
        /* NOTE(review): lnorm and gnorm are declared int, but the datatype
         * passed here is MPI_DOUBLE; confirm and make the two agree. */
        rc = MPI_Allreduce( &lnorm, &gnorm, 1, MPI_DOUBLE, MPI_MAX, comm);

        /* Earlier, broader exit test (disabled): */
        //    if( (MPI_ERR_PROC_FAILED == rc ) ||
        //	(MPI_ERR_COMM_REVOKE == rc) ||
        //	(gnorm <= epsilon) )

        /* NOTE(review): the same condition is tested twice in a row; a
         * statement may have been lost between these two lines. */
        if(MPI_ERR_PROC_FAILED == rc)

            if( MPI_ERR_PROC_FAILED == rc )

            /* About to leave: let's be sure that everybody
             * received the same information */
            allsucceeded = (rc == MPI_SUCCESS);
            OMPI_Comm_agree(comm, &allsucceeded);
            if(!allsucceeded ) {
                /* We plan to join the shrink, thus the communicator
                 * should be marked as revoked */
                OMPI_Comm_shrink(comm, &comm2);
                MPI_Comm_free(&comm); /* Release the revoked communicator */
                comm = comm2;

        /* The highest rank of the current communicator prints on every
         * 10th iteration; the value printed is mpi_mcw_rank, not `rank`. */
        if (rank == size - 1 && i % 10 == 0 ) {
            printf("See you %d\n",  mpi_mcw_rank);

Пример #2
/*
 * iterative_solver2 - variant of the iteration loop that detects failures
 * with MPI_Probe instead of MPI_Allreduce.
 *
 * Each pass of the unbounded for-loop queries rank and size on `comm`,
 * probes for a message tagged TAG_FT_DETECT from any source and, when the
 * probe returns MPI_ERR_PROC_FAILED, runs OMPI_Comm_agree and replaces
 * `comm` with the communicator produced by OMPI_Comm_shrink.
 *
 * comm: communicator to iterate on; the local copy is freed and replaced
 *       by the shrunken communicator after a failed agreement.
 *
 * mpi_mcw_size, mpi_mcw_rank, root and TAG_FT_DETECT are not declared in
 * this block.
 *
 * NOTE(review): this excerpt has no opening brace after the signature and
 * its braces do not balance, so lines appear to be missing; restore them
 * from the original source before building.
 */
void iterative_solver2(MPI_Comm comm)
    /* Not referenced anywhere else in this function. */
    int epsilon = 1;
    /* gnorm is never written after this initialisation (the Allreduce
     * below is commented out), so the printfs report its initial value.
     * lnorm is only referenced by that commented-out call. */
    int gnorm = mpi_mcw_size, lnorm =  mpi_mcw_rank;
    int allsucceeded;
    int rc;
    int rank, size;
    MPI_Comm comm2;
    int i;

    /* No loop condition; no break or return is visible in this excerpt. */
    for (i = 0; ; i++) {
        /* Add a computation iteration to converge and
         * compute local norm in lnorm */

        /* Re-queried every pass because `comm` may have been replaced. */
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        if (mpi_mcw_rank == root) printf("start\n");

        /* The highest rank of the current communicator prints on every
         * 10th iteration. */
        if (rank == size - 1 && i % 10== 0 ) {
            //   if (rank == size - 1 && i % 10== 0 ) {
            //    if (rank == size - 1) {
            printf("See you %d\n", rank);
        /* NOTE(review): the status argument is NULL; the MPI standard
         * names MPI_STATUS_IGNORE for "no status wanted" - confirm that
         * NULL is accepted by the MPI library in use. */
        rc = MPI_Probe(MPI_ANY_SOURCE, TAG_FT_DETECT, comm, NULL);
        //rc = MPI_Allreduce( &lnorm, &gnorm, 1, MPI_DOUBLE, MPI_MAX, comm);
        if (mpi_mcw_rank == root && MPI_SUCCESS == rc) printf("gnorm: %d, size: %d\n", gnorm, size);
        if (mpi_mcw_rank == root) printf("end\n");
        if (MPI_ERR_PROC_FAILED == rc) {
            /* rc equals MPI_ERR_PROC_FAILED in this branch, so the local
             * contribution to the agreement is always 0 here. */
            allsucceeded = (rc == MPI_SUCCESS);
            OMPI_Comm_agree(comm, &allsucceeded);
            if (mpi_mcw_rank == root)  printf("gnorm: %d, size: %d, rc: %d, agree:%d\n", gnorm, size, rc, allsucceeded);
            if(!allsucceeded) {
                /* We plan to join the shrink, thus the communicator
                 * should be marked as revoked */
                OMPI_Comm_shrink(comm, &comm2);
                MPI_Comm_free(&comm); /* Release the revoked communicator */
                comm = comm2;

        /* Disabled debug output: */
        //    if (mpi_mcw_rank == root || MPI_SUCCESS == rc) printf("gnorm: %d, size: %d\n", gnorm, size);
        //    printf("gnorm: %d, rank: %d, size: %d\n", gnorm, rank, size);
/*
 * communicatorReconstruct - check a communicator for process failure and,
 * if one is found, rebuild it.
 *
 * Parent path (parent == MPI_COMM_NULL): installs an error handler on the
 * working communicator `mcw`, calls OMPI_Comm_agree, then uses
 * MPI_Comm_dup as the operation whose failure signals a dead process. On
 * failure it revokes `mcw`, calls repairComm() and copies the failed ranks
 * into listFails.
 *
 * Child path (parent != MPI_COMM_NULL): merges the parent
 * intercommunicator, receives {old rank, number of failures} from rank 0
 * with MERGE_TAG, and splits the merged communicator using the old rank as
 * key.
 *
 * Parameters:
 *   myCommWorld          communicator used for the initial rank query and
 *                        as the starting value of `mcw`.
 *   childFlag            compared against 0 to select parent-side timing
 *                        and to force the parent path.
 *   listFails            out: receives *numFails entries copied from the
 *                        list filled by repairComm() (parent path).
 *   numFails             forwarded to repairComm() on the parent path; set
 *                        from the received message on the child path.
 *   numNodeFails         forwarded to repairComm().
 *   sumPrevNumNodeFails  forwarded to repairComm().
 *   argc, argv           forwarded to repairComm().
 *   verbosity            > 0 and > 1 enable progressively more printf
 *                        output.
 *
 * Returns: `mcw`, which is replaced by tempIntracomm when the pass ends
 * with ret != MPI_SUCCESS.
 *
 * mpiErrorHandler, numProcsFails, repairComm and MERGE_TAG are not defined
 * in this block.
 *
 * NOTE(review): several braces and statements appear to be missing from
 * this excerpt (unclosed blocks, "Free memory" comments with no call);
 * restore them from the original source before building.
 */
MPI_Comm communicatorReconstruct(MPI_Comm myCommWorld, int childFlag, int * listFails, int * numFails,
                                 int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    int i, ret, rank, nprocs, oldRank = 0, totFails = 0, * failedList, flag;
    int iterCounter = 0, failure = 0, recvVal[2], length;
    MPI_Status mpiStatus;
    MPI_Comm parent, mcw;
    MPI_Comm dupComm, tempIntracomm, unorderIntracomm;
    MPI_Errhandler newEh;
    double startTime = 0.0, endTime;
    char hostName[MPI_MAX_PROCESSOR_NAME];

    // Error handler
    MPI_Comm_create_errhandler(mpiErrorHandler, &newEh);


    MPI_Comm_rank(myCommWorld, &rank);
    // NOTE(review): `parent` has not been assigned anywhere above this line
    // in the visible code (a call that fetches the parent communicator may
    // have been dropped); as written this reads an uninitialized value.
    if(MPI_COMM_NULL == parent && childFlag == 0 && rank == 0)
        startTime = MPI_Wtime();

    do {
        failure = 0;
        ret = MPI_SUCCESS;
        // A caller with childFlag == 0 is always routed to the parent path.
        if(childFlag == 0 && MPI_COMM_NULL != parent){
           parent = MPI_COMM_NULL;
        // Parent part
        if(MPI_COMM_NULL == parent) {
            // iterCounter is never modified in the visible code, so this
            // assignment happens on every pass.
            if(iterCounter == 0)
                mcw = myCommWorld;
            // Set error handler for communicator
            MPI_Comm_set_errhandler(mcw, newEh);

            // World information
            MPI_Comm_rank(mcw, &rank);
            MPI_Comm_size(mcw, &nprocs);
            // Synchronize. Sometimes hangs on without this
            // NOTE(review): `flag` is passed without having been initialized;
            // confirm whether OMPI_Comm_agree reads it as an input.
            OMPI_Comm_agree(mcw, &flag); // since some of the times MPI_Barrier hangs on

            // Target function
            //if(MPI_SUCCESS != (ret = MPI_Barrier(mcw))){
            if(MPI_SUCCESS != (ret = MPI_Comm_dup(mcw, &dupComm))) {
                if(verbosity > 0 && rank == 0)
                    printf("[????? Process %d (nprocs %d)] MPI_Comm_dup (parent): "
                           "Unsuccessful (due to process failure) OK\n", rank, nprocs);

                // Revoke the communicator
                if(MPI_SUCCESS != (OMPI_Comm_revoke(mcw))) {
                    if(rank == 0)
                        printf("[Process %d (nprocs %d)] Iteration %d: OMPI_Comm_revoke "
                               "(parent): Error!\n", rank, nprocs,  iterCounter);
                else {
                    if(verbosity > 1 && rank == 0)
                        printf("[Process %d (nprocs %d)] Iteration %d: OMPI_Comm_revoke "
                               "(parent): SUCCESS\n", rank, nprocs, iterCounter);

                // Call repair with splitted world
                totFails = numProcsFails(mcw);
                // NOTE(review): the malloc result is not checked, and no
                // matching free is visible in this excerpt.
                failedList = (int *) malloc(totFails*sizeof(int));
                repairComm(&mcw, &tempIntracomm, iterCounter, failedList, numFails, numNodeFails,
                           sumPrevNumNodeFails, argc, argv, verbosity);

                // Assign list of failed processes
                // Copies *numFails entries; failedList was sized with totFails.
                #pragma omp parallel for default(shared)
                for(i = 0; i < *numFails; i++)
                    listFails[i] = failedList[i];

                // Free memory

                // Operation failed: retry
                failure = 1;
            } //end of "if MPI_Barrier/MPI_Comm_dup fails"
            else {
                if(verbosity > 0 && rank == 0)
                    printf("[..... Process %d (nprocs %d)] Iteration %d: MPI_Comm_dup "
                           "(parent): SUCCESS\n", rank, nprocs, iterCounter);

                // Operation success: breaking iteration
                failure = 0;
        } // end of "parent"
        // Child part
        else {
            MPI_Comm_set_errhandler(parent, newEh);
            // Synchronize. Sometimes hangs on without this
            // Position of code and intercommunicator, parent, (not intra) is important
            OMPI_Comm_agree(parent, &flag);// since some of the times MPI_Barrier hangs on

            MPI_Comm_rank(parent, &rank);
            MPI_Comm_size(parent, &nprocs);

            if(verbosity > 0 && rank == 0) {
                MPI_Get_processor_name(hostName, &length);
                printf("[Process %d, nprocs = %d] created on host %s (child)\n",
                       rank, nprocs, hostName);

            // `true` is passed as the `high` argument on the child side.
            if(MPI_SUCCESS != (MPI_Intercomm_merge(parent, true, &unorderIntracomm))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Intercomm_merge (child): Error!\n",
                           rank, iterCounter);
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Intercomm_merge (child): SUCCESS\n",
                           rank, iterCounter);
            // Receive failed ranks and number of fails from process 0 of parent
            if(MPI_SUCCESS != (MPI_Recv(&recvVal, 2, MPI_INT, 0, MERGE_TAG,
                                        unorderIntracomm, &mpiStatus))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Recv1 (child): Error!\n",
                           rank, iterCounter);
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Recv1 (child): SUCCESS\n",
                           rank, iterCounter);
                // recvVal[0] is the rank to take over, recvVal[1] the failure count.
                oldRank = recvVal[0];
                *numFails = recvVal[1];

            // Split the communicator to order the ranks.
            // No order is maintaining here. Actual ordering is done on parent side
            // This is a support only to parent side
            if(MPI_SUCCESS != (MPI_Comm_split(unorderIntracomm, 0, oldRank, &tempIntracomm))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Comm_split (child): Error!\n",
                           rank, iterCounter);
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Comm_split (child): SUCCESS\n",
                           rank, iterCounter);

            // Operation on parent failed: retry
            // Any value different from MPI_SUCCESS works here; it only feeds
            // the `ret != MPI_SUCCESS` tests below.
            ret = (!MPI_SUCCESS);
            failure = 1;

            // Free memory
        }// end of "child"

        // Reset comm world
        if(ret != MPI_SUCCESS)
            mcw = tempIntracomm;

        // Reset parent value for parent
        if(parent == MPI_COMM_NULL && ret != MPI_SUCCESS)
            parent = mcw;

        // Reset parent value of child and make the operation collective
        // NOTE(review): when the previous `if` has just set parent = mcw,
        // this test also passes (unless mcw is MPI_COMM_NULL) and resets
        // parent again - confirm this is intended.
        if(MPI_SUCCESS != ret && MPI_COMM_NULL != parent)
            parent = MPI_COMM_NULL;
    // `failure` is only ever 0 or 1, so with this condition the body runs
    // exactly once.
    } while(failure > 1);// replace 'failure > 1' with 'failure' if want fault tolerant recovery

    // NOTE(review): startTime is only set when the matching test near the
    // top of the function passed; otherwise the elapsed time is measured
    // from 0.0.
    if(MPI_COMM_NULL == parent && childFlag == 0 && rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Reconstructing failed communicator (including failed list creation) "
               "takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);

    // Memory release

    return mcw;
void repairComm(MPI_Comm * broken, MPI_Comm * repaired, int iteration, int * listFails, int * numFails,
                int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    int i, ret, result, procsNeeded = 0, oldRank, newRank, oldGroupSize, rankKey = 0, flag;
    int * tempRanks, * failedRanks, * errCodes, rank, hostfileLineIndex;
    MPI_Group oldGroup, failedGroup, shrinkGroup;
    int hostfileLastLineIndex, tempLineIndex, * failedNodeList = NULL, * nodeList = NULL, totNodeFailed = 0;
    double startTime = 0.0, endTime;
    int nprocs, j, * shrinkMergeList;
    char hostName[128];
    gethostname(hostName, sizeof(hostName));

    char ** appToLaunch;
    char *** argvToLaunch;
    int * procsNeededToLaunch;
    MPI_Info * hostInfoToLaunch;
    char ** hostNameToLaunch;

    MPI_Comm_rank(*broken, &rank);
    if(rank == 0)
        startTime = MPI_Wtime();

    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_rank(*broken, &oldRank);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);
    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);

    double shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove failed procs
    if(MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else {
        if(verbosity > 1 )
            printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    MPI_Group_compare(oldGroup, shrinkGroup, &result);

    if(result != MPI_IDENT)
        MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);

    MPI_Comm_rank(*broken, &oldRank);
    MPI_Group_size(failedGroup, &procsNeeded);

    errCodes = (int *) malloc(sizeof(int)*procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    failedRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);


    // Assign number of failed processes
    *numFails = procsNeeded;

    hostNameToLaunch = (char **) malloc(procsNeeded * sizeof(char *));

    if(verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the corresponding "
               "communicator is %d ***\n", iteration, procsNeeded);

    if(rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);

    // Determining total number of node failed, and a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); //started from 0
    nodeList = (int *) malloc((hostfileLastLineIndex+1) * sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex+1)*sizeof(int)); // initialize nodeList with 0's

    for(int i = 0; i < procsNeeded; ++i) {
        tempLineIndex = failedRanks[i]/SLOTS; //started from 0
        nodeList[tempLineIndex] = 1;

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check if there is sufficient spare node available for recovery
    if((hostfileLastLineIndex - totNodeFailed -sumPrevNumNodeFails) < (oldGroupSize-1)/SLOTS) {
        if(rank == 0)
            printf("[%d] There is no sufficient spare node available for recovery.\n", rank);

    failedNodeList = (int *) malloc(totNodeFailed * sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    int failedNodeCounter = 0;
    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter) {
        if(nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;

    char * hostNameFailed = NULL;
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; ++i) {
        // Assign list of processes failed
        listFails[i] = failedRanks[i];

        tempLineIndex = failedRanks[i]/SLOTS; //started from 0
        for(int j = 0; j < totNodeFailed; ++j) {
            if(failedNodeList[j] == tempLineIndex)
                hostfileLineIndex = hostfileLastLineIndex - j - sumPrevNumNodeFails;
#else // Recovery on the same node (no node failure, only process failure)
        hostfileLineIndex = tempLineIndex;
        hostNameToLaunch[i] = getHostToLaunch(hostfileLineIndex);
        hostNameFailed = getHostToLaunch(tempLineIndex);
#else // Run on head node or personal machine
        hostNameToLaunch[i] = (char *)hostName;
        hostNameFailed = (char *)hostName;

        if(verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s is failed! ---\n", iteration, failedRanks[i], hostNameFailed);
    // Release memory of hostNameFailed

    appToLaunch = (char **) malloc(procsNeeded * sizeof(char *));
    argvToLaunch = (char ***) malloc(procsNeeded * sizeof(char **));
    procsNeededToLaunch = (int *) malloc(procsNeeded * sizeof(int));
    hostInfoToLaunch = (MPI_Info *) malloc(procsNeeded * sizeof(MPI_Info));
    argv[argc] = NULL;
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *)argv[0];
        argvToLaunch[i] = (char **)argv;
        procsNeededToLaunch[i] = 1;
        // Host information where to spawn the processes
        MPI_Info_set(hostInfoToLaunch[i], (char *)"host", hostNameToLaunch[i]);
        //MPI_Info_set(hostInfoToLaunch[i], "hostfile", "hostfile");

    double spawnTime = MPI_Wtime();
    OMPI_Comm_agree(tempShrink, &flag);
    // Spawn the new process(es)
    if(MPI_SUCCESS != (ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch, argvToLaunch, procsNeededToLaunch,
                             hostInfoToLaunch, 0, tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE))) {
        if(MPI_ERR_COMM  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n", iteration);
        if(MPI_ERR_ARG  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n", iteration);
        if(MPI_ERR_INFO  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n", iteration);

        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            return repairComm(broken, repaired, iteration, listFails, numFails, numNodeFails,
                              sumPrevNumNodeFails, argc, argv, verbosity);
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n", iteration, ret);
    else {
        if(verbosity > 0 && rank == 0) {
            for(i = 0; i < procsNeeded; i++)
                printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed process %d on "
                       "node %s]: SUCCESS\n", iteration, failedRanks[i], hostNameToLaunch[i]);
        // Memory release. Moving the last two to the end of the function causes segmentation faults for 4 processes failure
    if (verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    double mergeTime = MPI_Wtime();
    // Merge the new processes into a new communicator
    if(MPI_SUCCESS != (ret = MPI_Intercomm_merge(tempIntercomm, false, &unorderIntracomm))) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        else if(MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n", iteration, ret);
        else if(MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Acquaring memory error in MPI_Intercomm_merge ()%d\n", iteration, ret);
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n", iteration, ret);
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    if (verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    double agreeTime = MPI_Wtime();
    // Synchronize. sometimes hangs in without this
    // position of code and intercommunicator (not intra) is important
    OMPI_Comm_agree(tempIntercomm, &flag);// since some of the times MPI_Barrier hangs
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Sending failed ranks and number of processes failed to the the newly created ranks.
    // oldGroupSize is the size of communicator before failure.
    // procsNeeded is the number of processes that are failed
    int * child = (int *) malloc(procsNeeded*sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    if(newRank == 0) {
        int send_val[2];
        for(i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            if(MPI_SUCCESS != (ret = MPI_Send(&send_val, 2, MPI_INT, child[i], MERGE_TAG, unorderIntracomm))) {
                if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is a failure
                    return repairComm(broken, repaired, iteration, listFails, numFails,
                                      numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
                else {
                    fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n", iteration, ret);
            else {
                if(verbosity > 1 )
                    printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);

    // Split the current world (splitted from original) to order the ranks.
    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);

    // For one or more process failure (ordering)
    shrinkMergeList = (int *) malloc(nprocs*sizeof(int));

    j = 0;
    for(i = 0; i < nprocs; i++) {
        if(rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;

    for(i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i-j];

    for(i = 0; i < (nprocs - procsNeeded); i++) {
        if(newRank == i)
            rankKey = shrinkMergeList[i];

    if(MPI_SUCCESS != (MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired))) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n", iteration, ret);
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);

    // Release memory
    if(failedNodeList != NULL)
    if(nodeList != NULL)