int wait_for_signal(MPI_Comm comm, int signal_num)
{
    int ret;
    MPI_Status status;

    while( cur_epoch < signal_num ) {
        /*
         * MPI_Probe is a blocking call that will unblock when it encounters an
         * error on the indicated peer. Passing MPI_ANY_SOURCE causes MPI_Probe
         * to unblock on the first unacknowledged failed process.
         *
         * The error handler will be called before MPI_Probe returns.
         *
         * In this example, MPI_Probe will only return when there is an error,
         * since we do not send any messages. But we check the return code
         * anyway, just for good form.
         */
        ret = MPI_Probe(MPI_ANY_SOURCE, TAG_FT_DETECT, comm, &status);
        if( MPI_SUCCESS != ret ) {
            printf("%2d of %2d) MPI_Probe() Error: Some rank failed (error = %3d)\n",
                   mpi_mcw_rank, mpi_mcw_size, status.MPI_ERROR);
            /* Acknowledge the failure and move on */
            OMPI_Comm_failure_ack(comm);
        }
        sleep(1);
    }

    return 0;
}
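/*
 * A minimal driver sketch for wait_for_signal(), assuming the globals it
 * shares with the error handler (cur_epoch, mpi_mcw_rank, mpi_mcw_size), an
 * assumed TAG_FT_DETECT value, and the mpi_error_handler() listed later in
 * this section. The harness itself is hypothetical, not the original program.
 */
#include <mpi.h>
#include <stdio.h>

int cur_epoch = 0;            /* incremented by the error handler */
int mpi_mcw_rank, mpi_mcw_size;
#define TAG_FT_DETECT 999     /* assumed tag value */

int wait_for_signal(MPI_Comm comm, int signal_num);
void mpi_error_handler(MPI_Comm *comm, int *error_code, ...);

int main(int argc, char **argv)
{
    MPI_Errhandler eh;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_mcw_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_mcw_size);

    /* Attach the failure-reporting handler before probing */
    MPI_Comm_create_errhandler(mpi_error_handler, &eh);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, eh);

    /* Block until the handler has reported one process failure */
    wait_for_signal(MPI_COMM_WORLD, 1);

    MPI_Errhandler_free(&eh);
    MPI_Finalize();
    return 0;
}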
int numProcsFails(MPI_Comm mcw)
{
    int rank, ret, numFailures = 0, flag;
    MPI_Group fGroup;
    MPI_Errhandler newEh;
    MPI_Comm dupComm;

    // Create and attach the error handler to the communicator
    MPI_Comm_create_errhandler(mpiErrorHandler, &newEh);
    MPI_Comm_rank(mcw, &rank);
    MPI_Comm_set_errhandler(mcw, newEh);

    // Target function: a failed process causes it to return an error
    if(MPI_SUCCESS != (ret = MPI_Comm_dup(mcw, &dupComm))) {
    //if(MPI_SUCCESS != (ret = MPI_Barrier(mcw))) { // MPI_Comm_dup or MPI_Barrier
        OMPI_Comm_failure_ack(mcw);
        OMPI_Comm_failure_get_acked(mcw, &fGroup);

        // Get the number of failures
        MPI_Group_size(fGroup, &numFailures);
    } // end of "MPI_Comm_dup failure"

    OMPI_Comm_agree(mcw, &flag);

    // Memory release
    if(numFailures > 0)
        MPI_Group_free(&fGroup);
    MPI_Errhandler_free(&newEh);

    return numFailures;
} // numProcsFails()
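/*
 * Usage sketch for numProcsFails(), assuming a peer is killed externally
 * (e.g., with SIGKILL) before the call. The harness is hypothetical, and
 * finalizing cleanly after a failure may require additional care.
 */
#include <mpi.h>
#include <stdio.h>

int numProcsFails(MPI_Comm mcw);

int main(int argc, char **argv)
{
    int rank, failures;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* ... a peer process dies here ... */

    failures = numProcsFails(MPI_COMM_WORLD);
    printf("Rank %d observed %d failed process(es)\n", rank, failures);

    MPI_Finalize();
    return 0;
}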
void mpiErrorHandler(MPI_Comm *comm, int *errorCode, ...)
{
    MPI_Group failedGroup;

    OMPI_Comm_failure_ack(*comm);
    OMPI_Comm_failure_get_acked(*comm, &failedGroup);

    // *errorCode == MPI_ERR_PROC_FAILED classifies the failure as a "process" failure.
    // Without a delay, failed processes will NOT be synchronized.
    // The delay MUST happen inside the error handler (otherwise it is problematic).
    usleep(10000); // 10 millisecond delay (MPI_Comm_revoke fails without
                   // this for a large number of processes)

    MPI_Group_free(&failedGroup);
    return;
} // mpiErrorHandler()
void mpi_error_handler(MPI_Comm *comm, int *error_code, ...)
{
    MPI_Group f_group;
    int num_failures;
    int loc_size;
    char *ranks_failed = NULL;

    MPI_Comm_size(*comm, &loc_size);

    /*
     * Will cause normal termination by unblocking the wait_for_signal() function
     * when all process failures have been reported.
     */
    cur_epoch++;

    printf("Handler !!\n");
    fflush(NULL);

    if( *error_code == MPI_ERR_PROC_FAILED ) {
        /* Access the local list of failures */
        OMPI_Comm_failure_ack(*comm);
        OMPI_Comm_failure_get_acked(*comm, &f_group);

        /* Get the number of failures */
        MPI_Group_size(f_group, &num_failures);
        cur_epoch = num_failures;

        ranks_failed = get_str_failed_procs(*comm, f_group);
        printf("%2d of %2d) Error Handler: (Comm = %s) %3d Failed Ranks: %s\n",
               mpi_mcw_rank, mpi_mcw_size,
               (mpi_mcw_size == loc_size ? "MCW" : "Subcomm"),
               num_failures, ranks_failed);
        free(ranks_failed);

        /* Release the group of acknowledged failures */
        MPI_Group_free(&f_group);
    }
    else {
        printf("%2d of %2d) Error Handler: Some other error has occurred. (Comm = %s) [Count = %2d / %2d]\n",
               mpi_mcw_rank, mpi_mcw_size,
               (mpi_mcw_size == loc_size ? "MCW" : "Subcomm"),
               cur_epoch, max_signals);
    }

    /* Introduce a small delay to aid debugging */
    fflush(NULL);
    sleep(1);

    return;
}
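/*
 * get_str_failed_procs() is used above but not shown. A plausible sketch
 * (an assumption, not the original helper): translate the ranks in the
 * acknowledged-failure group to communicator ranks and format them as a
 * comma-separated string that the caller must free().
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char * get_str_failed_procs(MPI_Comm comm, MPI_Group f_group)
{
    int i, num_failures;
    MPI_Group comm_group;
    int *group_ranks, *comm_ranks;
    char *str;

    MPI_Group_size(f_group, &num_failures);
    MPI_Comm_group(comm, &comm_group);

    group_ranks = (int *) malloc(num_failures * sizeof(int));
    comm_ranks  = (int *) malloc(num_failures * sizeof(int));
    for(i = 0; i < num_failures; i++)
        group_ranks[i] = i;

    /* Map failed-group ranks onto the communicator's rank numbering */
    MPI_Group_translate_ranks(f_group, num_failures, group_ranks,
                              comm_group, comm_ranks);

    /* 12 characters per rank is ample for a 32-bit int plus a comma */
    str = (char *) calloc(num_failures * 12 + 1, sizeof(char));
    for(i = 0; i < num_failures; i++)
        sprintf(str + strlen(str), (i > 0 ? ",%d" : "%d"), comm_ranks[i]);

    free(group_ranks);
    free(comm_ranks);
    MPI_Group_free(&comm_group);
    return str;
}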
void mpiErrorHandler(MPI_Comm *comm, int *errorCode, ...)
{
    MPI_Group failedGroup;

    OMPI_Comm_failure_ack(*comm);
    OMPI_Comm_failure_get_acked(*comm, &failedGroup);

    /*
    int rank, nprocs;
    MPI_Comm_rank(*comm, &rank);
    MPI_Comm_size(*comm, &nprocs);
    if(*errorCode == MPI_ERR_PROC_FAILED) {
        printf("(Process %d of %d) C Error Handler: MPI_ERROR_PROC_FAILED Detected.\n", rank, nprocs);
    } else {
        printf("(Process %d of %d) C Error Handler: Other Failure Detected.\n", rank, nprocs);
    }
    */

    // Introduce a small delay to aid debugging
    fflush(NULL);

    // 1. Without the delay, failed processes will NOT be synchronized.
    // 2. The delay MUST be inside the error handler. Otherwise, it does NOT work.
    // 3. 10 milliseconds is the minimum delay I have tested on a dual-core machine
    //    for 200 processes. It does not work if a smaller delay is given.
    //sleep(1); // 1 second
    usleep(10000); // 10 milliseconds
    // - MPI_Comm_revoke fails without this for a large number of processes,
    //   but works for a single process failure.
    // - Most of the time it hangs for more than one process failure without this.
    // - It is necessary to enable this even for a small number of processes for testing.

    MPI_Group_free(&failedGroup);
    return;
} // mpiErrorHandler()
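/*
 * The delay notes above concern the revoke-then-shrink recovery sequence.
 * A minimal sketch of that sequence, assuming mpiErrorHandler() is installed
 * on comm; the recover() wrapper is illustrative, not part of the original.
 */
void recover(MPI_Comm comm, MPI_Comm *newcomm)
{
    /* Any communication on comm can surface the failure */
    int ret = MPI_Barrier(comm);

    if( (MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret) ) {
        OMPI_Comm_revoke(comm);          /* interrupt peers blocked in collectives */
        OMPI_Comm_shrink(comm, newcomm); /* build a communicator of survivors */
    }
    else {
        MPI_Comm_dup(comm, newcomm);     /* no failure: keep an equivalent comm */
    }
}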
void repairComm(MPI_Comm *broken, MPI_Comm *repaired, int iteration, int *listFails,
                int *numFails, int *numNodeFails, int sumPrevNumNodeFails,
                int argc, char **argv, int verbosity)
{
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    int i, ret, result, procsNeeded = 0, oldRank, newRank, oldGroupSize, rankKey = 0, flag;
    int *tempRanks, *failedRanks, *errCodes, rank, hostfileLineIndex;
    MPI_Group oldGroup, failedGroup, shrinkGroup;
    int hostfileLastLineIndex, tempLineIndex, *failedNodeList = NULL, *nodeList = NULL, totNodeFailed = 0;
    double startTime = 0.0, endTime;
    int nprocs, j, *shrinkMergeList;
    char hostName[128];
    gethostname(hostName, sizeof(hostName));

    char **appToLaunch;
    char ***argvToLaunch;
    int *procsNeededToLaunch;
    MPI_Info *hostInfoToLaunch;
    char **hostNameToLaunch;

    MPI_Comm_rank(*broken, &rank);
    if(rank == 0)
        startTime = MPI_Wtime();

#ifndef GLOBAL_DETECTION
    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_rank(*broken, &oldRank);

    OMPI_Comm_failure_ack(*broken);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);
    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out the ranks of the processes that failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);

#pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);
#endif

    double shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove failed procs
    if(MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else {
        if(verbosity > 1)
            printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    }
    if(verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

#ifdef GLOBAL_DETECTION
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    MPI_Group_compare(oldGroup, shrinkGroup, &result);

    if(result != MPI_IDENT)
        MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);

    MPI_Comm_rank(*broken, &oldRank);
    MPI_Group_size(failedGroup, &procsNeeded);

    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out the ranks of the processes that failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);

#pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);

    MPI_Group_free(&shrinkGroup);
#endif

    // Assign the number of failed processes
    *numFails = procsNeeded;
    hostNameToLaunch = (char **) malloc(procsNeeded * sizeof(char *));

    if(verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the corresponding "
               "communicator is %d ***\n", iteration, procsNeeded);

    if(rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n",
               rank, endTime - startTime);
    }

#ifdef RECOV_ON_SPARE_NODES
    // Determine the total number of failed nodes, and a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); // starts from 0
    nodeList = (int *) malloc((hostfileLastLineIndex + 1) * sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex + 1) * sizeof(int)); // initialize nodeList with 0's

    for(int i = 0; i < procsNeeded; ++i) {
        tempLineIndex = failedRanks[i] / SLOTS; // starts from 0
        nodeList[tempLineIndex] = 1;
    }

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex + 1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check whether sufficient spare nodes are available for recovery
    if((hostfileLastLineIndex - totNodeFailed - sumPrevNumNodeFails) < (oldGroupSize - 1) / SLOTS) {
        if(rank == 0)
            printf("[%d] There are not sufficient spare nodes available for recovery.\n", rank);
        exit(0);
    }

    failedNodeList = (int *) malloc(totNodeFailed * sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    int failedNodeCounter = 0;
    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex + 1); ++nodeCounter) {
        if(nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;
    }
#endif

    char *hostNameFailed = NULL;
#pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; ++i) {
        // Assign the list of failed processes
        listFails[i] = failedRanks[i];
#ifdef RUN_ON_COMPUTE_NODES
        tempLineIndex = failedRanks[i] / SLOTS; // starts from 0
#ifdef RECOV_ON_SPARE_NODES
        for(int j = 0; j < totNodeFailed; ++j) {
            if(failedNodeList[j] == tempLineIndex)
                hostfileLineIndex = hostfileLastLineIndex - j - sumPrevNumNodeFails;
        }
#else // Recovery on the same node (no node failure, only process failure)
        hostfileLineIndex = tempLineIndex;
#endif
        hostNameToLaunch[i] = getHostToLaunch(hostfileLineIndex);
        hostNameFailed = getHostToLaunch(tempLineIndex);
#else // Run on the head node or a personal machine
        hostNameToLaunch[i] = (char *) hostName;
        hostNameFailed = (char *) hostName;
#endif

        if(verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s failed! ---\n",
                   iteration, failedRanks[i], hostNameFailed);
    }
    // Release memory of hostNameFailed
    free(hostNameFailed);

    appToLaunch = (char **) malloc(procsNeeded * sizeof(char *));
    argvToLaunch = (char ***) malloc(procsNeeded * sizeof(char **));
    procsNeededToLaunch = (int *) malloc(procsNeeded * sizeof(int));
    hostInfoToLaunch = (MPI_Info *) malloc(procsNeeded * sizeof(MPI_Info));
    argv[argc] = NULL;

#pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *) argv[0];
        argvToLaunch[i] = (char **) argv;
        procsNeededToLaunch[i] = 1;
        // Host information for where to spawn the processes
        MPI_Info_create(&hostInfoToLaunch[i]);
        MPI_Info_set(hostInfoToLaunch[i], (char *) "host", hostNameToLaunch[i]);
        //MPI_Info_set(hostInfoToLaunch[i], "hostfile", "hostfile");
    }

    double spawnTime = MPI_Wtime();
#ifdef HANG_ON_REMOVE
    OMPI_Comm_agree(tempShrink, &flag);
#endif
    // Spawn the new process(es)
    if(MPI_SUCCESS != (ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch, argvToLaunch,
                                                     procsNeededToLaunch, hostInfoToLaunch, 0,
                                                     tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE))) {
        free(tempRanks);
        free(failedRanks);
        free(errCodes);
        if(MPI_ERR_COMM == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n", iteration);
        if(MPI_ERR_ARG == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n", iteration);
        if(MPI_ERR_INFO == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n", iteration);

        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            OMPI_Comm_revoke(tempShrink);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n",
                    iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 0 && rank == 0) {
            for(i = 0; i < procsNeeded; i++)
                printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed process %d on "
                       "node %s]: SUCCESS\n", iteration, failedRanks[i], hostNameToLaunch[i]);
        }
        // Memory release. Moving the last two frees to the end of the function
        // causes segmentation faults for a 4-process failure.
    }
    if(verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    double mergeTime = MPI_Wtime();
    // Merge the new processes into a new communicator
    if(MPI_SUCCESS != (ret = MPI_Intercomm_merge(tempIntercomm, false, &unorderIntracomm))) {
        free(tempRanks);
        free(failedRanks);
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(tempIntercomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else if(MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n",
                    iteration, ret);
            exit(1);
        }
        else if(MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Memory acquisition error in MPI_Intercomm_merge () %d\n",
                    iteration, ret);
            exit(1);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1)
            printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    }
    if(verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    double agreeTime = MPI_Wtime();
    // Synchronize. Sometimes hangs without this.
    // The position of this code, and using the intercommunicator (not intra), is important.
#ifdef HANG_ON_REMOVE
    //MPI_Barrier(tempIntercomm);
    OMPI_Comm_agree(tempIntercomm, &flag); // since MPI_Barrier sometimes hangs
#endif
    if(verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Send the failed ranks and the number of failed processes to the newly created ranks.
    // oldGroupSize is the size of the communicator before the failure;
    // procsNeeded is the number of processes that failed.
    int *child = (int *) malloc(procsNeeded * sizeof(int));
#pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    if(newRank == 0) {
        int send_val[2];
        for(i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            if(MPI_SUCCESS != (ret = MPI_Send(&send_val, 2, MPI_INT, child[i],
                                              MERGE_TAG, unorderIntracomm))) {
                free(tempRanks);
                free(failedRanks);
                if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is a failure
                    OMPI_Comm_revoke(unorderIntracomm);
                    return repairComm(broken, repaired, iteration, listFails, numFails,
                                      numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
                }
                else {
                    fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n",
                            iteration, ret);
                    exit(1);
                }
            }
            else {
                if(verbosity > 1)
                    printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);
            }
        }
    }

    // Split the current world (split from the original) to order the ranks.
    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);

    // For one or more process failures (ordering)
    shrinkMergeList = (int *) malloc(nprocs * sizeof(int));

    j = 0;
    for(i = 0; i < nprocs; i++) {
        if(rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;
    }

    for(i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i - j];

    for(i = 0; i < (nprocs - procsNeeded); i++) {
        if(newRank == i)
            rankKey = shrinkMergeList[i];
    }

    if(MPI_SUCCESS != (ret = MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired))) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(unorderIntracomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n",
                    iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1)
            printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);
    }

    // Release memory
    free(appToLaunch);
    free(argvToLaunch);
    free(procsNeededToLaunch);
    free(hostInfoToLaunch);
    free(hostNameToLaunch);
    free(shrinkMergeList);
    free(errCodes);
    MPI_Comm_free(&tempShrink);
    free(tempRanks);
    free(failedRanks);
    free(child);
    MPI_Group_free(&failedGroup);
    MPI_Group_free(&oldGroup);
    MPI_Comm_free(&tempIntercomm);
    MPI_Comm_free(&unorderIntracomm);

#ifdef RECOV_ON_SPARE_NODES
    if(failedNodeList != NULL)
        free(failedNodeList);
    if(nodeList != NULL)
        free(nodeList);
#endif
} // repairComm()
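/*
 * Caller-side sketch for repairComm(). The harness, the MAX_FAILS capacity,
 * and the iteration bookkeeping are assumptions; spawned replacements would
 * re-enter main() and need a separate child-side path, which is omitted here.
 */
#define MAX_FAILS 64 /* assumed capacity */

int main(int argc, char **argv)
{
    MPI_Comm world = MPI_COMM_WORLD, repaired;
    MPI_Errhandler eh;
    int listFails[MAX_FAILS];
    int numFails = 0, numNodeFails = 0, sumPrevNumNodeFails = 0;
    int iteration = 0, ret;

    MPI_Init(&argc, &argv);
    MPI_Comm_create_errhandler(mpiErrorHandler, &eh);
    MPI_Comm_set_errhandler(world, eh);

    /* Any communication can surface a failure */
    if(MPI_SUCCESS != (ret = MPI_Barrier(world))) {
        repairComm(&world, &repaired, iteration, listFails, &numFails,
                   &numNodeFails, sumPrevNumNodeFails, argc, argv, /* verbosity */ 1);
        world = repaired; /* continue on the repaired communicator */
    }

    MPI_Finalize();
    return 0;
}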