/* Parent/child example for MPI_Comm_spawn_multiple: the parent spawns two
 * copies of itself (one per command slot) and sends each child an integer. */
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <mpi.h>

int main(int argc, char* argv[])
{
    int msg;
    MPI_Comm parent, child;
    int rank, size;
    char hostname[512];
    pid_t pid;
    char *cmds[2];
    char *argv0[] = { "foo", NULL };
    char *argv1[] = { "bar", NULL };
    char **spawn_argv[2];
    int maxprocs[] = { 1, 1 };
    MPI_Info info[] = { MPI_INFO_NULL, MPI_INFO_NULL };

    cmds[1] = cmds[0] = argv[0];
    spawn_argv[0] = argv0;
    spawn_argv[1] = argv1;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_get_parent(&parent);

    /* If we get COMM_NULL back, then we're the parent */
    if (MPI_COMM_NULL == parent) {
        pid = getpid();
        printf("Parent [pid %ld] about to spawn!\n", (long)pid);
        MPI_Comm_spawn_multiple(2, cmds, spawn_argv, maxprocs, info, 0,
                                MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE);
        printf("Parent done with spawn\n");
        if (0 == rank) {
            msg = 38;
            printf("Parent sending message to children\n");
            MPI_Send(&msg, 1, MPI_INT, 0, 1, child);
            MPI_Send(&msg, 1, MPI_INT, 1, 1, child);
        }
        MPI_Comm_disconnect(&child);
        printf("Parent disconnected\n");
    }
    /* Otherwise, we're a child. Both children share one MPI_COMM_WORLD
       (ranks 0 and 1). */
    else {
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        gethostname(hostname, sizeof(hostname));
        pid = getpid();
        printf("Hello from the child %d of %d on host %s pid %ld: argv[1] = %s\n",
               rank, size, hostname, (long)pid, argv[1]);
        /* The parent sends one message to each child rank, so both children
           must post a receive before disconnecting. */
        MPI_Recv(&msg, 1, MPI_INT, 0, 1, parent, MPI_STATUS_IGNORE);
        printf("Child %d received msg: %d\n", rank, msg);
        MPI_Comm_disconnect(&parent);
        printf("Child %d disconnected\n", rank);
    }

    MPI_Finalize();
    return 0;
}
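A minimal sketch, separate from the example above, of how a spawned child can work out which command slot it came from: MPI_Comm_spawn_multiple gives all children a single MPI_COMM_WORLD whose ranks are ordered by command index, so with maxprocs = {1, 1} rank 0 belongs to command 0 ("foo") and rank 1 to command 1 ("bar"). The cutoff value 1 below is an assumption tied to that maxprocs layout.

/* Sketch: map a child's rank back to its command slot.
 * Assumes maxprocs[] = {1, 1} as in the example above. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Comm parent;
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_get_parent(&parent);
    if (MPI_COMM_NULL != parent) {  /* we were spawned */
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        /* ranks below 1 came from command slot 0, the rest from slot 1 */
        printf("child rank %d came from command slot %d\n",
               rank, rank < 1 ? 0 : 1);
        MPI_Comm_disconnect(&parent);
    }
    MPI_Finalize();
    return 0;
}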
/* Fortran binding shim: convert the Fortran arguments to their C
 * equivalents, call the C MPI_Comm_spawn_multiple, and copy the results
 * back into the Fortran arguments. */
void mpi_comm_spawn_multiple_f(MPI_Fint *count, char *array_commands,
                               char *array_argv, MPI_Fint *array_maxprocs,
                               MPI_Fint *array_info, MPI_Fint *root,
                               MPI_Fint *comm, MPI_Fint *intercomm,
                               MPI_Fint *array_errcds, MPI_Fint *ierr,
                               int cmd_len, int argv_len)
{
    MPI_Comm c_comm, c_new_comm;
    MPI_Info *c_info;
    int size, array_size, i;
    int *c_errs;
    char **c_array_commands;
    char ***c_array_argv;
    OMPI_ARRAY_NAME_DECL(array_maxprocs);
    OMPI_ARRAY_NAME_DECL(array_errcds);

    c_comm = MPI_Comm_f2c(*comm);
    MPI_Comm_size(c_comm, &size);
    array_size = OMPI_FINT_2_INT(*count);

    /* It's allowed to ignore the errcodes */
    if (OMPI_IS_FORTRAN_ERRCODES_IGNORE(array_errcds)) {
        c_errs = MPI_ERRCODES_IGNORE;
    } else {
        OMPI_ARRAY_FINT_2_INT_ALLOC(array_errcds, size);
        c_errs = OMPI_ARRAY_NAME_CONVERT(array_errcds);
    }

    /* It's allowed to have no argv */
    if (OMPI_IS_FORTRAN_ARGVS_NULL(array_argv)) {
        c_array_argv = MPI_ARGVS_NULL;
    } else {
        ompi_fortran_multiple_argvs_f2c(OMPI_FINT_2_INT(*count), array_argv,
                                        argv_len, &c_array_argv);
    }

    OMPI_ARRAY_FINT_2_INT(array_maxprocs, array_size);
    ompi_fortran_argv_f2c(array_commands, cmd_len, &c_array_commands);

    c_info = malloc(array_size * sizeof(MPI_Info));
    for (i = 0; i < array_size; ++i) {
        c_info[i] = MPI_Info_f2c(array_info[i]);
    }

    *ierr = OMPI_INT_2_FINT(MPI_Comm_spawn_multiple(OMPI_FINT_2_INT(*count),
                                c_array_commands, c_array_argv,
                                OMPI_ARRAY_NAME_CONVERT(array_maxprocs),
                                c_info, OMPI_FINT_2_INT(*root), c_comm,
                                &c_new_comm, c_errs));
    if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) {
        *intercomm = MPI_Comm_c2f(c_new_comm);
    }

    if (!OMPI_IS_FORTRAN_ERRCODES_IGNORE(array_errcds)) {
        OMPI_ARRAY_INT_2_FINT(array_errcds, size);
    }
    OMPI_ARRAY_FINT_2_INT_CLEANUP(array_maxprocs);
    opal_argv_free(c_array_commands);

    /* Only free the argv arrays if we converted them ourselves */
    if (MPI_ARGVS_NULL != c_array_argv && NULL != c_array_argv) {
        for (i = 0; i < OMPI_FINT_2_INT(*count); ++i) {
            opal_argv_free(c_array_argv[i]);
        }
        free(c_array_argv);
    }
    free(c_info);
}
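The core job of the shim above is translating Fortran INTEGER handles (MPI_Fint) into C handles and back. A minimal sketch of that conversion pattern in isolation, using the standard MPI_Comm_f2c/MPI_Comm_c2f routines; the function name and its use here are illustrative, not part of the Open MPI source.

/* Sketch: accept a communicator handle from Fortran, duplicate it in C,
 * and hand a Fortran handle back. */
#include <mpi.h>

MPI_Fint duplicate_from_fortran(MPI_Fint f_comm)
{
    MPI_Comm c_comm = MPI_Comm_f2c(f_comm);  /* Fortran INTEGER -> C handle */
    MPI_Comm c_dup;
    MPI_Comm_dup(c_comm, &c_dup);
    return MPI_Comm_c2f(c_dup);              /* C handle -> Fortran INTEGER */
}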
/* Repair a communicator after process failure: shrink out the failed ranks,
 * respawn replacements with MPI_Comm_spawn_multiple, merge them back in,
 * and split to restore the original rank ordering. */
void repairComm(MPI_Comm * broken, MPI_Comm * repaired, int iteration,
                int * listFails, int * numFails, int * numNodeFails,
                int sumPrevNumNodeFails, int argc, char ** argv, int verbosity)
{
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    int i, ret, result, procsNeeded = 0, oldRank, newRank, oldGroupSize,
        rankKey = 0, flag;
    int * tempRanks, * failedRanks, * errCodes, rank, hostfileLineIndex;
    MPI_Group oldGroup, failedGroup, shrinkGroup;
    int hostfileLastLineIndex, tempLineIndex, * failedNodeList = NULL,
        * nodeList = NULL, totNodeFailed = 0;
    double startTime = 0.0, endTime;
    int nprocs, j, * shrinkMergeList;
    char hostName[128];
    gethostname(hostName, sizeof(hostName));

    char ** appToLaunch;
    char *** argvToLaunch;
    int * procsNeededToLaunch;
    MPI_Info * hostInfoToLaunch;
    char ** hostNameToLaunch;

    MPI_Comm_rank(*broken, &rank);
    if (rank == 0)
        startTime = MPI_Wtime();

#ifndef GLOBAL_DETECTION
    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_rank(*broken, &oldRank);
    OMPI_Comm_failure_ack(*broken);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);
    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out the ranks of the processes that failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);
#pragma omp parallel for default(shared)
    for (i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks,
                              oldGroup, failedRanks);
#endif

    double shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove the failed procs
    if (MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else {
        if (verbosity > 1)
            printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

#ifdef GLOBAL_DETECTION
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    MPI_Group_compare(oldGroup, shrinkGroup, &result);

    if (result != MPI_IDENT)
        MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);

    MPI_Comm_rank(*broken, &oldRank);
    MPI_Group_size(failedGroup, &procsNeeded);

    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out the ranks of the processes that failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);
#pragma omp parallel for default(shared)
    for (i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks,
                              oldGroup, failedRanks);
    MPI_Group_free(&shrinkGroup);
#endif

    // Record the number of failed processes
    *numFails = procsNeeded;
    hostNameToLaunch = (char **) malloc(procsNeeded * sizeof(char *));

    if (verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the "
               "corresponding communicator is %d ***\n", iteration, procsNeeded);

    if (rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n",
               rank, endTime - startTime);
    }

#ifdef RECOV_ON_SPARE_NODES
    // Determine the total number of failed nodes, and build a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); // zero-based
    nodeList = (int *) malloc((hostfileLastLineIndex + 1) * sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex + 1) * sizeof(int)); // initialize nodeList with 0's

    for (int i = 0; i < procsNeeded; ++i) {
        tempLineIndex = failedRanks[i] / SLOTS; // zero-based
        nodeList[tempLineIndex] = 1;
    }

    for (int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex + 1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check whether enough spare nodes are available for recovery
    if ((hostfileLastLineIndex - totNodeFailed - sumPrevNumNodeFails)
            < (oldGroupSize - 1) / SLOTS) {
        if (rank == 0)
            printf("[%d] Not enough spare nodes are available for recovery.\n", rank);
        exit(0);
    }

    failedNodeList = (int *) malloc(totNodeFailed * sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    int failedNodeCounter = 0;
    for (int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex + 1); ++nodeCounter) {
        if (nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;
    }
#endif

    char * hostNameFailed = NULL;
#pragma omp parallel for default(shared)
    for (i = 0; i < procsNeeded; ++i) {
        // Record the list of failed processes
        listFails[i] = failedRanks[i];
#ifdef RUN_ON_COMPUTE_NODES
        tempLineIndex = failedRanks[i] / SLOTS; // zero-based
#ifdef RECOV_ON_SPARE_NODES
        for (int j = 0; j < totNodeFailed; ++j) {
            if (failedNodeList[j] == tempLineIndex)
                hostfileLineIndex = hostfileLastLineIndex - j - sumPrevNumNodeFails;
        }
#else // Recover on the same node (process failure only, no node failure)
        hostfileLineIndex = tempLineIndex;
#endif
        hostNameToLaunch[i] = getHostToLaunch(hostfileLineIndex);
        // Note: with multiple failures this pointer is overwritten each
        // iteration, so only the last string is freed below
        hostNameFailed = getHostToLaunch(tempLineIndex);
#else // Run on the head node or a personal machine
        hostNameToLaunch[i] = (char *) hostName;
        hostNameFailed = (char *) hostName;
#endif

        if (verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s has failed! ---\n",
                   iteration, failedRanks[i], hostNameFailed);
    }
#ifdef RUN_ON_COMPUTE_NODES
    // Release memory of hostNameFailed; only heap strings from
    // getHostToLaunch() may be freed (not the stack buffer hostName)
    free(hostNameFailed);
#endif

    appToLaunch = (char **) malloc(procsNeeded * sizeof(char *));
    argvToLaunch = (char ***) malloc(procsNeeded * sizeof(char **));
    procsNeededToLaunch = (int *) malloc(procsNeeded * sizeof(int));
    hostInfoToLaunch = (MPI_Info *) malloc(procsNeeded * sizeof(MPI_Info));
    argv[argc] = NULL;
#pragma omp parallel for default(shared)
    for (i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *) argv[0];
        argvToLaunch[i] = (char **) argv;
        procsNeededToLaunch[i] = 1;
        // Host information for where to spawn each replacement process
        MPI_Info_create(&hostInfoToLaunch[i]);
        MPI_Info_set(hostInfoToLaunch[i], (char *) "host", hostNameToLaunch[i]);
        //MPI_Info_set(hostInfoToLaunch[i], "hostfile", "hostfile");
    }

    double spawnTime = MPI_Wtime();
#ifdef HANG_ON_REMOVE
    OMPI_Comm_agree(tempShrink, &flag);
#endif
    // Spawn the replacement process(es)
    if (MPI_SUCCESS != (ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch,
                            argvToLaunch, procsNeededToLaunch, hostInfoToLaunch,
                            0, tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE))) {
        free(tempRanks);
        free(failedRanks);
        free(errCodes);
        if (MPI_ERR_COMM == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n",
                   iteration);
        if (MPI_ERR_ARG == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n",
                   iteration);
        if (MPI_ERR_INFO == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n",
                   iteration);

        if ((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is another failure
            OMPI_Comm_revoke(tempShrink);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        } else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n",
                    iteration, ret);
            exit(1);
        }
    } else {
        if (verbosity > 0 && rank == 0) {
            for (i = 0; i < procsNeeded; i++)
                printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed "
                       "process %d on node %s]: SUCCESS\n",
                       iteration, failedRanks[i], hostNameToLaunch[i]);
        }
        // Memory release. Moving the last two frees to the end of the
        // function causes segmentation faults for a 4-process failure
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    double mergeTime = MPI_Wtime();
    // Merge the new processes into a new intracommunicator
    if (MPI_SUCCESS != (ret = MPI_Intercomm_merge(tempIntercomm, false, &unorderIntracomm))) {
        free(tempRanks);
        free(failedRanks);
        if ((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is another failure
            OMPI_Comm_revoke(tempIntercomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        } else if (MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n",
                    iteration, ret);
            exit(1);
        } else if (MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Error acquiring memory in MPI_Intercomm_merge %d\n",
                    iteration, ret);
            exit(1);
        } else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n",
                    iteration, ret);
            exit(1);
        }
    } else {
        if (verbosity > 1)
            printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    double agreeTime = MPI_Wtime();
    // Synchronize; the code sometimes hangs without this. Both the position
    // of this call and using the intercommunicator (not the intracommunicator)
    // are important.
#ifdef HANG_ON_REMOVE
    //MPI_Barrier(tempIntercomm);
    OMPI_Comm_agree(tempIntercomm, &flag); // MPI_Barrier sometimes hangs here
#endif
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Send the failed ranks and the number of failed processes to the newly
    // created ranks. oldGroupSize is the communicator size before the failure;
    // procsNeeded is the number of failed processes.
    int * child = (int *) malloc(procsNeeded * sizeof(int));
#pragma omp parallel for default(shared)
    for (i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    if (newRank == 0) {
        int send_val[2];
        for (i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            if (MPI_SUCCESS != (ret = MPI_Send(send_val, 2, MPI_INT, child[i],
                                               MERGE_TAG, unorderIntracomm))) {
                free(tempRanks);
                free(failedRanks);
                if ((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is another failure
                    OMPI_Comm_revoke(unorderIntracomm);
                    return repairComm(broken, repaired, iteration, listFails,
                                      numFails, numNodeFails, sumPrevNumNodeFails,
                                      argc, argv, verbosity);
                } else {
                    fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n",
                            iteration, ret);
                    exit(1);
                }
            } else {
                if (verbosity > 1)
                    printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);
            }
        }
    }

    // Split the current world (already split from the original) to reorder the ranks
    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);

    // Ordering for one or more process failures
    shrinkMergeList = (int *) malloc(nprocs * sizeof(int));

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;
    }

    for (i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i - j];

    for (i = 0; i < (nprocs - procsNeeded); i++) {
        if (newRank == i)
            rankKey = shrinkMergeList[i];
    }

    if (MPI_SUCCESS != (ret = MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired))) {
        if ((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is another failure
            OMPI_Comm_revoke(unorderIntracomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        } else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n",
                    iteration, ret);
            exit(1);
        }
    } else {
        if (verbosity > 1)
            printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);
    }

    // Release memory
    free(appToLaunch);
    free(argvToLaunch);
    free(procsNeededToLaunch);
    free(hostInfoToLaunch);
    free(hostNameToLaunch);
    free(shrinkMergeList);
    free(errCodes);
    MPI_Comm_free(&tempShrink);
    free(tempRanks);
    free(failedRanks);
    free(child);
    MPI_Group_free(&failedGroup);
    MPI_Group_free(&oldGroup);
    MPI_Comm_free(&tempIntercomm);
    MPI_Comm_free(&unorderIntracomm);
#ifdef RECOV_ON_SPARE_NODES
    if (failedNodeList != NULL)
        free(failedNodeList);
    if (nodeList != NULL)
        free(nodeList);
#endif
} // repairComm()
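A hedged sketch of what the respawned (child) side of repairComm() might look like; the actual child code is not shown above, so the details here are assumptions inferred from the parent: the parent merges with high = 0 and sends {failedRank, procsNeeded} to each new rank with MERGE_TAG, and both sides must participate in the same MPI_Comm_split. The function name repairCommChild and the MERGE_TAG fallback value are hypothetical.

/* Sketch (assumed child-side counterpart, not from the original source). */
#include <mpi.h>

#ifndef MERGE_TAG
#define MERGE_TAG 100  /* placeholder; must match the parent's MERGE_TAG */
#endif

void repairCommChild(MPI_Comm *repaired)
{
    MPI_Comm parent, unorderIntracomm;
    int vals[2];

    MPI_Comm_get_parent(&parent);                      /* intercomm to the survivors */
    MPI_Intercomm_merge(parent, 1, &unorderIntracomm); /* high = 1: new ranks ordered last */
    /* vals[0] = this process's original (failed) rank; vals[1] = #failed */
    MPI_Recv(vals, 2, MPI_INT, 0, MERGE_TAG, unorderIntracomm, MPI_STATUS_IGNORE);
    /* Same color (0) as the parent side; using the original rank as the key
     * matches shrinkMergeList[] and restores the pre-failure ordering. */
    MPI_Comm_split(unorderIntracomm, 0, vals[0], repaired);
    MPI_Comm_free(&unorderIntracomm);
}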
/* Spawn one ARMCI data server process per node via MPI_Comm_spawn_multiple. */
static void armci_mpi2_spawn()
{
    int i;
    char server_program[100];
    char **command_arr = NULL, **hostname_arr = NULL, **nid_arr = NULL;
    int *size_arr = NULL;
    MPI_Info *info_arr;

    /* we need to start 1 data server process on each node, so a total of
     * "armci_nclus" data servers */
    armci_nserver = armci_nclus;
    select_server_program(server_program, armci_nserver);

    armci_mpi2_debug(0, "armci_mpi2_init(): Spawning %d data server processes "
                     "running %s\n", armci_nserver, server_program);

    /* allocate the necessary data structures */
    {
        command_arr  = (char **)    malloc(armci_nserver * sizeof(char *));
        size_arr     = (int *)      malloc(armci_nserver * sizeof(int));
        info_arr     = (MPI_Info *) malloc(armci_nserver * sizeof(MPI_Info));
        hostname_arr = (char **)    malloc(armci_nserver * sizeof(char *));
#ifdef SPAWN_CRAY_XT
        nid_arr      = (char **)    malloc(armci_nserver * sizeof(char *));
#endif
        for (i = 0; i < armci_nserver; i++) {
            hostname_arr[i] = (char *) malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
        }

        if (command_arr == NULL || size_arr == NULL || info_arr == NULL ||
            hostname_arr == NULL) {
            armci_die("armci_mpi2_spawn: malloc failed.", 0);
        }
    }

    /**
     * 1. The root process collects the hostnames (i.e. machine names) of the
     * nodes where the data servers are to be spawned. The ARMCI master of
     * each node returns its hostname.
     */
    armci_gather_hostnames(hostname_arr);

    /** 2. initialize the MPI_Comm_spawn_multiple() arguments */
    {
        for (i = 0; i < armci_nserver; i++) {
            command_arr[i] = (*_armci_argv)[0]; /* CHECK: path needs fix */
            size_arr[i] = 1;                    /* 1 data server on each node */
            MPI_Info_create(&info_arr[i]);
#ifdef SPAWN_CRAY_XT
            asprintf(&nid_arr[i], "%d", atoi(hostname_arr[i] + 3));
            MPI_Info_set(info_arr[i], "host", nid_arr[i]);      /* portability? */
#else
            MPI_Info_set(info_arr[i], "host", hostname_arr[i]); /* portability? */
#endif
        }
    }

    /**
     * 3. MPI_Comm_spawn_multiple(): This is a collective call. The
     * intercommunicator "ds_intercomm" contains only the new data server
     * processes.
     */
    MPI_Check(
        MPI_Comm_spawn_multiple(armci_nserver, command_arr, MPI_ARGVS_NULL,
                                size_arr, info_arr, ARMCI_ROOT, MPI_COMM_WORLD,
                                &MPI_COMM_CLIENT2SERVER, MPI_ERRCODES_IGNORE)
    );

    /* release the spawn arguments */
    {
        for (i = 0; i < armci_nserver; i++) {
            free(hostname_arr[i]);
            MPI_Info_free(&info_arr[i]);
        }
        free(command_arr);
        free(size_arr);
        free(info_arr);
        free(hostname_arr);
#ifdef SPAWN_CRAY_XT
        for (i = 0; i < armci_nserver; i++)
            free(nid_arr[i]);  /* strings allocated by asprintf() */
        free(nid_arr);
#endif
    }
}
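A hedged sketch of the data server side of the spawn above; the actual ARMCI server startup code is not shown here, so the function name is illustrative. A spawned server recovers the client-to-server intercommunicator (the server-side counterpart of MPI_COMM_CLIENT2SERVER) with the standard MPI_Comm_get_parent() call.

/* Sketch: a spawned data server obtaining its intercommunicator back to
 * the spawning clients. */
#include <mpi.h>

static MPI_Comm server_get_client_comm(void)
{
    MPI_Comm clients;
    /* Returns MPI_COMM_NULL unless this process was spawned */
    MPI_Comm_get_parent(&clients);
    return clients;
}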