#include <stdio.h>
#include <unistd.h>
#include <mpi.h>

int main(int argc, char* argv[])
{
    int msg;
    MPI_Comm parent, child;
    int rank, size;
    char hostname[512];
    pid_t pid;
    char *cmds[2];
    char *argv0[] = { "foo", NULL };
    char *argv1[] = { "bar", NULL };
    char **spawn_argv[2];
    int maxprocs[] = { 1, 1 };
    MPI_Info info[] = { MPI_INFO_NULL, MPI_INFO_NULL };

    cmds[1] = cmds[0] = argv[0];
    spawn_argv[0] = argv0;
    spawn_argv[1] = argv1;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_get_parent(&parent);
    /* If we get COMM_NULL back, then we're the parent */
    if (MPI_COMM_NULL == parent) {
        pid = getpid();
        printf("Parent [pid %ld] about to spawn!\n", (long)pid);
        MPI_Comm_spawn_multiple(2, cmds, spawn_argv, maxprocs, 
                                info, 0, MPI_COMM_WORLD,
                                &child, MPI_ERRCODES_IGNORE);
        printf("Parent done with spawn\n");
        if (0 == rank) {
            msg = 38;
            printf("Parent sending message to children\n");
            MPI_Send(&msg, 1, MPI_INT, 0, 1, child);
            MPI_Send(&msg, 1, MPI_INT, 1, 1, child);
        }
        MPI_Comm_disconnect(&child);
        printf("Parent disconnected\n");
    } 
    /* Otherwise, we're the child */
    else {
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        gethostname(hostname, 512);
        pid = getpid();
        printf("Hello from the child %d of %d on host %s pid %ld: argv[1] = %s\n", rank, size, hostname, (long)pid, argv[1]);
        /* Each child matches one of the two sends from parent rank 0; guarding
           this with (0 == rank) would leave the send to child rank 1 unmatched
           and could hang MPI_Comm_disconnect */
        MPI_Recv(&msg, 1, MPI_INT, 0, 1, parent, MPI_STATUS_IGNORE);
        printf("Child %d received msg: %d\n", rank, msg);
        MPI_Comm_disconnect(&parent);
        printf("Child %d disconnected\n", rank);
    }

    MPI_Finalize();
    return 0;
}
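For contrast with the multi-command call above: when all spawned processes run the same binary with the same argument vector, the single-command MPI_Comm_spawn does the job with less setup. A minimal parent-side sketch (the child binary name "./worker" is a placeholder):

#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Comm child;
    char *child_argv[] = { "foo", NULL };

    MPI_Init(&argc, &argv);
    /* spawn 2 copies of one command rather than 2 different commands */
    MPI_Comm_spawn("./worker", child_argv, 2, MPI_INFO_NULL, 0,
                   MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE);
    MPI_Comm_disconnect(&child);
    MPI_Finalize();
    return 0;
}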
Example #2
void mpi_comm_spawn_multiple_f(MPI_Fint *count, char *array_commands,
			       char *array_argv,
			       MPI_Fint *array_maxprocs,
			       MPI_Fint *array_info, MPI_Fint *root,
			       MPI_Fint *comm, MPI_Fint *intercomm,
			       MPI_Fint *array_errcds, MPI_Fint *ierr,
			       int cmd_len, int argv_len)
{
    MPI_Comm c_comm, c_new_comm;
    MPI_Info *c_info;
    int size, array_size, i;
    int *c_errs;
    char **c_array_commands;
    char ***c_array_argv;
    OMPI_ARRAY_NAME_DECL(array_maxprocs);
    OMPI_ARRAY_NAME_DECL(array_errcds);
    
    c_comm = MPI_Comm_f2c(*comm);
    
    MPI_Comm_size(c_comm, &size);

    array_size = OMPI_FINT_2_INT(*count);

    /* It's allowed to ignore the errcodes */

    if (OMPI_IS_FORTRAN_ERRCODES_IGNORE(array_errcds)) {
        c_errs = MPI_ERRCODES_IGNORE;
    } else {
        OMPI_ARRAY_FINT_2_INT_ALLOC(array_errcds, size);
        c_errs = OMPI_ARRAY_NAME_CONVERT(array_errcds);
    }

    /* It's allowed to have no argv */

    if (OMPI_IS_FORTRAN_ARGVS_NULL(array_argv)) {
        c_array_argv = MPI_ARGVS_NULL;
    } else {
        ompi_fortran_multiple_argvs_f2c(OMPI_FINT_2_INT(*count), array_argv,
                                        argv_len, &c_array_argv);
    }

    OMPI_ARRAY_FINT_2_INT(array_maxprocs, array_size);
    
    ompi_fortran_argv_f2c(array_commands, cmd_len, &c_array_commands);

    c_info = malloc(array_size * sizeof(MPI_Info));
    for (i = 0; i < array_size; ++i) {
        c_info[i] = MPI_Info_f2c(array_info[i]);
    }

    *ierr =
        OMPI_INT_2_FINT(MPI_Comm_spawn_multiple(OMPI_FINT_2_INT(*count),
                                                c_array_commands,
                                                c_array_argv,
                                                OMPI_ARRAY_NAME_CONVERT(array_maxprocs),
                                                c_info,
                                                OMPI_FINT_2_INT(*root),
                                                c_comm, &c_new_comm,
                                                c_errs));
    if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) {
        *intercomm = MPI_Comm_c2f(c_new_comm);
    }

    if (!OMPI_IS_FORTRAN_ERRCODES_IGNORE(array_errcds)) {
        OMPI_ARRAY_INT_2_FINT(array_errcds, size);
    }
    OMPI_ARRAY_FINT_2_INT_CLEANUP(array_maxprocs);

    free(c_info);                     /* release the temporary MPI_Info array */
    opal_argv_free(c_array_commands);

    if (MPI_ARGVS_NULL != c_array_argv && NULL != c_array_argv) {
        for (i = 0; i < OMPI_FINT_2_INT(*count); ++i) {
            opal_argv_free(c_array_argv[i]);
        }
        free(c_array_argv);
    }
}
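Most of the work in this wrapper is string conversion: Fortran passes fixed-width, blank-padded character arrays with no NUL terminators, which ompi_fortran_argv_f2c and ompi_fortran_multiple_argvs_f2c turn into NULL-terminated C argument vectors. A sketch of the underlying idea (hypothetical helper, not Open MPI's actual implementation):

#include <stdlib.h>
#include <string.h>

/* Convert `count` Fortran strings, each exactly `len` chars and padded with
 * trailing blanks, into a NULL-terminated C argv. */
static char **fortran_strings_to_argv(const char *flat, int count, int len)
{
    char **argv = malloc((count + 1) * sizeof(char *));
    int i;
    for (i = 0; i < count; ++i) {
        int end = len;
        while (end > 0 && flat[i * len + end - 1] == ' ')
            --end;                          /* strip the blank padding */
        argv[i] = malloc(end + 1);
        memcpy(argv[i], flat + i * len, end);
        argv[i][end] = '\0';
    }
    argv[count] = NULL;
    return argv;
}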
Example #3
void repairComm(MPI_Comm * broken, MPI_Comm * repaired, int iteration, int * listFails, int * numFails,
                int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    int i, ret, result, procsNeeded = 0, oldRank, newRank, oldGroupSize, rankKey = 0, flag;
    int * tempRanks, * failedRanks, * errCodes, rank, hostfileLineIndex;
    MPI_Group oldGroup, failedGroup, shrinkGroup;
    int hostfileLastLineIndex, tempLineIndex, * failedNodeList = NULL, * nodeList = NULL, totNodeFailed = 0;
    double startTime = 0.0, endTime;
    int nprocs, j, * shrinkMergeList;
    char hostName[128];
    gethostname(hostName, sizeof(hostName));

    char ** appToLaunch;
    char *** argvToLaunch;
    int * procsNeededToLaunch;
    MPI_Info * hostInfoToLaunch;
    char ** hostNameToLaunch;

    MPI_Comm_rank(*broken, &rank);
    if(rank == 0)
        startTime = MPI_Wtime();

#ifndef GLOBAL_DETECTION
    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_rank(*broken, &oldRank);
    OMPI_Comm_failure_ack(*broken);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);
    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);
#endif
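    // (Local-detection path: ULFM's OMPI_Comm_failure_ack/get_acked yield the
    // group of processes this rank has observed as failed, and
    // MPI_Group_translate_ranks maps them back to ranks in the broken
    // communicator.)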

    double shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove failed procs
    if(MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else {
        if(verbosity > 1 )
            printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

#ifdef GLOBAL_DETECTION
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    MPI_Group_compare(oldGroup, shrinkGroup, &result);

    if(result != MPI_IDENT)
        MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);

    MPI_Comm_rank(*broken, &oldRank);
    MPI_Group_size(failedGroup, &procsNeeded);

    errCodes = (int *) malloc(sizeof(int)*procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    failedRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);

    MPI_Group_free(&shrinkGroup);
#endif

    // Assign number of failed processes
    *numFails = procsNeeded;

    hostNameToLaunch = (char **) malloc(procsNeeded * sizeof(char *));

    if(verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the corresponding "
               "communicator is %d ***\n", iteration, procsNeeded);

    if(rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);
    }

#ifdef RECOV_ON_SPARE_NODES
    // Determine the total number of failed nodes and build a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); // zero-based
    nodeList = (int *) malloc((hostfileLastLineIndex+1) * sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex+1)*sizeof(int)); // initialize nodeList with 0's

    for(int i = 0; i < procsNeeded; ++i) {
        tempLineIndex = failedRanks[i]/SLOTS; // zero-based
        nodeList[tempLineIndex] = 1;
    }

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check if there is sufficient spare node available for recovery
    if((hostfileLastLineIndex - totNodeFailed - sumPrevNumNodeFails) < (oldGroupSize-1)/SLOTS) {
        if(rank == 0)
            printf("[%d] There are not enough spare nodes available for recovery.\n", rank);
        exit(0);
    }

    failedNodeList = (int *) malloc(totNodeFailed * sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    int failedNodeCounter = 0;
    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter) {
        if(nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;
    }
#endif
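    // (This assumes the hostfile packs SLOTS ranks per line with spare nodes
    // listed at the end, so replacement hosts are taken from the last lines,
    // skipping nodes consumed by earlier recovery iterations.)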

    char * hostNameFailed = NULL;
    // This loop must stay serial: getHostToLaunch() allocates, and the shared
    // hostNameFailed pointer is overwritten on every iteration
    for(i = 0; i < procsNeeded; ++i) {
        // Assign list of processes failed
        listFails[i] = failedRanks[i];

#ifdef RUN_ON_COMPUTE_NODES
        tempLineIndex = failedRanks[i]/SLOTS; // zero-based
#ifdef RECOV_ON_SPARE_NODES
        for(int j = 0; j < totNodeFailed; ++j) {
            if(failedNodeList[j] == tempLineIndex)
                hostfileLineIndex = hostfileLastLineIndex - j - sumPrevNumNodeFails;
        }
#else // Recovery on the same node (no node failure, only process failure)
        hostfileLineIndex = tempLineIndex;
#endif
        hostNameToLaunch[i] = getHostToLaunch(hostfileLineIndex);
        hostNameFailed = getHostToLaunch(tempLineIndex);
#else // Run on head node or personal machine
        hostNameToLaunch[i] = (char *)hostName;
        hostNameFailed = (char *)hostName;
#endif

        if(verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s failed! ---\n", iteration, failedRanks[i], hostNameFailed);
    }
#ifdef RUN_ON_COMPUTE_NODES
    // Release hostNameFailed only when it was heap-allocated by
    // getHostToLaunch(); on the head-node path it aliases the stack buffer
    free(hostNameFailed);
#endif

    appToLaunch = (char **) malloc(procsNeeded * sizeof(char *));
    argvToLaunch = (char ***) malloc(procsNeeded * sizeof(char **));
    procsNeededToLaunch = (int *) malloc(procsNeeded * sizeof(int));
    hostInfoToLaunch = (MPI_Info *) malloc(procsNeeded * sizeof(MPI_Info));
    argv[argc] = NULL;
    // Serial on purpose: MPI_Info_create/MPI_Info_set are not guaranteed
    // thread-safe without MPI_THREAD_MULTIPLE
    for(i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *)argv[0];
        argvToLaunch[i] = (char **)argv;
        procsNeededToLaunch[i] = 1;
        // Host information where to spawn the processes
        MPI_Info_create(&hostInfoToLaunch[i]);
        MPI_Info_set(hostInfoToLaunch[i], (char *)"host", hostNameToLaunch[i]);
        //MPI_Info_set(hostInfoToLaunch[i], "hostfile", "hostfile");
    }

    double spawnTime = MPI_Wtime();
#ifdef HANG_ON_REMOVE
    OMPI_Comm_agree(tempShrink, &flag);
#endif
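    // OMPI_Comm_agree is ULFM's fault-tolerant agreement; unlike MPI_Barrier
    // it completes even if more processes fail, so all survivors reach the
    // collective spawn below together.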
    // Spawn the new process(es)
    if(MPI_SUCCESS != (ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch, argvToLaunch, procsNeededToLaunch,
                             hostInfoToLaunch, 0, tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE))) {
        free(tempRanks);
        free(failedRanks);
        free(errCodes);
        if(MPI_ERR_COMM  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n", iteration);
        if(MPI_ERR_ARG  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n", iteration);
        if(MPI_ERR_INFO  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n", iteration);

        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            OMPI_Comm_revoke(tempShrink);
            return repairComm(broken, repaired, iteration, listFails, numFails, numNodeFails,
                              sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 0 && rank == 0) {
            for(i = 0; i < procsNeeded; i++)
                printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed process %d on "
                       "node %s]: SUCCESS\n", iteration, failedRanks[i], hostNameToLaunch[i]);
        }
        // Note: moving the last two frees to the end of the function causes
        // segmentation faults when 4 processes fail
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    double mergeTime = MPI_Wtime();
    // Merge the new processes into a new communicator
    if(MPI_SUCCESS != (ret = MPI_Intercomm_merge(tempIntercomm, 0, &unorderIntracomm))) {
        free(tempRanks);
        free(failedRanks);
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(tempIntercomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else if(MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n", iteration, ret);
            exit(1);
        }
        else if(MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Acquaring memory error in MPI_Intercomm_merge ()%d\n", iteration, ret);
            exit(1);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    double agreeTime = MPI_Wtime();
    // Synchronize; this sometimes hangs without it. Both the placement of
    // this call and using the intercommunicator (not the intra) matter.
#ifdef HANG_ON_REMOVE
    //MPI_Barrier(tempIntercomm);
    OMPI_Comm_agree(tempIntercomm, &flag); // MPI_Barrier sometimes hangs here
#endif
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Send the failed ranks and the number of failed processes to the newly
    // created ranks. oldGroupSize is the communicator size before the failure;
    // procsNeeded is the number of failed processes.
    int * child = (int *) malloc(procsNeeded*sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    if(newRank == 0) {
        int send_val[2];
        for(i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            if(MPI_SUCCESS != (ret = MPI_Send(&send_val, 2, MPI_INT, child[i], MERGE_TAG, unorderIntracomm))) {
                free(tempRanks);
                free(failedRanks);
                if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is a failure
                    OMPI_Comm_revoke(unorderIntracomm);
                    return repairComm(broken, repaired, iteration, listFails, numFails,
                                      numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
                }
                else {
                    fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n", iteration, ret);
                    exit(1);
                }
            }
            else {
                if(verbosity > 1 )
                    printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);
            }
        }
    }

    // Split the current communicator (derived from the original) to reorder the ranks.
    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);

    // Ordering for one or more process failures
    shrinkMergeList = (int *) malloc(nprocs*sizeof(int));

    j = 0;
    for(i = 0; i < nprocs; i++) {
        if(rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;
    }

    for(i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i-j];

    for(i = 0; i < (nprocs - procsNeeded); i++) {
        if(newRank == i)
            rankKey = shrinkMergeList[i];
    }
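    // Survivors use their pre-failure rank as the split key; the respawned
    // processes (which received their assigned failed rank over MERGE_TAG)
    // presumably do the same on their side, so the repaired communicator
    // reproduces the original numbering.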

    if(MPI_SUCCESS != (ret = MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired))) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(unorderIntracomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);
    }

    // Release memory
    free(appToLaunch);
    free(argvToLaunch);
    free(procsNeededToLaunch);
    free(hostInfoToLaunch);
    free(hostNameToLaunch);
    free(shrinkMergeList);
    free(errCodes);
    MPI_Comm_free(&tempShrink);
    free(tempRanks);
    free(failedRanks);
    free(child);
    MPI_Group_free(&failedGroup);
    MPI_Group_free(&oldGroup);
    MPI_Comm_free(&tempIntercomm);
    MPI_Comm_free(&unorderIntracomm);
#ifdef RECOV_ON_SPARE_NODES
    if(failedNodeList != NULL)
        free(failedNodeList);
    if(nodeList != NULL)
        free(nodeList);
#endif
}//repairComm()
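repairComm() calls a helper rankIsNotOnFailedList() that is not part of this listing. A minimal sketch consistent with how it is used above:

// Hypothetical reconstruction: nonzero if `rank` does not appear among the
// first `numFailed` entries of `failedRanks`.
static int rankIsNotOnFailedList(int rank, int *failedRanks, int numFailed)
{
    int i;
    for (i = 0; i < numFailed; i++) {
        if (failedRanks[i] == rank)
            return 0;
    }
    return 1;
}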
Example #4
static void armci_mpi2_spawn() 
{

    int i;
    char server_program[100];
    char **command_arr=NULL, **hostname_arr=NULL, **nid_arr=NULL;
    int *size_arr=NULL;
    MPI_Info *info_arr;
    
    /* we need to start 1 data server process on each node. So a total of
       "armci_nclus" data servers */
    armci_nserver = armci_nclus;
    select_server_program(server_program, armci_nserver);
    
    armci_mpi2_debug(0, "armci_mpi2_init(): Spawning %d data server processes "
                     "running %s\n", armci_nserver, server_program);

    /* allocate necessary data structures */
    {
       command_arr  = (char**)    malloc(armci_nserver * sizeof(char*));
       size_arr     = (int*)      malloc(armci_nserver * sizeof(int));
       info_arr     = (MPI_Info*) malloc(armci_nserver * sizeof(MPI_Info));
       hostname_arr = (char**)    malloc(armci_nserver * sizeof(char*));
#ifdef SPAWN_CRAY_XT
       nid_arr      = (char**)    malloc(armci_nserver * sizeof(char*));
#endif
       /* check the top-level allocations before they are dereferenced below */
       if(command_arr==NULL || size_arr==NULL || info_arr==NULL ||
          hostname_arr==NULL)
       {
          armci_die("armci_mpi2_spawn: malloc failed.", 0);
       }

       for(i=0; i<armci_nserver; i++)
       {
          hostname_arr[i] = (char*)malloc(MPI_MAX_PROCESSOR_NAME*sizeof(char));
       }
    }
    
    /**
     * 1. The root process collects the hostnames (machine names) of the nodes
     * where the data servers are to be spawned. The ARMCI master of each node
     * returns its hostname.
     */
    armci_gather_hostnames(hostname_arr);
    
       
    /** 2. initialize MPI_Comm_spawn_multiple() arguments */
    {   
       for(i=0; i<armci_nserver; i++)
       {
          command_arr[i] = (*_armci_argv)[0];  /*CHECK: path needs fix */
          size_arr[i]    = 1;                /* 1 data server in each node */
          MPI_Info_create(&info_arr[i]);
#ifdef SPAWN_CRAY_XT
          asprintf(&nid_arr[i], "%d", atoi((hostname_arr[i] + 3)));
          MPI_Info_set(info_arr[i], "host", nid_arr[i]); /*portability? */
#else
          MPI_Info_set(info_arr[i], "host", hostname_arr[i]); /*portability? */
#endif
       }
    }

    
    /**
     * 3. MPI_Comm_spawn_multiple(): This is a collective call.
     * Intercommunicator "ds_intercomm" contains only new dataserver processes.
     */
    MPI_Check(
       MPI_Comm_spawn_multiple(armci_nserver, command_arr, MPI_ARGVS_NULL,
                               size_arr, info_arr, ARMCI_ROOT, MPI_COMM_WORLD,
                               &MPI_COMM_CLIENT2SERVER, MPI_ERRCODES_IGNORE)
       );


    {  
       for(i=0; i<armci_nserver; i++)  free(hostname_arr[i]);
       
       free(command_arr);
       free(size_arr);
       free(info_arr);
       free(hostname_arr);
#ifdef SPAWN_CRAY_XT
       for(i=0; i<armci_nserver; i++)  free(nid_arr[i]); /* asprintf'd strings */
       free(nid_arr);
#endif
    }
}
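The spawned data servers are not shown here; on their side, the intercommunicator back to the clients would typically be recovered with MPI_Comm_get_parent. A minimal sketch of such an entry point (all names hypothetical):

#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm clients;                /* intercomm back to the ARMCI clients */

    MPI_Init(&argc, &argv);
    MPI_Comm_get_parent(&clients);
    if (MPI_COMM_NULL == clients) {
        /* launched directly rather than spawned */
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* ... serve requests, then shut down ... */
    MPI_Comm_disconnect(&clients);
    MPI_Finalize();
    return 0;
}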