Example #1
bool SplitMPI_Communicator::CreateCommunicator(MPI_Comm comm_world, int np, int nb_ddc)
{
	int n_DDC;
	bool splitcomm;

	if ((nb_ddc > 0) && (nb_ddc < np))
	{ // if the total number of cores is larger than the number of DDCs, two new MPI
	  // groups will be generated
#ifdef OGS_FEM_IPQC
		splitcomm = true;
		n_DDC = nb_ddc; // number of ddc

		int DDC_ranks[n_DDC];
		for (int k = 0; k < n_DDC; k++)
		{
			DDC_ranks[k] = k;
		}

		MPI_Comm comm_IPQC;
		MPI_Group group_base, group_DDC, group_IPQC;

		// define MPI group and communicator for DDC related processes WH
		MPI_Comm_group(comm_world, &group_base);
		MPI_Group_incl(group_base, n_DDC, DDC_ranks, &group_DDC); // define group flow and mass transport
		MPI_Comm_create(comm_world, group_DDC, &comm_DDC);

		// define MPI group and communicator for IPQC WH
		MPI_Group_difference(group_base, group_DDC, &group_IPQC);
		MPI_Comm_create(comm_world, group_IPQC, &comm_IPQC);

		int myrank_IPQC, mysize_IPQC;
		MPI_Group_size(group_DDC, &mysize); // WH
		MPI_Group_rank(group_DDC, &myrank); // WH
		MPI_Group_rank(group_IPQC, &myrank_IPQC);
		MPI_Group_size(group_IPQC, &mysize_IPQC);
		if (myrank_IPQC != MPI_UNDEFINED) // WH
			std::cout << "After MPI_Init myrank_IPQC = " << myrank_IPQC << '\n';
		if (myrank != MPI_UNDEFINED) // WH
			std::cout << "After MPI_Init myrank_DDC = " << myrank << '\n';

		if (myrank_IPQC != MPI_UNDEFINED) // ranks of group_IPQC will call to IPhreeqc
			Call_IPhreeqc();
#endif
	}
	else
	{ // if no -ddc is specified or the number of DDCs is invalid, set ddc = np; no new
		// MPI groups will be generated
		splitcomm = false;
		n_DDC = np;
		comm_DDC = comm_world;
		MPI_Comm_size(comm_DDC, &mysize);
		MPI_Comm_rank(comm_DDC, &myrank);
		std::cout << "After MPI_Init myrank_DDC = " << myrank << '\n';
	}

	return splitcomm;
}
Example #2
int main( int argc, char **argv )
{
    int rank, size, i;
    MPI_Group group1, group2, group3, groupall, groupunion, newgroup;
    MPI_Comm newcomm;
    int ranks1[100], ranks2[100], ranks3[100];
    int nranks1=0, nranks2=0, nranks3=0;

    MPI_Init( &argc, &argv );
    MPI_Barrier( MPI_COMM_WORLD );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );
    MPI_Comm_group( MPI_COMM_WORLD, &groupall );

    /* Divide groups */
    for (i=0; i<size; i++) 
      if ( (i%3)==0 )
	ranks1[nranks1++] = i;
      else if ( (i%3)==1 )
	ranks2[nranks2++] = i;
      else
	ranks3[nranks3++] = i;

    MPI_Group_incl ( groupall, nranks1, ranks1, &group1 );
    MPI_Group_incl ( groupall, nranks2, ranks2, &group2 );
    MPI_Group_incl ( groupall, nranks3, ranks3, &group3 );

    MPI_Group_difference ( groupall, group2, &groupunion );

    MPI_Comm_create ( MPI_COMM_WORLD, group3, &newcomm );
    newgroup = MPI_GROUP_NULL;
    if (newcomm != MPI_COMM_NULL)
    {
	/* If we don't belong to group3, this would fail */
	MPI_Comm_group ( newcomm, &newgroup );
    }

    /* Free the groups */
    MPI_Group_free( &groupall );
    MPI_Group_free( &group1 );
    MPI_Group_free( &group2 );
    MPI_Group_free( &group3 );
    MPI_Group_free( &groupunion );
    if (newgroup != MPI_GROUP_NULL)
    {
	MPI_Group_free( &newgroup );
    }

    /* Free the communicator */
    if (newcomm != MPI_COMM_NULL)
	MPI_Comm_free( &newcomm );
    Test_Waitforall( );
    MPI_Finalize();
    return 0;
}
Example #3
File: mpi_Group.c  Project: 00datman/ompi
JNIEXPORT jlong JNICALL Java_mpi_Group_difference(
        JNIEnv *env, jclass jthis, jlong group1, jlong group2)
{
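    /* the jlong arguments hold the native MPI_Group handles, so they are simply
       cast back to MPI_Group before calling MPI_Group_difference */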
    MPI_Group newGroup;

    int rc = MPI_Group_difference(
             (MPI_Group)group1, (MPI_Group)group2, &newGroup);

    ompi_java_exceptionCheck(env, rc);
    return (jlong)newGroup;
}
Example #4
int main (int argc, char **argv)
{
  int num, i, rank, localRank;
  MPI_Group all, odd, even;
  MPI_Comm oddComm, evenComm;
  char mess[11];

  MPI_Init (&argc, &argv);
  // copy all the processes in group "all"
  MPI_Comm_group (MPI_COMM_WORLD, &all);
  MPI_Comm_size (MPI_COMM_WORLD, &num);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);

  int grN = 0;
  int ranks[(num + 1) / 2];     // holds the even ranks 0, 2, ... (ceil(num/2) entries)

  for (i = 0; i < num; i += 2)
    ranks[grN++] = i;

  // extract from "all" only the odd ones
  MPI_Group_excl (all, grN, ranks, &odd);
  // subtract the odd group from all to get the even ones
  MPI_Group_difference (all, odd, &even);

  MPI_Comm_create (MPI_COMM_WORLD, odd, &oddComm);
  MPI_Comm_create (MPI_COMM_WORLD, even, &evenComm);
  
  // check group membership
  MPI_Group_rank (odd, &localRank);
  if (localRank != MPI_UNDEFINED)
    {
      if (localRank == 0)       // local group root sets up the message
        strcpy (mess, "ODD GROUP");
      MPI_Bcast (mess, 11, MPI_CHAR, 0, oddComm);
      MPI_Comm_free (&oddComm);  // free communicator in processes where it is valid
    }
  else
    {
      MPI_Comm_rank (evenComm, &localRank);
      if (localRank == 0)       // local group root sets up the message
        strcpy (mess, "EVEN GROUP");
      MPI_Bcast (mess, 11, MPI_CHAR, 0, evenComm);
      MPI_Comm_free (&evenComm);
    }

  printf ("Process %i with local rank %i received %s\n", rank, localRank, mess);

  // free up memory
  MPI_Group_free (&all);
  MPI_Group_free (&odd);
  MPI_Group_free (&even);
  MPI_Finalize ();
  return 0;
}
Example #5
/*
 * Class:     mpi_Group
 * Method:    difference
 * Signature: (Lmpi/Group;Lmpi/Group;)J
 */
JNIEXPORT jlong JNICALL Java_mpi_Group_difference(JNIEnv *env, jclass jthis,
                                                  jobject group1, jobject group2)
{
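    /* the native MPI_Group handles are stored in a long field of the Java Group
       objects and read back below via GetLongField */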
    MPI_Group newgroup;

    ompi_java_clearFreeList(env) ;

    MPI_Group_difference((MPI_Group)((*env)->GetLongField(env,group1,ompi_java.GrouphandleID)),
                         (MPI_Group)((*env)->GetLongField(env,group2,ompi_java.GrouphandleID)),
                         &newgroup);
    return (jlong)newgroup;
}
Example #6
int main(int argc, char **argv)
{
	int rank, size, i;
	MPI_Group groupall, groupunion, newgroup, group[GROUPS];
	MPI_Comm newcomm;
	int ranks[GROUPS][100];
	int nranks[GROUPS] = { 0, 0, 0 };

	MPI_Init(&argc, &argv);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_group(MPI_COMM_WORLD, &groupall);

	/* Divide groups */
	for (i = 0; i < size; i++)
		ranks[i % GROUPS][nranks[i % GROUPS]++] = i;

	for (i = 0; i < GROUPS; i++)
		MPI_Group_incl(groupall, nranks[i], ranks[i], &group[i]);

	MPI_Group_difference(groupall, group[1], &groupunion);

	MPI_Comm_create(MPI_COMM_WORLD, group[2], &newcomm);
	newgroup = MPI_GROUP_NULL;
	if (newcomm != MPI_COMM_NULL)
	{
		/* If we don't belong to group[2], this would fail */
		MPI_Comm_group(newcomm, &newgroup);
	}

	/* Free the groups */
	MPI_Group_free(&groupall);
	for (i = 0; i < GROUPS; i++)
		MPI_Group_free(&group[i]);
	MPI_Group_free(&groupunion);
	if (newgroup != MPI_GROUP_NULL)
	{
		MPI_Group_free(&newgroup);
	}

	/* Free the communicator */
	if (newcomm != MPI_COMM_NULL)
		MPI_Comm_free(&newcomm);
	Test_Waitforall();
	Test_Global_Summary();
	MPI_Finalize();
	return 0;
}
Example #7
static VALUE group_difference(VALUE self, VALUE rgrp2)
{
    int rv;
    MPI_Group *grp1, *grp2, *newgrp;

    /* unwrap the MPI_Group structs wrapped in the Ruby group objects */
    Data_Get_Struct(self, MPI_Group, grp1);
    Data_Get_Struct(rgrp2, MPI_Group, grp2);

    newgrp = ALLOC(MPI_Group);

    rv = MPI_Group_difference(*grp1, *grp2, newgrp);
    mpi_exception(rv);

    return group_new(newgrp);
}
Example #8
void mpi_group_difference_f(MPI_Fint *group1, MPI_Fint *group2, MPI_Fint *newgroup, MPI_Fint *ierr)
{
  ompi_group_t *c_group1, *c_group2, *c_newgroup;

  /* Make the fortran to c representation conversion */
  c_group1 = MPI_Group_f2c(*group1);
  c_group2 = MPI_Group_f2c(*group2);
  
  *ierr = OMPI_INT_2_FINT(MPI_Group_difference(c_group1, c_group2, 
					       &c_newgroup));

  /* translate the results from c to fortran */
  if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) {
    *newgroup = c_newgroup->grp_f_to_c_index; 
  }
}
Example #9
int main (int argc, char **argv)
{
  int num, i, rank;
  MPI_Group all, odd, even;

  MPI_Init (&argc, &argv);
  // copy all the processes in group "all"
  MPI_Comm_group (MPI_COMM_WORLD, &all);
  MPI_Comm_size (MPI_COMM_WORLD, &num);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);

  int grN = 0;
  int ranks[(num + 1) / 2];     // holds the even ranks 0, 2, ... (ceil(num/2) entries)

  for (i = 0; i < num; i += 2)
    ranks[grN++] = i;

  // extract from "all" only the odd ones
  MPI_Group_excl (all, grN, ranks, &odd);
  // subtract the odd group from all to get the even ones
  MPI_Group_difference (all, odd, &even);

  // print group sizes
  if (rank == 0)
    {
      MPI_Group_size (odd, &i);
      printf ("Odd group has %i processes\n", i);
      MPI_Group_size (even, &i);
      printf ("Even group has %i processes\n", i);
    }

  // check group membership
  MPI_Group_rank (odd, &i);
  if (i == MPI_UNDEFINED)
    printf ("Process %i belongs to even group\n", rank);
  else
    printf ("Process %i belongs to odd group\n", rank);

  // free up memory
  MPI_Group_free (&all);
  MPI_Group_free (&odd);
  MPI_Group_free (&even);
  MPI_Finalize ();
  return 0;
}
Example #10
dart_ret_t dart_group_delmember(
  dart_group_t *g,
  dart_unit_t unitid)
{
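  /* build a group containing only unitid, then remove it from g's group via set difference */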
  int array[1];
  MPI_Group newgroup, group_all;
  MPI_Comm_group(MPI_COMM_WORLD, &group_all);
  array[0] = unitid;
  MPI_Group_incl(
    group_all,
    1,
    array,
    &newgroup);
  MPI_Group_difference(
    g -> mpi_group,
    newgroup,
    &(g -> mpi_group));
  return DART_OK;
}
Example #11
File: groups.c  Project: DMClambo/pfff
value caml_mpi_group_difference(value group1, value group2)
{
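  /* Group_val unwraps the MPI_Group handle stored in each OCaml group value */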
  MPI_Group group;
  MPI_Group_difference(Group_val(group1), Group_val(group2), &group);
  return caml_mpi_alloc_group(group);
}
Example #12
int
main (int argc, char **argv)
{
  int nprocs = -1;
  int rank = -1;
  int comm = MPI_COMM_WORLD;
  char processor_name[128];
  int namelen = 128;
  int i;
  int ranks[2], ranges[1][3];
  MPI_Group newgroup[GROUP_CONSTRUCTOR_COUNT]; 
  MPI_Group newgroup2[GROUP_CONSTRUCTOR_COUNT]; 
  MPI_Comm temp;
  MPI_Comm intercomm = MPI_COMM_NULL;

  /* init */
  MPI_Init (&argc, &argv);
  MPI_Comm_size (comm, &nprocs);
  MPI_Comm_rank (comm, &rank);
  MPI_Get_processor_name (processor_name, &namelen);
  printf ("(%d) is alive on %s\n", rank, processor_name);
  fflush (stdout);

  ranks[0] = 0;
  ranks[1] = 1;

  ranges[0][0] = 0;
  ranges[0][1] = 2;
  ranges[0][2] = 2;

  MPI_Barrier (comm);

  if (nprocs < 3) {
      printf ("requires at least 3 tasks\n");
  }
  else {
    /* create the groups */
    if (GROUP_CONSTRUCTOR_COUNT > 0)
      MPI_Comm_group (MPI_COMM_WORLD, &newgroup[0]);

    if (GROUP_CONSTRUCTOR_COUNT > 1)
      MPI_Group_incl (newgroup[0], 2, ranks, &newgroup[1]);    

    if (GROUP_CONSTRUCTOR_COUNT > 2)
      MPI_Group_excl (newgroup[0], 2, ranks, &newgroup[2]);

    if (GROUP_CONSTRUCTOR_COUNT > 3)
      MPI_Group_range_incl (newgroup[0], 1, ranges, &newgroup[3]);    

    if (GROUP_CONSTRUCTOR_COUNT > 4)
      MPI_Group_range_excl (newgroup[0], 1, ranges, &newgroup[4]);    

    if (GROUP_CONSTRUCTOR_COUNT > 5)
      MPI_Group_union (newgroup[1], newgroup[3], &newgroup[5]);

    if (GROUP_CONSTRUCTOR_COUNT > 6)
      MPI_Group_intersection (newgroup[5], newgroup[2], &newgroup[6]);

    if (GROUP_CONSTRUCTOR_COUNT > 7)
      MPI_Group_difference (newgroup[5], newgroup[2], &newgroup[7]);

    if (GROUP_CONSTRUCTOR_COUNT > 8) {
      /* need lots of stuff for this constructor... */
      MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &temp);

      if (rank % 3) {
	MPI_Intercomm_create (temp, 0, MPI_COMM_WORLD, 
			      (((nprocs % 3) == 2) && ((rank % 3) == 2)) ?
			      nprocs - 1 : nprocs - (rank % 3) - (nprocs % 3),
			      INTERCOMM_CREATE_TAG, &intercomm);

	MPI_Comm_remote_group (intercomm, &newgroup[8]);

	MPI_Comm_free (&intercomm);
      }
      else {
	MPI_Comm_group (temp, &newgroup[8]);
      }

      MPI_Comm_free (&temp);
    }
  }      

  MPI_Barrier (comm);

  printf ("(%d) Finished normally\n", rank);
  MPI_Finalize ();
}
Example #13
void repairComm(MPI_Comm * broken, MPI_Comm * repaired, int iteration, int * listFails, int * numFails,
                int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    MPI_Comm tempShrink, unorderIntracomm, tempIntercomm;
    int i, ret, result, procsNeeded = 0, oldRank, newRank, oldGroupSize, rankKey = 0, flag;
    int * tempRanks, * failedRanks, * errCodes, rank, hostfileLineIndex;
    MPI_Group oldGroup, failedGroup, shrinkGroup;
    int hostfileLastLineIndex, tempLineIndex, * failedNodeList = NULL, * nodeList = NULL, totNodeFailed = 0;
    double startTime = 0.0, endTime;
    int nprocs, j, * shrinkMergeList;
    char hostName[128];
    gethostname(hostName, sizeof(hostName));

    char ** appToLaunch;
    char *** argvToLaunch;
    int * procsNeededToLaunch;
    MPI_Info * hostInfoToLaunch;
    char ** hostNameToLaunch;

    MPI_Comm_rank(*broken, &rank);
    if(rank == 0)
        startTime = MPI_Wtime();

#ifndef GLOBAL_DETECTION
    MPI_Comm_size(*broken, &oldGroupSize);
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_rank(*broken, &oldRank);
    OMPI_Comm_failure_ack(*broken);
    OMPI_Comm_failure_get_acked(*broken, &failedGroup);
    MPI_Group_size(failedGroup, &procsNeeded);
    errCodes = (int *) malloc(sizeof(int) * procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    failedRanks = (int *) malloc(sizeof(int) * oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);
#endif

    double shrinkTime = MPI_Wtime();
    // Shrink the broken communicator to remove failed procs
    if(MPI_SUCCESS != (ret = OMPI_Comm_shrink(*broken, &tempShrink)))
        printf("Iteration %d: OMPI_Comm_shrink (parent): ERROR!\n", iteration);
    else {
        if(verbosity > 1 )
            printf("Iteration %d: OMPI_Comm_shrink (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_shrink takes %0.6f Sec\n", MPI_Wtime() - shrinkTime);

#ifdef GLOBAL_DETECTION
    MPI_Comm_group(*broken, &oldGroup);
    MPI_Comm_group(tempShrink, &shrinkGroup);
    MPI_Comm_size(*broken, &oldGroupSize);

    MPI_Group_compare(oldGroup, shrinkGroup, &result);

    if(result != MPI_IDENT)
        MPI_Group_difference(oldGroup, shrinkGroup, &failedGroup);

    MPI_Comm_rank(*broken, &oldRank);
    MPI_Group_size(failedGroup, &procsNeeded);

    errCodes = (int *) malloc(sizeof(int)*procsNeeded);

    // Figure out ranks of the processes which had failed
    tempRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    failedRanks = (int*)malloc(sizeof(int)*oldGroupSize);
    #pragma omp parallel for default(shared)
    for(i = 0; i < oldGroupSize; i++)
        tempRanks[i] = i;

    MPI_Group_translate_ranks(failedGroup, procsNeeded, tempRanks, oldGroup, failedRanks);

    MPI_Group_free(&shrinkGroup);
#endif

    // Assign number of failed processes
    *numFails = procsNeeded;

    hostNameToLaunch = (char **) malloc(procsNeeded * sizeof(char *));

    if(verbosity > 0 && rank == 0)
        printf("*** Iteration %d: Application: Number of process(es) failed in the corresponding "
               "communicator is %d ***\n", iteration, procsNeeded);

    if(rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Creating failed process list takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);
    }

#ifdef RECOV_ON_SPARE_NODES
    // Determining total number of node failed, and a list of them
    hostfileLastLineIndex = getHostfileLastLineIndex(); //started from 0
    nodeList = (int *) malloc((hostfileLastLineIndex+1) * sizeof(int));
    memset(nodeList, 0, (hostfileLastLineIndex+1)*sizeof(int)); // initialize nodeList with 0's

    for(int i = 0; i < procsNeeded; ++i) {
        tempLineIndex = failedRanks[i]/SLOTS; //started from 0
        nodeList[tempLineIndex] = 1;
    }

    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter)
        totNodeFailed += nodeList[nodeCounter];
    *numNodeFails = totNodeFailed;

    // Check if there is sufficient spare node available for recovery
    if((hostfileLastLineIndex - totNodeFailed -sumPrevNumNodeFails) < (oldGroupSize-1)/SLOTS) {
        if(rank == 0)
            printf("[%d] There is no sufficient spare node available for recovery.\n", rank);
        exit(0);
    }

    failedNodeList = (int *) malloc(totNodeFailed * sizeof(int));
    memset(failedNodeList, 0, totNodeFailed * sizeof(int)); // initialize failedNodeList with 0's

    int failedNodeCounter = 0;
    for(int nodeCounter = 0; nodeCounter < (hostfileLastLineIndex+1); ++nodeCounter) {
        if(nodeList[nodeCounter] == 1)
            failedNodeList[failedNodeCounter++] = nodeCounter;
    }
#endif

    char * hostNameFailed = NULL;
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; ++i) {
        // Assign list of processes failed
        listFails[i] = failedRanks[i];

#ifdef RUN_ON_COMPUTE_NODES
        tempLineIndex = failedRanks[i]/SLOTS; //started from 0
#ifdef RECOV_ON_SPARE_NODES
        for(int j = 0; j < totNodeFailed; ++j) {
            if(failedNodeList[j] == tempLineIndex)
                hostfileLineIndex = hostfileLastLineIndex - j - sumPrevNumNodeFails;
        }
#else // Recovery on the same node (no node failure, only process failure)
        hostfileLineIndex = tempLineIndex;
#endif
        hostNameToLaunch[i] = getHostToLaunch(hostfileLineIndex);
        hostNameFailed = getHostToLaunch(tempLineIndex);
#else // Run on head node or personal machine
        hostNameToLaunch[i] = (char *)hostName;
        hostNameFailed = (char *)hostName;
#endif

        if(verbosity > 0 && rank == 0)
            printf("--- Iteration %d: Application: Process %d on node %s is failed! ---\n", iteration, failedRanks[i], hostNameFailed);
    }
    // Release memory of hostNameFailed
    free(hostNameFailed);

    appToLaunch = (char **) malloc(procsNeeded * sizeof(char *));
    argvToLaunch = (char ***) malloc(procsNeeded * sizeof(char **));
    procsNeededToLaunch = (int *) malloc(procsNeeded * sizeof(int));
    hostInfoToLaunch = (MPI_Info *) malloc(procsNeeded * sizeof(MPI_Info));
    argv[argc] = NULL;
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++) {
        appToLaunch[i] = (char *)argv[0];
        argvToLaunch[i] = (char **)argv;
        procsNeededToLaunch[i] = 1;
        // Host information where to spawn the processes
        MPI_Info_create(&hostInfoToLaunch[i]);
        MPI_Info_set(hostInfoToLaunch[i], (char *)"host", hostNameToLaunch[i]);
        //MPI_Info_set(hostInfoToLaunch[i], "hostfile", "hostfile");
    }

    double spawnTime = MPI_Wtime();
#ifdef HANG_ON_REMOVE
    OMPI_Comm_agree(tempShrink, &flag);
#endif
    // Spawn the new process(es)
    if(MPI_SUCCESS != (ret = MPI_Comm_spawn_multiple(procsNeeded, appToLaunch, argvToLaunch, procsNeededToLaunch,
                             hostInfoToLaunch, 0, tempShrink, &tempIntercomm, MPI_ERRCODES_IGNORE))) {
        free(tempRanks);
        free(failedRanks);
        free(errCodes);
        if(MPI_ERR_COMM  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid communicator (parent)\n", iteration);
        if(MPI_ERR_ARG  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid argument (parent)\n", iteration);
        if(MPI_ERR_INFO  == ret)
            printf("Iteration %d: MPI_Comm_spawn_multiple: Invalid info (parent)\n", iteration);

        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            OMPI_Comm_revoke(tempShrink);
            return repairComm(broken, repaired, iteration, listFails, numFails, numNodeFails,
                              sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_spawn_multiple (parent): %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 0 && rank == 0) {
            for(i = 0; i < procsNeeded; i++)
                printf("Iteration %d: MPI_Comm_spawn_multiple (parent) [spawning failed process %d on "
                       "node %s]: SUCCESS\n", iteration, failedRanks[i], hostNameToLaunch[i]);
        }
        // Memory release. Moving the last two to the end of the function causes segmentation faults when 4 processes fail
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Comm_spawn_multiple takes %0.6f Sec\n", MPI_Wtime() - spawnTime);

    double mergeTime = MPI_Wtime();
    // Merge the new processes into a new communicator
    if(MPI_SUCCESS != (ret = MPI_Intercomm_merge(tempIntercomm, false, &unorderIntracomm))) {
        free(tempRanks);
        free(failedRanks);
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(tempIntercomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else if(MPI_ERR_COMM == ret) {
            fprintf(stderr, "Iteration %d: Invalid communicator in MPI_Intercomm_merge (parent) %d\n", iteration, ret);
            exit(1);
        }
        else if(MPI_ERR_INTERN == ret) {
            fprintf(stderr, "Iteration %d: Acquaring memory error in MPI_Intercomm_merge ()%d\n", iteration, ret);
            exit(1);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Intercomm_merge: %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Intercomm_merge (parent): SUCCESS\n", iteration);
    }
    if (verbosity > 0 && rank == 0)
        printf("MPI_Intercomm_merge takes %0.6f Sec\n", MPI_Wtime() - mergeTime);

    double agreeTime = MPI_Wtime();
    // Synchronize; this sometimes hangs without it.
    // The position of this code and the use of the intercommunicator (not the intracommunicator) are important.
#ifdef HANG_ON_REMOVE
    //MPI_Barrier(tempIntercomm);
    OMPI_Comm_agree(tempIntercomm, &flag);// since some of the times MPI_Barrier hangs
#endif
    if (verbosity > 0 && rank == 0)
        printf("OMPI_Comm_agree takes %0.6f Sec\n", MPI_Wtime() - agreeTime);

    // Sending the failed ranks and the number of failed processes to the newly created ranks.
    // oldGroupSize is the size of communicator before failure.
    // procsNeeded is the number of processes that are failed
    int * child = (int *) malloc(procsNeeded*sizeof(int));
    #pragma omp parallel for default(shared)
    for(i = 0; i < procsNeeded; i++)
        child[i] = oldGroupSize - procsNeeded + i;

    MPI_Comm_rank(unorderIntracomm, &newRank);
    if(newRank == 0) {
        int send_val[2];
        for(i = 0; i < procsNeeded; i++) {
            send_val[0] = failedRanks[i];
            send_val[1] = procsNeeded;
            if(MPI_SUCCESS != (ret = MPI_Send(&send_val, 2, MPI_INT, child[i], MERGE_TAG, unorderIntracomm))) {
                free(tempRanks);
                free(failedRanks);
                if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
                    // Start the recovery over again if there is a failure
                    OMPI_Comm_revoke(unorderIntracomm);
                    return repairComm(broken, repaired, iteration, listFails, numFails,
                                      numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
                }
                else {
                    fprintf(stderr, "Iteration %d: Unknown error with MPI_Send1 (parent): %d\n", iteration, ret);
                    exit(1);
                }
            }
            else {
                if(verbosity > 1 )
                    printf("Iteration %d: MPI_Send1 (parent): SUCCESS\n", iteration);
            }
        }
    }

    // Split the current world (splitted from original) to order the ranks.
    MPI_Comm_rank(unorderIntracomm, &newRank);
    MPI_Comm_size(unorderIntracomm, &nprocs);

    // For one or more process failure (ordering)
    shrinkMergeList = (int *) malloc(nprocs*sizeof(int));

    j = 0;
    for(i = 0; i < nprocs; i++) {
        if(rankIsNotOnFailedList(i, failedRanks, procsNeeded))
            shrinkMergeList[j++] = i;
    }

    for(i = j; i < nprocs; i++)
        shrinkMergeList[i] = failedRanks[i-j];

    for(i = 0; i < (nprocs - procsNeeded); i++) {
        if(newRank == i)
            rankKey = shrinkMergeList[i];
    }

    if(MPI_SUCCESS != (ret = MPI_Comm_split(unorderIntracomm, 0, rankKey, repaired))) {
        if((MPI_ERR_PROC_FAILED == ret) || (MPI_ERR_REVOKED == ret)) {
            // Start the recovery over again if there is a failure
            OMPI_Comm_revoke(unorderIntracomm);
            return repairComm(broken, repaired, iteration, listFails, numFails,
                              numNodeFails, sumPrevNumNodeFails, argc, argv, verbosity);
        }
        else {
            fprintf(stderr, "Iteration %d: Unknown error with MPI_Comm_split (parent): %d\n", iteration, ret);
            exit(1);
        }
    }
    else {
        if(verbosity > 1 )
            printf("Iteration %d: MPI_Comm_split (parent): SUCCESS\n", iteration);
    }

    // Release memory
    free(appToLaunch);
    free(argvToLaunch);
    free(procsNeededToLaunch);
    free(hostInfoToLaunch);
    free(hostNameToLaunch);
    free(shrinkMergeList);
    free(errCodes);
    MPI_Comm_free(&tempShrink);
    free(tempRanks);
    free(failedRanks);
    free(child);
    MPI_Group_free(&failedGroup);
    MPI_Group_free(&oldGroup);
    MPI_Comm_free(&tempIntercomm);
    MPI_Comm_free(&unorderIntracomm);
#ifdef RECOV_ON_SPARE_NODES
    if(failedNodeList != NULL)
        free(failedNodeList);
    if(nodeList != NULL)
        free(nodeList);
#endif
}//repairComm()
Example #14
File: MPI-api.c  Project: 8l/rose
void declareBindings (void)
{
  /* === Point-to-point === */
  void* buf;
  int count;
  MPI_Datatype datatype;
  int dest;
  int tag;
  MPI_Comm comm;
  MPI_Send (buf, count, datatype, dest, tag, comm); // L12
  int source;
  MPI_Status status;
  MPI_Recv (buf, count, datatype, source, tag, comm, &status); // L15
  MPI_Get_count (&status, datatype, &count);
  MPI_Bsend (buf, count, datatype, dest, tag, comm);
  MPI_Ssend (buf, count, datatype, dest, tag, comm);
  MPI_Rsend (buf, count, datatype, dest, tag, comm);
  void* buffer;
  int size;
  MPI_Buffer_attach (buffer, size); // L22
  MPI_Buffer_detach (buffer, &size);
  MPI_Request request;
  MPI_Isend (buf, count, datatype, dest, tag, comm, &request); // L25
  MPI_Ibsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Issend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irecv (buf, count, datatype, source, tag, comm, &request);
  MPI_Wait (&request, &status);
  int flag;
  MPI_Test (&request, &flag, &status); // L32
  MPI_Request_free (&request);
  MPI_Request* array_of_requests;
  int index;
  MPI_Waitany (count, array_of_requests, &index, &status); // L36
  MPI_Testany (count, array_of_requests, &index, &flag, &status);
  MPI_Status* array_of_statuses;
  MPI_Waitall (count, array_of_requests, array_of_statuses); // L39
  MPI_Testall (count, array_of_requests, &flag, array_of_statuses);
  int incount;
  int outcount;
  int* array_of_indices;
  MPI_Waitsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L44--45
  MPI_Testsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L46--47
  MPI_Iprobe (source, tag, comm, &flag, &status); // L48
  MPI_Probe (source, tag, comm, &status);
  MPI_Cancel (&request);
  MPI_Test_cancelled (&status, &flag);
  MPI_Send_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Bsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Ssend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Rsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Recv_init (buf, count, datatype, source, tag, comm, &request);
  MPI_Start (&request);
  MPI_Startall (count, array_of_requests);
  void* sendbuf;
  int sendcount;
  MPI_Datatype sendtype;
  int sendtag;
  void* recvbuf;
  int recvcount;
  MPI_Datatype recvtype;
  int recvtag; // tags are plain ints, not datatypes
  MPI_Sendrecv (sendbuf, sendcount, sendtype, dest, sendtag,
		recvbuf, recvcount, recvtype, source, recvtag,
		comm, &status); // L67--69
  MPI_Sendrecv_replace (buf, count, datatype, dest, sendtag, source, recvtag,
			comm, &status); // L70--71
  MPI_Datatype oldtype;
  MPI_Datatype newtype;
  MPI_Type_contiguous (count, oldtype, &newtype); // L74
  int blocklength;
  {
    int stride;
    MPI_Type_vector (count, blocklength, stride, oldtype, &newtype); // L78
  }
  {
    MPI_Aint stride;
    MPI_Type_hvector (count, blocklength, stride, oldtype, &newtype); // L82
  }
  int* array_of_blocklengths;
  {
    int* array_of_displacements;
    MPI_Type_indexed (count, array_of_blocklengths, array_of_displacements,
		      oldtype, &newtype); // L87--88
  }
  {
    MPI_Aint* array_of_displacements;
    MPI_Type_hindexed (count, array_of_blocklengths, array_of_displacements,
                       oldtype, &newtype); // L92--93
    MPI_Datatype* array_of_types;
    MPI_Type_struct (count, array_of_blocklengths, array_of_displacements,
                     array_of_types, &newtype); // L95--96
  }
  void* location;
  MPI_Aint address;
  MPI_Address (location, &address); // L100
  MPI_Aint extent;
  MPI_Type_extent (datatype, &extent); // L102
  MPI_Type_size (datatype, &size);
  MPI_Aint displacement;
  MPI_Type_lb (datatype, &displacement); // L105
  MPI_Type_ub (datatype, &displacement);
  MPI_Type_commit (&datatype);
  MPI_Type_free (&datatype);
  MPI_Get_elements (&status, datatype, &count);
  void* inbuf;
  void* outbuf;
  int outsize;
  int position;
  MPI_Pack (inbuf, incount, datatype, outbuf, outsize, &position, comm); // L114
  int insize;
  MPI_Unpack (inbuf, insize, &position, outbuf, outcount, datatype,
	      comm); // L116--117
  MPI_Pack_size (incount, datatype, comm, &size);

  /* === Collectives === */
  MPI_Barrier (comm); // L121
  int root;
  MPI_Bcast (buffer, count, datatype, root, comm); // L123
  MPI_Gather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
	      root, comm); // L124--125
  int* recvcounts;
  int* displs;
  MPI_Gatherv (sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
	       root, comm); // L128--130
  MPI_Scatter (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
               root, comm); // L131--132
  int* sendcounts;
  MPI_Scatterv (sendbuf, sendcounts, displs, sendtype,
		recvbuf, recvcount, recvtype, root, comm); // L134--135
  MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
                 comm); // L136--137
  MPI_Allgatherv (sendbuf, sendcount, sendtype,
		  recvbuf, recvcounts, displs, recvtype,
		  comm); // L138--140
  MPI_Alltoall (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
		comm); // L141--142
  int* sdispls;
  int* rdispls;
  MPI_Alltoallv (sendbuf, sendcounts, sdispls, sendtype,
                 recvbuf, recvcounts, rdispls, recvtype,
		 comm); // L145--147
  MPI_Op op;
  MPI_Reduce (sendbuf, recvbuf, count, datatype, op, root, comm); // L149
#if 0
  MPI_User_function function;
  int commute;
  MPI_Op_create (function, commute, &op); // L153
#endif
  MPI_Op_free (&op); // L155
  MPI_Allreduce (sendbuf, recvbuf, count, datatype, op, comm);
  MPI_Reduce_scatter (sendbuf, recvbuf, recvcounts, datatype, op, comm);
  MPI_Scan (sendbuf, recvbuf, count, datatype, op, comm);

  /* === Groups, contexts, and communicators === */
  MPI_Group group;
  MPI_Group_size (group, &size); // L162
  int rank;
  MPI_Group_rank (group, &rank); // L164
  MPI_Group group1;
  int n;
  int* ranks1;
  MPI_Group group2;
  int* ranks2;
  MPI_Group_translate_ranks (group1, n, ranks1, group2, ranks2); // L170
  int result;
  MPI_Group_compare (group1, group2, &result); // L172
  MPI_Group newgroup;
  MPI_Group_union (group1, group2, &newgroup); // L174
  MPI_Group_intersection (group1, group2, &newgroup);
  MPI_Group_difference (group1, group2, &newgroup);
  int* ranks;
  MPI_Group_incl (group, n, ranks, &newgroup); // L178
  MPI_Group_excl (group, n, ranks, &newgroup);
  extern int ranges[][3];
  MPI_Group_range_incl (group, n, ranges, &newgroup); // L181
  MPI_Group_range_excl (group, n, ranges, &newgroup);
  MPI_Group_free (&group);
  MPI_Comm_size (comm, &size);
  MPI_Comm_rank (comm, &rank);
  MPI_Comm comm1;
  MPI_Comm comm2;
  MPI_Comm_compare (comm1, comm2, &result);
  MPI_Comm newcomm;
  MPI_Comm_dup (comm, &newcomm);
  MPI_Comm_create (comm, group, &newcomm);
  int color;
  int key;
  MPI_Comm_split (comm, color, key, &newcomm); // L194
  MPI_Comm_free (&comm);
  MPI_Comm_test_inter (comm, &flag);
  MPI_Comm_remote_size (comm, &size);
  MPI_Comm_remote_group (comm, &group);
  MPI_Comm local_comm;
  int local_leader;
  MPI_Comm peer_comm;
  int remote_leader;
  MPI_Comm newintercomm;
  MPI_Intercomm_create (local_comm, local_leader, peer_comm, remote_leader, tag,
			&newintercomm); // L204--205
  MPI_Comm intercomm;
  MPI_Comm newintracomm;
  int high;
  MPI_Intercomm_merge (intercomm, high, &newintracomm); // L209
  int keyval;
#if 0
  MPI_Copy_function copy_fn;
  MPI_Delete_function delete_fn;
  void* extra_state;
  MPI_Keyval_create (copy_fn, delete_fn, &keyval, extra_state); // L215
#endif
  MPI_Keyval_free (&keyval); // L217
  void* attribute_val;
  MPI_Attr_put (comm, keyval, attribute_val); // L219
  MPI_Attr_get (comm, keyval, attribute_val, &flag);
  MPI_Attr_delete (comm, keyval);

  /* === Environmental inquiry === */
  char* name;
  int resultlen;
  MPI_Get_processor_name (name, &resultlen); // L226
  MPI_Errhandler errhandler;
#if 0
  MPI_Handler_function function;
  MPI_Errhandler_create (function, &errhandler); // L230
#endif
  MPI_Errhandler_set (comm, errhandler); // L232
  MPI_Errhandler_get (comm, &errhandler);
  MPI_Errhandler_free (&errhandler);
  int errorcode;
  char* string;
  MPI_Error_string (errorcode, string, &resultlen); // L237
  int errorclass;
  MPI_Error_class (errorcode, &errorclass); // L239
  MPI_Wtime ();
  MPI_Wtick ();
  int argc;
  char** argv;
  MPI_Init (&argc, &argv); // L244
  MPI_Finalize ();
  MPI_Initialized (&flag);
  MPI_Abort (comm, errorcode);
}
Example #15
FC_FUNC( mpi_group_difference, MPI_GROUP_DIFFERENCE )
     (int *group1, int *group2, int *newgroup, int *ierror)
{
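  /* MPI_Group is an integer handle in this implementation, so the Fortran
     handles can be passed straight through to the C routine */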
  *ierror= MPI_Group_difference(*group1,*group2,newgroup);
}