Example #1
int main( int argc , char ** argv )
{
  int comm_rank = 0 ;

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Init( & argc , & argv );
  MPI_Comm comm = MPI_COMM_WORLD ;
  MPI_Comm_rank( comm , & comm_rank );
#else
  MPI_Comm comm = 0 ;
  (void) comm ; // suppress warning
#endif

  int cmdline[ CMD_COUNT ] ;

  for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;

  if ( 0 == comm_rank ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
        cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
      }
      else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
        cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
      }
      else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
        sscanf( argv[++i] , "%dx%d" ,
                cmdline + CMD_USE_NUMA ,
                cmdline + CMD_USE_CORE_PER_NUMA );
      }
      else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
        cmdline[ CMD_USE_CUDA ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
        cmdline[ CMD_USE_CUDA ] = 1 ;
        cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
      }
      else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
        cmdline[ CMD_USE_ROCM ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
        sscanf( argv[++i] , "%dx%dx%d" ,
                cmdline + CMD_USE_FIXTURE_X ,
                cmdline + CMD_USE_FIXTURE_Y ,
                cmdline + CMD_USE_FIXTURE_Z );
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
        sscanf( argv[++i] , "%d..%d" ,
                cmdline + CMD_USE_FIXTURE_BEGIN ,
                cmdline + CMD_USE_FIXTURE_END );
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
        cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
        cmdline[ CMD_USE_ATOMIC ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
        cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
      }
      else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
        cmdline[ CMD_VTUNE ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
        cmdline[ CMD_PRINT ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
        cmdline[ CMD_ECHO ] = 1 ;
      }
      else {
        cmdline[ CMD_ERROR ] = 1 ;

        std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
      }
    }

    if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
  }
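  // Hypothetical invocation (the executable name is an assumption), using the
  // keywords parsed above:
  //   ./fenl threads 16 fixture 2x2x2 trials 5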

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
#endif

  if ( cmdline[ CMD_VTUNE ] ) {
    std::stringstream cmd;
    pid_t my_os_pid=getpid();
    const std::string vtune_loc =
      "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
    const std::string output_dir = "./vtune/vtune.";
    const int p_rank = comm_rank;
    cmd << vtune_loc
        << " -collect hotspots -result-dir " << output_dir << p_rank
        << " -target-pid " << my_os_pid << " &";
    if (p_rank == 0)
      std::cout << cmd.str() << std::endl;
    system(cmd.str().c_str());
    system("sleep 10");
  }

  if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {

    if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }

    if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
      cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
      cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
      cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
    }

#if defined( KOKKOS_ENABLE_THREADS )

    if ( cmdline[ CMD_USE_THREADS ] ) {

      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
                                     cmdline[ CMD_USE_NUMA ] ,
                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
      }
      else {
        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
      }

      run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Threads::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_OPENMP )

    if ( cmdline[ CMD_USE_OPENMP ] ) {

      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
                                     cmdline[ CMD_USE_NUMA ] ,
                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
      }
      else {
        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
      }

      run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::OpenMP::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_CUDA )
    if ( cmdline[ CMD_USE_CUDA ] ) {
      // Use the last device:

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );

      run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Cuda::finalize();
      Kokkos::HostSpace::execution_space::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_ROCM )
    if ( cmdline[ CMD_USE_ROCM ] ) {
      // Use the last device:

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );

      run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Experimental::ROCm::finalize();
      Kokkos::HostSpace::execution_space::finalize();
    }

#endif

  }

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Finalize();
#endif

  return cmdline[ CMD_ERROR ] ? -1 : 0 ;
}
Example #2
int main(int argc, char** argv)
{

  int	   Numprocs, MyRank;
  int 	   NoofCols, NoofRows, VectorSize, ScatterSize;
  int	   index, irow, icol, iproc;
  int	   Root = 0, ValidOutput = 1;
  float    **Matrix, *Buffer, *Mybuffer, *Vector,
	   *MyFinalVector, *FinalVector;
  float    *CheckResultVector;
  FILE	   *fp;
  int	   MatrixFileStatus = 1, VectorFileStatus = 1;
  double    start, end, startcomm, endcomm, commtime, totalcomm;


  /* ........MPI Initialisation .......*/

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
  MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);


  if(MyRank == 0)
  {
    //creating matrix
    start = MPI_Wtime();
    VectorSize = 1024*32*4;
    NoofCols = VectorSize;
    NoofRows = VectorSize;

    Matrix = (float **)malloc(NoofRows*sizeof(float *));
    for(irow=0 ;irow<NoofRows; irow++){
      Matrix[irow] = (float *)malloc(NoofCols*sizeof(float));
      for(icol=0; icol<NoofCols; icol++) {
 	       Matrix[irow][icol] = 1;
      }
    }

    Buffer = (float *)malloc(NoofRows*NoofCols*sizeof(float));
    index = 0;
    for(irow=0; irow< NoofRows; irow++){
      for(icol=0; icol< NoofCols; icol++) {
        Buffer[index] = Matrix[irow][icol];
        index++;
      }
    }

    //creating vector

    Vector = (float*)malloc(VectorSize*sizeof(float));
    for(index = 0; index<VectorSize; index++)
         Vector[index]=1;

   }  /* end  of if myrank = 0 */


   MPI_Barrier(MPI_COMM_WORLD);

   MPI_Bcast (&MatrixFileStatus, 1, MPI_INT, Root, MPI_COMM_WORLD);
   if(MatrixFileStatus == 0) {
      if(MyRank == Root)
	 printf("Can't open input file for Matrix ..... \n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast (&VectorFileStatus, 1, MPI_INT, Root, MPI_COMM_WORLD);
   if(VectorFileStatus == 0) {
      if(MyRank == Root)
	 printf("Can't open input file for Vector ..... \n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast(&NoofRows, 1, MPI_INT, Root, MPI_COMM_WORLD);

   if(NoofRows < Numprocs) {
      if(MyRank == 0)
         printf("No of Rows should be more than No of Processors ... \n");
      MPI_Finalize();
      exit(0);
   }

   if(NoofRows % Numprocs != 0) {
      if(MyRank == 0)
	 printf("Matrix Can not be Striped Evenly ..... \n");
      MPI_Finalize();
      exit(0);
   }

   MPI_Bcast(&NoofCols, 1, MPI_INT, Root, MPI_COMM_WORLD);
   MPI_Bcast(&VectorSize, 1, MPI_INT, Root, MPI_COMM_WORLD);

   if(VectorSize != NoofCols){
      if(MyRank == 0){
         printf("Invalid input data..... \n");
	 printf("NoofCols should be equal to VectorSize\n");
      }
      MPI_Finalize();
      exit(0);
   }

   if(MyRank != 0)
      Vector = (float *)malloc(VectorSize*sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Bcast(Vector, VectorSize, MPI_FLOAT, Root, MPI_COMM_WORLD);
   commtime = MPI_Wtime() - startcomm;

   ScatterSize = NoofRows / Numprocs;
   Mybuffer = (float *)malloc(ScatterSize * NoofCols * sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Scatter( Buffer, ScatterSize * NoofCols, MPI_FLOAT, Mybuffer,
		ScatterSize * NoofCols, MPI_FLOAT, 0, MPI_COMM_WORLD);
   commtime += MPI_Wtime() - startcomm;

   MyFinalVector = (float *)malloc(ScatterSize*sizeof(float));
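   /* Local matrix-vector product: each rank multiplies its strip of
      ScatterSize rows against the full (broadcast) vector. */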

   for(irow = 0 ; irow < ScatterSize ; irow++) {
       MyFinalVector[irow] = 0;
       index = irow * NoofCols;
       for(icol = 0; icol < NoofCols; icol++)
	   MyFinalVector[irow] += (Mybuffer[index++] * Vector[icol]);
   }

   if( MyRank == 0)
      FinalVector = (float *)malloc(NoofRows*sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Gather( MyFinalVector, ScatterSize, MPI_FLOAT, FinalVector,
	       ScatterSize, MPI_FLOAT, Root, MPI_COMM_WORLD);
   commtime += MPI_Wtime() - startcomm;

   MPI_Reduce(&commtime, &totalcomm, 1, MPI_DOUBLE, MPI_MAX, Root, MPI_COMM_WORLD);

   if( MyRank == 0) {
     end = MPI_Wtime();

     printf("Processor : %d\n",Numprocs);
     printf("Size : %d\n",VectorSize);
     printf("All Time : %f\n",end-start);
     printf("Comm Time : %f\n",totalcomm);
     //printf("Comm Time : %f\n",totalcomm);
     /*
      printf ("\n");
      printf(" --------------------------------------------------- \n");
      printf("Results of Gathering data  %d: \n", MyRank);
      printf("\n");

      for(index = 0; index < NoofRows; index++)
        printf(" FinalVector[%d] = %f \n", index, FinalVector[index]);
      printf(" --------------------------------------------------- \n");
      */
   }

   if(MyRank == 0){
      CheckResultVector = (float *)malloc(NoofRows*sizeof(float));
      for(irow = 0 ; irow < NoofRows ; irow++) {
	  CheckResultVector[irow] = 0;
	  for(icol = 0; icol < NoofCols; icol++){
	      CheckResultVector[irow] += (Matrix[irow][icol]*Vector[icol]);
	  }
	  if(fabs((double)(FinalVector[irow]-CheckResultVector[irow])) >
							    1.0E-10){
	     printf("Error %d\n",irow);
	     ValidOutput = 0;
	  }
       }
       free(CheckResultVector);
       if(ValidOutput)
	  printf("\n-------Correct Result------\n");
   }

   MPI_Finalize();

}
Example #3
/* ************************************************************************ */
int
main (int argc, char** argv)
{
    int rank;
    int size;
    int rest;
    int from, to;

    MPI_Init(&argc, &argv);

    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    struct options options;
    struct calculation_arguments arguments;
    struct calculation_results results;

    /* Query the parameters only once */
    if(rank == 0)
    {
        AskParams(&options, argc, argv);
    }
    MPI_Bcast(&options, (sizeof(options)), MPI_BYTE, MASTER, MPI_COMM_WORLD);

    initVariables(&arguments, &results, &options);

    /* So that allocation + initialization work correctly, size is set to 1 for Gauss-Seidel */
    if(options.method == METH_GAUSS_SEIDEL)
    {
        size = 1;
    }

    /* Split the rows evenly; the remainder is handled below */
    int N_part = arguments.N;
    int lines = N_part - 1;
    rest = lines % size;
    N_part = (lines - rest) / size;

    /* Compute the global row numbers; the remainder is taken into account here.        */
    /* The offset is (rank + 1) for rank < rest, i.e. it grows linearly with the rank.   */
    if(rank < rest)
    {
        from = N_part * rank 		+ rank + 1;
        to = N_part * (rank + 1) 	+ (rank + 1);
    }
    /* Here the offset is rest, i.e. the maximum offset from above */
    else
    {
        from = N_part * rank 		+ rest + 1;
        to = N_part * (rank + 1) 	+ rest ;
    }
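    /* Worked example (hypothetical numbers): N = 11 and size = 4 give
       lines = 10, rest = 2, N_part = 2; ranks 0 and 1 own rows 1-3 and 4-6
       (3 rows each), ranks 2 and 3 own rows 7-8 and 9-10 (2 rows each). */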
    arguments.to = to;
    arguments.from = from;


    /* in the end we need at most N - 1 processes for the calculation */
    if((unsigned int)size > (arguments.N -1))
    {
        size = (arguments.N - 1);

        if(rank == MASTER )
        {
            printf("\nWarning, you are using more processes than rows.\n This can slow down the calculation process! \n\n");
        }

    }

    //calculate Number of Rows
    arguments.numberOfRows = ((to - from + 1) > 0 ) ? (to - from + 1) : 0;

    allocateMatrices(&arguments);
    initMatrices(&arguments, &options, rank, size);

    gettimeofday(&start_time, NULL);                   /*  start timer         */


    if (options.method == METH_JACOBI )
    {
        calculateJacobi(&arguments, &results, &options, rank, size);
    }
    else
    {
        /* Gauss-Seidel is computed by MASTER only */
        if(rank == MASTER)
        {
            printf("\nGS wird nur sequentiell berechnet! \n");
            calculate(&arguments, &results, &options);
        }
    }

    gettimeofday(&comp_time, NULL);                   /*  stop timer          */

    /* only once */
    if(rank == MASTER)
    {
        displayStatistics(&arguments, &results, &options, size);
    }

    /* Gauss-Seidel uses the old output routine */
    if((options.method == METH_GAUSS_SEIDEL) && (rank == MASTER))
    {
        DisplayMatrix(&arguments, &results, &options);
    }
    else
    {
        DisplayMatrixMPI(&arguments, &results, &options, rank, size, from, to);
    }

    freeMatrices(&arguments);                                       /*  free memory     */

    MPI_Finalize();
    return 0;
}
Example #4
/* This routine computes various global properties of the particle
 * distribution and stores the result in the struct `SysState'.
 * Currently, not all the information that's computed here is 
 * actually used (e.g. momentum is not really used anywhere),
 * just the energies are written to a log-file every once in a while.
 */
void compute_global_quantities_of_system(void)
{
  int    i, j, n;
  struct state_of_system sys;

  for(n=0; n<5; n++)   
    { 
       sys.MassComp[n]= sys.EnergyKinComp[n]= sys.EnergyPotComp[n]= sys.EnergyIntComp[n]= 0; 
       
       for(j=0;j<4;j++) 
	 sys.CenterOfMassComp[n][j]= sys.MomentumComp[n][j] = sys.AngMomentumComp[n][j] = 0; 
    } 

  for(i=1; i<=NumPart; i++) 
    { 
      sys.MassComp[P[i].Type] += P[i].Mass; 

      sys.EnergyPotComp[P[i].Type] += 0.5*P[i].Mass*P[i].Potential; 
      
      if(P[i].Type==0)
	sys.EnergyIntComp[0] += P[i].Mass * SphP[i].EgySpecPred; 
	
      sys.EnergyKinComp[P[i].Type] += 0.5*P[i].Mass*(P[i].VelPred[0]*P[i].VelPred[0] +
						       P[i].VelPred[1]*P[i].VelPred[1] + 
						       P[i].VelPred[2]*P[i].VelPred[2]); 
      
      for(j=0; j<3; j++) 
	{ 
	  sys.MomentumComp[P[i].Type][j] += P[i].Mass* P[i].VelPred[j]; 
	  sys.CenterOfMassComp[P[i].Type][j] += P[i].Mass*P[i].PosPred[j]; 
	} 
      
      sys.AngMomentumComp[P[i].Type][0] += P[i].Mass* ( P[i].PosPred[1]*P[i].VelPred[2] -
							  P[i].PosPred[2]*P[i].VelPred[1] ); 
      sys.AngMomentumComp[P[i].Type][1] += P[i].Mass* ( P[i].PosPred[2]*P[i].VelPred[0] -
							  P[i].PosPred[0]*P[i].VelPred[2] ); 
      sys.AngMomentumComp[P[i].Type][2] += P[i].Mass* ( P[i].PosPred[0]*P[i].VelPred[1] -
							  P[i].PosPred[1]*P[i].VelPred[0] ); 
    } 


  /* sum the stuff over all processors */
  MPI_Reduce(&sys.MassComp[0], &SysState.MassComp[0], 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.EnergyPotComp[0], &SysState.EnergyPotComp[0], 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.EnergyIntComp[0], &SysState.EnergyIntComp[0], 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.EnergyKinComp[0], &SysState.EnergyKinComp[0], 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.MomentumComp[0][0], &SysState.MomentumComp[0][0], 5*4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.AngMomentumComp[0][0], &SysState.AngMomentumComp[0][0], 5*4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&sys.CenterOfMassComp[0][0], &SysState.CenterOfMassComp[0][0], 5*4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);


  if(ThisTask==0)
    {
      for(i=0;i<5;i++) 
	  SysState.EnergyTotComp[i] = SysState.EnergyKinComp[i] + 
	                              SysState.EnergyPotComp[i] + 
                                      SysState.EnergyIntComp[i];
   
      SysState.Mass= SysState.EnergyKin= SysState.EnergyPot= SysState.EnergyInt= SysState.EnergyTot=0; 

      for(j=0; j<3; j++) 
	SysState.Momentum[j]= SysState.AngMomentum[j]= SysState.CenterOfMass[j]= 0; 
 
      for(i=0; i<5; i++) 
	{ 
	  SysState.Mass += SysState.MassComp[i]; 
	  SysState.EnergyKin += SysState.EnergyKinComp[i]; 
	  SysState.EnergyPot += SysState.EnergyPotComp[i]; 
	  SysState.EnergyInt += SysState.EnergyIntComp[i]; 
	  SysState.EnergyTot += SysState.EnergyTotComp[i]; 

	  for(j=0; j<3; j++) 
	    { 
	      SysState.Momentum[j] += SysState.MomentumComp[i][j]; 
	      SysState.AngMomentum[j] += SysState.AngMomentumComp[i][j]; 
	      SysState.CenterOfMass[j] += SysState.CenterOfMassComp[i][j]; 
	    } 
	} 
      
      for(i=0; i<5; i++) 
	for(j=0; j<3; j++) 
	  if(SysState.MassComp[i] > 0) 
	    SysState.CenterOfMassComp[i][j] /= SysState.MassComp[i]; 
      
      for(j=0; j<3; j++) 
	if(SysState.Mass > 0) 
	  SysState.CenterOfMass[j] /= SysState.Mass; 

      for(i=0; i<5; i++) 
	{ 
	  SysState.CenterOfMassComp[i][3]= SysState.MomentumComp[i][3]= SysState.AngMomentumComp[i][3]=0; 
	  for(j=0; j<3; j++) 
	    { 
	      SysState.CenterOfMassComp[i][3] += SysState.CenterOfMassComp[i][j]*SysState.CenterOfMassComp[i][j]; 
	      SysState.MomentumComp[i][3] += SysState.MomentumComp[i][j]*SysState.MomentumComp[i][j]; 
	      SysState.AngMomentumComp[i][3] += SysState.AngMomentumComp[i][j]*SysState.AngMomentumComp[i][j]; 
	    } 
	  SysState.CenterOfMassComp[i][3]= sqrt(SysState.CenterOfMassComp[i][3]); 
	  SysState.MomentumComp[i][3]= sqrt(SysState.MomentumComp[i][3]); 
	  SysState.AngMomentumComp[i][3]= sqrt(SysState.AngMomentumComp[i][3]); 
	} 
      
      SysState.CenterOfMass[3]= SysState.Momentum[3]= SysState.AngMomentum[3]= 0; 
      
      for(j=0; j<3; j++) 
	{ 
	  SysState.CenterOfMass[3] += SysState.CenterOfMass[j]*SysState.CenterOfMass[j]; 
	  SysState.Momentum[3] += SysState.Momentum[j]*SysState.Momentum[j]; 
	  SysState.AngMomentum[3] += SysState.AngMomentum[j]*SysState.AngMomentum[j]; 
	} 
      
      SysState.CenterOfMass[3]= sqrt(SysState.CenterOfMass[3]); 
      SysState.Momentum[3]= sqrt(SysState.Momentum[3]); 
      SysState.AngMomentum[3]= sqrt(SysState.AngMomentum[3]); 
    }

  /* give everyone the result, maybe they want to do something with it */
  MPI_Bcast(&SysState, sizeof(struct state_of_system), MPI_BYTE, 0,  MPI_COMM_WORLD);
}
Example #5
int main(int argc, char *argv[])
{
	int m, n, c, iters, i, j;
	int my_m, my_n, my_rank, num_procs, size;
	float kappa;
	image u, u_bar;
	unsigned char *image_chars;
	char *input_jpeg_filename, *output_jpeg_filename;
	int *sendcounts, *displs, *recvcounts;

	printf("Now in main program\n");

	MPI_Init (&argc, &argv);
	MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size (MPI_COMM_WORLD, &num_procs);

	/* Allocate these only after num_procs is known. */
	sendcounts = (int*)malloc(num_procs*sizeof(int));
	displs = (int*)malloc(num_procs*sizeof(int));
	recvcounts = (int*)malloc(num_procs*sizeof(int));

	/* read from command line: kappa, iters, input_jpeg_filename, output_jpeg_filename */
	kappa = atof(argv[1]);
	iters = atoi(argv[2]);
	input_jpeg_filename = argv[3];
	output_jpeg_filename = argv[4];
	/* Test that parameters are read correctly from command line: 
	printf("kappa: %f\n", kappa);
	printf("iters: %d\n", iters);
	printf("input_jpeg_filename: %s\n", input_jpeg_filename);
	printf("output_jpeg_filename: %s\n", output_jpeg_filename);
	*/

	
	if (my_rank==0){
		import_JPEG_file(input_jpeg_filename, &image_chars, &m, &n, &c);
		printf("Successfully imported JPEG image.\n");
	}

	MPI_Bcast (&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Bcast (&n, 1, MPI_INT, 0, MPI_COMM_WORLD);


	/* Divide the m x n pixels evenly among the MPI processes */
	my_m = m/num_procs;
	my_n = n;


	/* If the pixels cannot be evenly divided, the last process picks up 	*/
	/* the remainder.  														*/
	/* Each process needs the rows above and below it. 						*/
	/* The first and last process only need 1 additional row. 				*/
	if (my_rank == num_procs - 1){
		my_m += m % num_procs;
		allocate_image(&u, my_m+1, my_n);
		allocate_image(&u_bar, my_m+1, my_n);
	} else if (my_rank == 0){
		allocate_image(&u_bar, my_m+1, my_n);
	} else {
		allocate_image (&u, my_m+2, my_n);
		allocate_image (&u_bar, my_m+2, my_n);
	}

	/* Each process asks process 0 for a partitioned region */
	/* of image_chars and copies the values into u */

	if (my_rank==0){
		size = (my_m + 1)*my_n;
		sendcounts[my_rank] = size;
		displs[my_rank] = my_rank;
		displs[my_rank+1] = my_n*(my_rank*my_m - 1);
	} else if (my_rank==num_procs-1){
		size = (my_m + 1)*my_n;
		sendcounts[my_rank] = size;
	} else {
		size = (my_m + 2)*my_n;
		sendcounts[my_rank] = size;
		displs[my_rank+1] = my_n*(my_rank*my_m - 1);
	}


	
	/* Note: the buffers, counts and displacements are passed directly (not by
	   address); the receive buffer is assumed here to be the contiguous
	   storage behind u.image_data. */
	MPI_Scatterv(image_chars, sendcounts, displs, MPI_UNSIGNED_CHAR, u.image_data[0], size, MPI_UNSIGNED_CHAR,
		0, MPI_COMM_WORLD);

	/* Convert data type from unsigned char to float: */
	for (i=0; i<my_m; i++)
	{
		for (j=0; j<my_n; j++)
		{
			u.image_data[i][j] = (float)u.image_data[i][j];
		}
	}

	iso_diffusion_denoising (&u, &u_bar, kappa, iters);

	/* Each process must convert the data type in u back */
	/* to unsigned char. */
	for (i=0; i<my_m; i++)
	{
		for (j=0; j<my_n; j++)
		{
			u.image_data[i][j] = (unsigned char)u.image_data[i][j];
		}
	}

	/* Each process sends its resulting content of u to process 0 */
	/* Process 0 receives the incoming values from each process and */
	/* copies them into the designated region of image_chars */
	/* ... */


	if (my_rank==0){
		displs[my_rank] = 0;
		displs[my_rank+1] = my_rank*my_m*my_n;
	}
	size = my_m*my_n;
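	/* A minimal sketch of the elided gather step (assumptions: recvcounts and
	   displs are filled consistently on every rank and u.image_data is stored
	   contiguously); this call is not part of the original, incomplete code:

	   MPI_Gatherv(u.image_data[0], size, MPI_UNSIGNED_CHAR,
	               image_chars, recvcounts, displs, MPI_UNSIGNED_CHAR,
	               0, MPI_COMM_WORLD);
	*/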




	if (my_rank==0){
		c = 1;
		export_JPEG_file(output_jpeg_filename, image_chars, m, n, c, 75);
		printf("Successfully exported JPEG image! \n");
	}

	deallocate_image(&u);
	deallocate_image(&u_bar);

	MPI_Finalize ();

	printf("Finished the program!\n");

	return 0;
}
Example #6
File: main.c Project: damora/ivr
//-----------------------------------------------------------------------------------------------------------------------//
// main
//-----------------------------------------------------------------------------------------------------------------------//
int main(int argc,  char * argv[])
{
    int rc=0;
    int rank=0;
#ifdef MPI
    int comsize;

    // init MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &comsize);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    fprintf(stderr,"rank=%d\n", rank);
#ifdef BGQ
    install_signal_handler();
#endif
#endif

    args = parseargs(argc, argv);

    // init, initcamera, initvolattrs
    if ((rc = init(args, rank)) != 0) {
        fprintf(stderr, "initialization failed: %d\n", rc);
        return 0;
    };

    //---------------------------- Platform Specific Display and Interaction ----------------------------------------//
#if LOCAL

    resettransforms();
    mystate.alpha=0.25f;

    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
    glutInitWindowSize(WINDOWWIDTH, WINDOWHEIGHT);
    glutCreateWindow("iVR");
    glutReshapeFunc(reshape);
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mousebutton);
    glutMotionFunc(mousemotion);
//    glEnable(GL_DEPTH_TEST);
    glEnable(GL_CULL_FACE);
    glCullFace(GL_BACK);
    glDisable(GL_DEPTH_TEST);
    glutMainLoop();
#elif MPI		// BEGIN MPI CODE
#ifdef REMOTE 	// START OF REMOTE RENDERING HANDSHAKE

    // connect to relay server
    if (rank == 0) {
        sockfd = initconnection(HOSTNAME, PORT);
        if (sockfd == -1) {
            fprintf(stderr,"remote connection failed, sockfd=%d\n",sockfd);
            freeall();
            exit(0);
        }
        fprintf(stderr,"relay connection succeeded\n");
    }
    PRINTDEBUG("%3d: \n", rank);
    boolean_t connected = TRUE;
    int ctr = 0;
    while (connected) {  // main loop when steering
        fprintf(stderr,"%3d: before steering control\n", rank);
        if (rank == 0) {
            rc = recvstate(sockfd, &mystate);
            fprintf(stderr,"bytes recvd=%d\n", rc);
            if (mystate.finish) connected = 0;
            winwidth = mystate.winwidth;
            winheight = mystate.winheight;
        } // end if rank == 0 for remote visualization
        // wait until we've completed the handshake with steering client before proceeding;
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Bcast(&connected, 1, MPI_INT, 0, MPI_COMM_WORLD);
        if (!connected) break;
        MPI_Bcast((float *)&winwidth, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast((float *)&winheight, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&mystate.alpha, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(mystate.newconst, 6, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(mystate.mvm, 16, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(mystate.pm, 16, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(mystate.vv, 4, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
        float planeconst[6] = {pzero[0], pzero[1], pzero[2], pzero[3], pzero[4], pzero[5]};
        setviewport(&viewport, 0, 0, winwidth, winheight);
        setplaneconst(planeconst, mystate.newconst);
        setmvm(mvm, mystate.mvm);
        setpm(pm, mystate.pm);
        setfrustum(&viewvolume, mystate.vv);
        updatevolbbox();
        initcolbufs(rank);

        // create and display the image
        unsigned long long lstart, lstop;
        for (int i=0; i<1; i++) {
            TIME(RENDER, lstart);
            createlocalimage(rank);
            // can't composite until all ranks have completed local image generation
            MPI_Barrier(MPI_COMM_WORLD);
            compositelocalimage();
            TIME(RENDER, lstop);
            PERFORMANCE(lstart, lstop, framenum, elapsed, avg);
        }
        // wait until all ranks exit composite step before writing image
        MPI_Barrier(MPI_COMM_WORLD);
        writeimage(ctr++);
#ifdef REMOTE
    } // end while remote loop
    if (rank == 0) disconnect(sockfd);
#endif // END OF REMOTE CLIENT  HANDSHAKE


    fprintf(stderr,"%3d: finished\n", rank);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
#endif
    // clean up
    freeall();
    exit(0);
}
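Example #7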
void IOserver::start()
{
    //starting IO server (Sync line)
    
    MPI_Status status,status2;
    MPI_Request send_statut_request;
    int flag=1;
    int control;
    
    int count=0;
    int fileID;
    int fileCounter;
    int newFilenameLength;
    int len;
    bool getingDataFlag;
    int getingData[IO_ClientSize_];
    
    
    
    
    serverOn_flag=true;
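    // Protocol (as implemented below): IO rank 0 exchanges ready/control
    // messages with the compute side over syncLineComm_, and the control word
    // is then broadcast to the rest of the IO group over IO_Comm_.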
    
    while(serverOn_flag)
    {
        //cout<<count<<endl;
        
        
        ostreamFile_flag=false;
        
        if(IO_Rank_==0)
        {
            //cout<<count<<endl;
            //count++;
            
            //send a non-blocking notification that the server is free (Sync line)
            
            serverReady_flag=true;
            MPI_Isend(&serverReady_flag,1,MPI::BOOL,1,SERVER_STATE_TAG,syncLineComm_,&send_statut_request);
        
            //test whether an ostream needs to be opened or the server needs to be stopped (Sync line), blocking recv
      
            MPI_Recv(&control,1,MPI::INT,1,SERVER_CONTROL_TAG,syncLineComm_,&status);            
        }
        
        MPI_Bcast(&control,1,MPI::INT,0,IO_Comm_);
        //MPI_Bcast(&ostreamFile_flag,1,MPI::BOOL,0,IO_Comm_);
        
        if(control==CONTROL_STOP)
        {
            //if(IO_Rank_==0)cout<<"stop server"<<endl;
            serverOn_flag=false;
        }
        if(control==CONTROL_OPEN_OSTREAM)
        {
            //if(IO_Rank_==0)cout<<"ostream"<<endl;
            ostreamFile_flag=true;
            serverReady_flag=false;
            if(IO_Rank_==0)MPI_Isend(&serverReady_flag,1,MPI::BOOL,1,SERVER_STATE_TAG,syncLineComm_,&send_statut_request);
            fileCounter=0;
            
            
            while(ostreamFile_flag)
            {
                //cout<<"rank: "<<IO_Rank_<<", ostream open"<<endl;
                
                //test if there is a new file (Sync line)
                
                
                
                if(IO_Rank_==0)MPI_Recv(&control,1,MPI::INT,1,IO_FILE_CONTROL_TAG,syncLineComm_,&status);
                
                MPI_Bcast(&control,1,MPI::INT,0,IO_Comm_);
                
                if(control==CONTROL_CLOSE_OSTREAM)
                {
                    //if(IO_Rank_==0)cout<<"stop ostream"<<endl;
                    ostreamFile_flag=false;
                }
                
                if(control==CONTROL_CREATE_FILE)
                {
                    //create new file ID
                    if(fileCounter>= MAX_FILE_NUMBER)
                    {
                        if(IO_Rank_==0)cout<<"Too many files already"<<endl;
                        fileID=FILE_FAIL;
                    }
                    else
                    {
                        fileID=fileCounter;
                        
                        //if(IO_Rank_==0)cout<<"IO_server creating new file : "<<fileID<<endl;
                    }
                    
                    if(IO_Rank_==0)MPI_Ssend(&fileID,1,MPI::INT,1,IO_FILE_CONTROL_FILEID_TAG,syncLineComm_);
                    if(fileID  != FILE_FAIL)
                    {
                        if(IO_Rank_==0)
                        {
                            MPI_Probe(1,IO_FILE_CONTROL_FILENAME_TAG,syncLineComm_,&status);
                            MPI_Get_count(&status,MPI::CHAR,&newFilenameLength);
                        }
                        MPI_Bcast(&newFilenameLength,1,MPI::INT,0,IO_Comm_);
                        char * filename;
                        filename = (char*)malloc(newFilenameLength*sizeof(char));
                        if(IO_Rank_==0)MPI_Recv(filename,newFilenameLength,MPI::CHAR,1,IO_FILE_CONTROL_FILENAME_TAG,syncLineComm_,&status2);
                        MPI_Bcast(filename,newFilenameLength,MPI::CHAR,0,IO_Comm_);
                        files[fileID].filename=filename;
                        free(filename);
                        files[fileID].type=FILETYPE_UNSTRUCTURED; // no structured file implemented!!!!
                        //cout<< files[fileID].filename <<endl;
                        
                        if(fileID==0)files[fileID].data=dataBuffer;
                        else files[fileID].data=&(files[fileID-1].data[files[fileID-1].size]);
                        
                        files[fileID].size=0;
                        
                        fileCounter++;
                        
                        
                        if(files[fileID].type==FILETYPE_UNSTRUCTURED)
                        {
                            for(int i=0;i<IO_ClientSize_;i++)getingData[i]=1;
                            getingDataFlag=true;
                            
                            
                            while(getingDataFlag)
                            {
                                MPI_Iprobe(MPI_ANY_SOURCE,fileID, masterClientComm_,&flag,&status);
                                if(flag==true)
                                {
                                    char * send;
                                    send=&(files[fileID].data[files[fileID].size]);
                                    int size;
                                    MPI_Get_count(&status,MPI::CHAR,&size);
                                    MPI_Recv(send,size,MPI::CHAR,status.MPI_SOURCE,fileID,masterClientComm_,&status2);
                                    files[fileID].size+=size;
                                }
                                else
                                {
                                    MPI_Iprobe(MPI_ANY_SOURCE,IO_FILE_CONTROL_CLOSE_TAG,masterClientComm_ ,&flag,&status);
                                    if(flag==true)
                                    {
                                        //cout<<"file closed"<<endl;  
                                        int tempFileID;
                                        int total=0;
                                        int source = status.MPI_SOURCE;
                                        //cout<<source<<endl;
                                        MPI_Recv(&tempFileID,1,MPI::INT,source,IO_FILE_CONTROL_CLOSE_TAG,masterClientComm_,&status2);
                                        //cout<<source<<endl;
                                        getingData[source-1]=0;
                                        for(int i=0;i<IO_ClientSize_;i++)total+=getingData[i];
                                        if(total==0)getingDataFlag=false;
                                        //cout<<total<<endl;
                                    }
                                    
                                }
                            }
                            //cout<<"file closed"<<endl;
                            MPI_Barrier(IO_Comm_);
                        }
                        MPI_Barrier(IO_Comm_);
                        
                        
                        
                    }
                    
                }
                
            }//close stream and write

            
            
            if(fileCounter!=0)
            {
                for(int i=0;i < fileCounter;i++)
                {
                    //cout<<"writing file: "<< i<<endl;
                    
                    MPI_File ofile;
                    
                    long file_Offset=0;
                    long temp_long=0;
                    string str_fname;
                    str_fname = files[i].filename + int2string(IO_Node_,999)+".dat";
                    char * fname = &(str_fname[0]);
                    
                    for(int k=0;k<(IO_NodeSize_-1); k++)
                    {
                        if(IO_NodeRank_==k)
                        {
                            temp_long = file_Offset + files[i].size;
                            MPI_Send(&temp_long,1,MPI_LONG, k+1 , 0, IO_NodeComm_ );
                        }
                        if(IO_NodeRank_==k+1)
                        {
                            MPI_Recv( &file_Offset, 1, MPI_LONG, k, 0, IO_NodeComm_, &status);
                        }
                    }
                    
                    if(files[i].size!=0)
                    {
                        MPI_File_open(IO_NodeComm_,fname,MPI_MODE_WRONLY | MPI_MODE_CREATE,MPI_INFO_NULL,&ofile);
                        MPI_File_set_view(ofile,file_Offset,MPI_CHAR,MPI_CHAR,(char*)"native",MPI_INFO_NULL);
                        MPI_File_write_all(ofile,files[i].data,files[i].size,MPI_CHAR,&status);
                        MPI_File_close(&ofile);
                    } 
                }
            }
            fileCounter=0;
            
            MPI_Barrier(IO_Comm_);
        
        
        }
        
        
                

        
    }
}
Example #8
PetscErrorCode MatLoad_AIJ_HDF5(Mat mat, PetscViewer viewer)
{
  PetscViewerFormat format;
  const PetscInt  *i_glob = NULL;
  PetscInt        *i = NULL;
  const PetscInt  *j = NULL;
  const PetscScalar *a = NULL;
  const char      *a_name = NULL, *i_name = NULL, *j_name = NULL, *mat_name = NULL, *c_name = NULL;
  PetscInt        p, m, M, N;
  PetscInt        bs = mat->rmap->bs;
  PetscBool       flg;
  IS              is_i = NULL, is_j = NULL;
  Vec             vec_a = NULL;
  PetscLayout     jmap = NULL;
  MPI_Comm        comm;
  PetscMPIInt     rank, size;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
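  /* Expected HDF5 layout, as read below: the dataset named i_name holds the CSR
     row pointer of length M+1 (its last entry is the global nonzero count),
     j_name holds the column indices, a_name the values, and the attribute
     c_name gives the global number of columns N. */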
  ierr = PetscViewerGetFormat(viewer, &format);CHKERRQ(ierr);
  switch (format) {
    case PETSC_VIEWER_HDF5_PETSC:
    case PETSC_VIEWER_DEFAULT:
    case PETSC_VIEWER_NATIVE:
    case PETSC_VIEWER_HDF5_MAT:
      break;
    default:
      SETERRQ1(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"PetscViewerFormat %s not supported for HDF5 input.",PetscViewerFormats[format]);
  }

  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscObjectGetName((PetscObject)mat,&mat_name);CHKERRQ(ierr);
  ierr = PetscViewerHDF5GetAIJNames(viewer,&i_name,&j_name,&a_name,&c_name);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(comm,NULL,"Options for loading matrix from HDF5","Mat");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-matload_block_size","Set the blocksize used to store the matrix","MatLoad",bs,&bs,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  if (flg) {
    ierr = MatSetBlockSize(mat, bs);CHKERRQ(ierr);
  }

  ierr = PetscViewerHDF5PushGroup(viewer,mat_name);CHKERRQ(ierr);
  ierr = PetscViewerHDF5ReadAttribute(viewer,NULL,c_name,PETSC_INT,&N);CHKERRQ(ierr);
  ierr = PetscViewerHDF5ReadSizes(viewer, i_name, NULL, &M);CHKERRQ(ierr);
  --M;  /* i has size M+1 as there is global number of nonzeros stored at the end */

  if (format==PETSC_VIEWER_HDF5_MAT && !mat->symmetric) {
    /* Swap row and columns layout for unallocated matrix. I want to avoid calling MatTranspose() just to transpose sparsity pattern and layout. */
    if (!mat->preallocated) {
      PetscLayout tmp;
      tmp = mat->rmap; mat->rmap = mat->cmap; mat->cmap = tmp;
    } else SETERRQ(comm,PETSC_ERR_SUP,"Not for preallocated matrix - we would need to transpose it here which we want to avoid");
  }

  /* If global sizes are set, check if they are consistent with that given in the file */
  if (mat->rmap->N >= 0 && mat->rmap->N != M) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED,"Inconsistent # of rows: Matrix in file has (%D) and input matrix has (%D)",mat->rmap->N,M);
  if (mat->cmap->N >= 0 && mat->cmap->N != N) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED,"Inconsistent # of cols: Matrix in file has (%D) and input matrix has (%D)",mat->cmap->N,N);

  /* Determine ownership of all (block) rows and columns */
  mat->rmap->N = M;
  mat->cmap->N = N;
  ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
  m = mat->rmap->n;

  /* Read array i (array of row indices) */
  ierr = PetscMalloc1(m+1, &i);CHKERRQ(ierr); /* allocate i with one more position for local number of nonzeros on each rank */
  i[0] = i[m] = 0; /* make the last entry always defined - the code block below overwrites it just on last rank */
  if (rank == size-1) m++; /* in the loaded array i_glob, only the last rank has one more position with the global number of nonzeros */
  M++;
  ierr = ISCreate(comm,&is_i);CHKERRQ(ierr);
  ierr = PetscObjectSetName((PetscObject)is_i,i_name);CHKERRQ(ierr);
  ierr = PetscLayoutSetLocalSize(is_i->map,m);CHKERRQ(ierr);
  ierr = PetscLayoutSetSize(is_i->map,M);CHKERRQ(ierr);
  ierr = ISLoad(is_i,viewer);CHKERRQ(ierr);
  ierr = ISGetIndices(is_i,&i_glob);CHKERRQ(ierr);
  ierr = PetscMemcpy(i,i_glob,m*sizeof(PetscInt));CHKERRQ(ierr);

  /* Reset m and M to the matrix sizes */
  m = mat->rmap->n;
  M--;

  /* Create PetscLayout for j and a vectors; construct ranges first */
  ierr = PetscLayoutCreate(comm,&jmap);CHKERRQ(ierr);
  ierr = PetscCalloc1(size+1, &jmap->range);CHKERRQ(ierr);
  ierr = MPI_Allgather(&i[0], 1, MPIU_INT, jmap->range, 1, MPIU_INT, comm);CHKERRQ(ierr);
  /* Last rank has global number of nonzeros (= length of j and a arrays) in i[m] (last i entry) so broadcast it */
  jmap->range[size] = i[m];
  ierr = MPI_Bcast(&jmap->range[size], 1, MPIU_INT, size-1, comm);CHKERRQ(ierr);
  for (p=size-1; p>0; p--) {
    if (!jmap->range[p]) jmap->range[p] = jmap->range[p+1]; /* for ranks with 0 rows, take the value from the next processor */
  }
  i[m] = jmap->range[rank+1]; /* i[m] (last i entry) is equal to next rank's offset */
  /* Deduce rstart, rend, n and N from the ranges */
  ierr = PetscLayoutSetUp(jmap);CHKERRQ(ierr);

  /* Convert global to local indexing of rows */
  for (p=1; p<m+1; ++p) i[p] -= i[0];
  i[0] = 0;

  /* Read array j (array of column indices) */
  ierr = ISCreate(comm,&is_j);CHKERRQ(ierr);
  ierr = PetscObjectSetName((PetscObject)is_j,j_name);CHKERRQ(ierr);
  ierr = PetscLayoutDuplicate(jmap,&is_j->map);CHKERRQ(ierr);
  ierr = ISLoad(is_j,viewer);CHKERRQ(ierr);
  ierr = ISGetIndices(is_j,&j);CHKERRQ(ierr);

  /* Read array a (array of values) */
  ierr = VecCreate(comm,&vec_a);CHKERRQ(ierr);
  ierr = PetscObjectSetName((PetscObject)vec_a,a_name);CHKERRQ(ierr);
  ierr = PetscLayoutDuplicate(jmap,&vec_a->map);CHKERRQ(ierr);
  ierr = VecLoad(vec_a,viewer);CHKERRQ(ierr);
  ierr = VecGetArrayRead(vec_a,&a);CHKERRQ(ierr);

  /* populate matrix */
  if (!((PetscObject)mat)->type_name) {
    ierr = MatSetType(mat,MATAIJ);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocationCSR(mat,i,j,a);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocationCSR(mat,i,j,a);CHKERRQ(ierr);
  /*
  ierr = MatSeqBAIJSetPreallocationCSR(mat,bs,i,j,a);CHKERRQ(ierr);
  ierr = MatMPIBAIJSetPreallocationCSR(mat,bs,i,j,a);CHKERRQ(ierr);
  */

  if (format==PETSC_VIEWER_HDF5_MAT && !mat->symmetric) {
    /* Transpose the input matrix back */
    ierr = MatTranspose(mat,MAT_INPLACE_MATRIX,&mat);CHKERRQ(ierr);
  }

  ierr = PetscViewerHDF5PopGroup(viewer);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&jmap);CHKERRQ(ierr);
  ierr = PetscFree(i);CHKERRQ(ierr);
  ierr = ISRestoreIndices(is_i,&i_glob);CHKERRQ(ierr);
  ierr = ISRestoreIndices(is_j,&j);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(vec_a,&a);CHKERRQ(ierr);
  ierr = ISDestroy(&is_i);CHKERRQ(ierr);
  ierr = ISDestroy(&is_j);CHKERRQ(ierr);
  ierr = VecDestroy(&vec_a);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
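Example #9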
void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value;
    int flag, tmp_val, str_factor=-1, str_unit=-1, start_iodev=-1; 
    static char myname[] = "ADIOI_PVFS_SETINFO";

    if ((fd->info) == MPI_INFO_NULL) {
	/* This must be part of the open call. can set striping parameters 
           if necessary. */ 
	MPI_Info_create(&(fd->info));
	ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", "disable");
	ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", "disable");
	fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
	fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
	
	/* has user specified any pvfs-specific hints (striping params, listio)
           and do they have the same value on all processes? */
	if (users_info != MPI_INFO_NULL) {
	    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

	    ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
			 value, &flag);
	    if (flag) {
		str_factor=atoi(value);
		tmp_val = str_factor;
		MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
		if (tmp_val != str_factor) {
		    /* --BEGIN ERROR HANDLING-- */
		    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
						       "striping_factor",
						       error_code);
		    return;
		    /* --END ERROR HANDLING-- */
		}
		else ADIOI_Info_set(fd->info, "striping_factor", value);
	    }

	    ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
			 value, &flag);
	    if (flag) {
		str_unit=atoi(value);
		tmp_val = str_unit;
		MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
		if (tmp_val != str_unit) {
		    /* --BEGIN ERROR HANDLING-- */
		    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
						       "striping_unit",
						       error_code);
		    return;
		    /* --END ERROR HANDLING-- */
		}
		else ADIOI_Info_set(fd->info, "striping_unit", value);
	    }

	    ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
			 value, &flag);
	    if (flag) {
		start_iodev=atoi(value);
		tmp_val = start_iodev;
		MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
		if (tmp_val != start_iodev) {
		    /* --BEGIN ERROR HANDLING-- */
		    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
						       "start_iodevice",
						       error_code);
		    return;
		    /* --END ERROR HANDLING-- */
		}
		else ADIOI_Info_set(fd->info, "start_iodevice", value);
	    }

	    ADIOI_Info_get(users_info, "romio_pvfs_listio_read",
			 MPI_MAX_INFO_VAL,
			 value, &flag);
	    if (flag) {
		if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE")) 
		{
		    ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
		    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_ENABLE;
		} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE")) 
		{
		    ADIOI_Info_set(fd->info , "romio_pvfs_listio_read", value);
		    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
		}
		else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) 
		{
		    ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
		    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_AUTO;
		}
		tmp_val = fd->hints->fs_hints.pvfs.listio_read;
		MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
		if (tmp_val != fd->hints->fs_hints.pvfs.listio_read) {
		    /* --BEGIN ERROR HANDLING-- */
		    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
						       "romio_pvfs_listio_read",
						       error_code);
		    return;
		    /* --END ERROR HANDLING-- */
		}
	    }
	    ADIOI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL,
			 value, &flag);
	    if (flag) {
		if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE")) 
		{
		    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
		    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_ENABLE;
		} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE")) 
		{
		    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
		    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
		}
		else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) 
		{
		    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
		    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_AUTO;
		}
		tmp_val = fd->hints->fs_hints.pvfs.listio_write;
		MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
		if (tmp_val != fd->hints->fs_hints.pvfs.listio_write) {
		    /* --BEGIN ERROR HANDLING-- */
		    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
						       "romio_pvfs_listio_write",
						       error_code);
		    return;
		    /* --END ERROR HANDLING-- */
		}
	    }		    
	    ADIOI_Free(value);
	}
    }	

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    *error_code = MPI_SUCCESS;
}
Example #10
int setImageLayerMemoryBuffer(InterColComm * icComm, char const * imageFile, ImageFromMemoryBuffer * imageLayer, uint8_t ** imageBufferPtr, size_t * imageBufferSizePtr)
{
   // Under MPI, only the root process (rank==0) uses imageFile, imageBufferPtr, or imageBufferSizePtr, but nonroot processes need to call this function as well,
   // because the imageBuffer is scattered to all processes during the call to ImageFromMemoryBuffer::setMemoryBuffer().
   int layerNx = imageLayer->getLayerLoc()->nxGlobal;
   int layerNy = imageLayer->getLayerLoc()->nyGlobal;
   int layerNf = imageLayer->getLayerLoc()->nf;
   int bufferNx, bufferNy, bufferNf;
   const uint8_t zeroVal = (uint8_t) 0;
   const uint8_t oneVal = (uint8_t) 255;
   int xStride, yStride, bandStride;
   int rank = icComm->commRank();
   if (rank==0) {
      // Doubleplusungood: much code duplication from PV::Image::readImage
      bool usingTempFile = false;
      char * path = NULL;
      if (strstr(imageFile, "://") != NULL) {
         printf("Image from URL \"%s\"\n", imageFile);
         usingTempFile = true;
         std::string pathstring = "/tmp/temp.XXXXXX";
         const char * ext = strrchr(imageFile, '.');
         if (ext) { pathstring += ext; }
         path = strdup(pathstring.c_str());
         int fid;
         fid=mkstemps(path, strlen(ext));
         if (fid<0) {
            fprintf(stderr,"Cannot create temp image file for image \"%s\".\n", imageFile);
            exit(EXIT_FAILURE);
         }   
         close(fid);
         std::string systemstring;
         if (strstr(imageFile, "s3://") != NULL) {
            systemstring = "aws s3 cp \'";
            systemstring += imageFile;
            systemstring += "\' ";
            systemstring += path;
         }   
         else { // URLs other than s3://
            systemstring = "wget -O ";
            systemstring += path;
            systemstring += " \'";
            systemstring += imageFile;
            systemstring += "\'";
         }   

         int const numAttempts = MAX_FILESYSTEMCALL_TRIES;
         for(int attemptNum = 0; attemptNum < numAttempts; attemptNum++){
            int status = system(systemstring.c_str());
            if(status != 0){ 
               if(attemptNum == numAttempts - 1){ 
                  fprintf(stderr, "download command \"%s\" failed: %s.  Exiting\n", systemstring.c_str(), strerror(errno));
                  exit(EXIT_FAILURE);
               }   
               else{
                  fprintf(stderr, "download command \"%s\" failed: %s.  Retrying %d out of %d.\n", systemstring.c_str(), strerror(errno), attemptNum+1, numAttempts);
                  sleep(1);
               }
            }
            else{
               break;
            }
         }
      }
      else {
         printf("Image from file \"%s\"\n", imageFile);
         path = strdup(imageFile);
      }
      GDALDataset * gdalDataset = PV_GDALOpen(path);
      if (gdalDataset==NULL)
      {
         fprintf(stderr, "setImageLayerMemoryBuffer: GDALOpen failed for image \"%s\".\n", imageFile);
         exit(EXIT_FAILURE);
      }
      int imageNx= gdalDataset->GetRasterXSize();
      int imageNy = gdalDataset->GetRasterYSize();
      int imageNf = GDALGetRasterCount(gdalDataset);
      // Need to rescale so that the short side of the image equals the short side of the layer
      // ImageFromMemoryBuffer layer will handle the cropping.
      double xScaleFactor = (double)layerNx / (double) imageNx;
      double yScaleFactor = (double)layerNy / (double) imageNy;
      size_t imageBufferSize = *imageBufferSizePtr;
      uint8_t * imageBuffer = *imageBufferPtr;
      if (xScaleFactor < yScaleFactor) /* need to rescale so that bufferNy=layerNy and bufferNx>layerNx */
      {
         bufferNx = (int) round(imageNx * yScaleFactor);
         bufferNy = layerNy;
      }
      else {
         bufferNx = layerNx;
         bufferNy = (int) round(imageNy * xScaleFactor);
      }
      bufferNf = layerNf;
      size_t newImageBufferSize = (size_t)bufferNx * (size_t)bufferNy * (size_t)bufferNf;
      if (imageBuffer==NULL || newImageBufferSize != imageBufferSize)
      {
         imageBufferSize = newImageBufferSize;
         imageBuffer = (uint8_t *) realloc(imageBuffer, imageBufferSize*sizeof(uint8_t));
         if (imageBuffer==NULL)
         {
            fprintf(stderr, "setImageLayerMemoryBuffer: Unable to resize image buffer to %d-by-%d-by-%d for image \"%s\": %s\n",
                  bufferNx, bufferNy, bufferNf, imageFile, strerror(errno));
            exit(EXIT_FAILURE);
         }
      }

      bool isBinary = true;
      for (int iBand=0;iBand<imageNf; iBand++)
      {
         GDALRasterBandH hBand = GDALGetRasterBand(gdalDataset,iBand+1);
         char ** metadata = GDALGetMetadata(hBand, "Image_Structure");
         if(CSLCount(metadata) > 0){
            bool found = false;
            for(int i = 0; metadata[i] != NULL; i++){
               if(strcmp(metadata[i], "NBITS=1") == 0){
                  found = true;
                  isBinary &= true;
                  break;
               }
            }
            if(!found){
               isBinary &= false;
            }
         }
         else{
            isBinary = false;
         }
         GDALDataType dataType = gdalDataset->GetRasterBand(iBand+1)->GetRasterDataType(); // Why are we using both GDALGetRasterBand and GDALDataset::GetRasterBand?
         if (dataType != GDT_Byte)
         {
            fprintf(stderr, "setImageLayerMemoryBuffer: Image file \"%s\", band %d, is not GDT_Byte type.\n", imageFile, iBand+1);
            exit(EXIT_FAILURE);
         }
      }

#ifdef PV_USE_OPENMP_THREADS
#pragma omp parallel for
#endif
      for (size_t n=0; n < imageBufferSize; n++)
      {
         imageBuffer[n] = oneVal;
      }

      xStride = bufferNf;
      yStride = bufferNf * bufferNx;
      bandStride = 1;
      gdalDataset->RasterIO(GF_Read, 0/*xOffset*/, 0/*yOffset*/, imageNx, imageNy, imageBuffer, bufferNx, bufferNy,
            GDT_Byte, layerNf, NULL, xStride*sizeof(uint8_t), yStride*sizeof(uint8_t), bandStride*sizeof(uint8_t));

      GDALClose(gdalDataset);
      if (usingTempFile) {
         int rmstatus = remove(path);
         if (rmstatus) {
            fprintf(stderr, "remove(\"%s\") failed.  Exiting.\n", path);
            exit(EXIT_FAILURE);
         }    
      }
      free(path);

      *imageBufferPtr = imageBuffer;
      *imageBufferSizePtr = imageBufferSize;

      int buffersize[3];
      buffersize[0] = bufferNx;
      buffersize[1] = bufferNy;
      buffersize[2] = bufferNf;
      MPI_Bcast(buffersize, 3, MPI_INT, 0, icComm->communicator());
   }
   else {
      int buffersize[3];
      MPI_Bcast(buffersize, 3, MPI_INT, 0, icComm->communicator());
      bufferNx = buffersize[0];
      bufferNy = buffersize[1];
      bufferNf = buffersize[2];
      xStride = bufferNf;
      yStride = bufferNf * bufferNx;
      bandStride = 1;
   }
   imageLayer->setMemoryBuffer(*imageBufferPtr, bufferNy, bufferNx, bufferNf, xStride, yStride, bandStride, zeroVal, oneVal);
   return PV_SUCCESS;
}
Example #11
//-----------------------------------------------------------------------
void receiveN() {
  MPI_Bcast(&n, 1, MPI_INT, FROM_MASTER, MPI_COMM_WORLD);
  return;
}
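//-----------------------------------------------------------------------
// A minimal sketch, not part of the original listing: MPI_Bcast is symmetric,
// so the root rank issues the same call with n as the source buffer. The
// helper name sendN and the globals n / FROM_MASTER are assumed from context.
void sendN() {
  MPI_Bcast(&n, 1, MPI_INT, FROM_MASTER, MPI_COMM_WORLD);
  return;
}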
Example #12
/**
 * Distributed Matrix Multiply using the SUMMA algorithm
 *  Computes C = A*B + C
 * 
 *  This function uses procGridX times procGridY processes
 *   to compute the product
 *  
 *  A is an m by k matrix, each process starts
 *	with a block of A (aBlock) 
 *  
 *  B is a k by n matrix, each process starts
 *	with a block of B (bBlock) 
 *  
 *  C is an m by n matrix, each process starts
 *	with a block of C (cBlock)
 *
 *  The resulting matrix is stored in C.
 *  A and B should not be modified during computation.
 * 
 *  Ablock, Bblock, and CBlock are stored in
 *   column-major format  
 *
 *  pb is the Panel Block Size (Rows sent?)
 **/
void summa(int m, int n, int k, double *Ablock, double *Bblock, double *Cblock,
		int procGridX, int procGridY, int pb) {
			
	int rank, i, j;
	
	MPI_Comm_rank(MPI_COMM_WORLD, &rank); // get processor id
	MPI_Comm rowComm, colComm;
	MPI_Group origGrp, rowGrp, colGrp;

	if(rank >= procGridX * procGridY) return;//Too many processes assigned
			
	int proc_x = rank % procGridX;
	int proc_y = rank / procGridX;
	
	int blockSizeK = k / procGridY;
	int blockSizeM = m / procGridX;
	int blockSizeN = n / procGridY;
	
	if(rank == 0 && 0)
	{
		for(i = 0; i < 4; i++)
			printf("%f ", Ablock[i]);
		printf("\n");
		
		for(i = 0; i < 4; i++)
			printf("%f ", Bblock[i]);
		printf("\n");
	}
	
	int rowRanks[procGridY];
	int colRanks[procGridX];
	
	//Set up communication
	MPI_Comm_group(MPI_COMM_WORLD, &origGrp);
	for(i = 0; i < procGridY; i++)
		rowRanks[i] = proc_x + procGridX * i;
	for(i = 0; i < procGridX; i++)
		colRanks[i] = proc_y * procGridX + i;
		
	MPI_Group_incl(origGrp, procGridY, rowRanks, &rowGrp);
	MPI_Group_incl(origGrp, procGridX, colRanks, &colGrp);
	
	MPI_Comm_create(MPI_COMM_WORLD, rowGrp, &rowComm);
	MPI_Comm_create(MPI_COMM_WORLD, colGrp, &colComm);
	
	MPI_Barrier(MPI_COMM_WORLD);
	
	int rowRank, colRank;
	MPI_Group_rank(rowGrp, &rowRank);
	MPI_Group_rank(colGrp, &colRank);
	
	int memNum = pb > blockSizeK ? pb : blockSizeK;
	double *bufA = (double*)malloc(sizeof(double) * memNum * blockSizeM);
	double *bufB = (double*)malloc(sizeof(double) * memNum * blockSizeN);
	int pbPos = 0;
	int datRecv = 0;
	
	//Loop through blocks
	for(i = 0; i < max(procGridX, procGridY); i++)
	{
		while(datRecv < blockSizeM * blockSizeK)
		{
				
			//Zero the buffers
			memset(bufA, 0, sizeof(double) * blockSizeK * blockSizeM);
			memset(bufB, 0, sizeof(double) * blockSizeK * blockSizeN);
			
			//Row is self
			if(i == rowRank)
				memcpy(&bufA[pbPos * blockSizeM], &Ablock[datRecv], sizeof(double) * pb * blockSizeM);
			
			double bufTemp[pb * blockSizeN];
			//Col is self
			if(i == colRank)
			{
				for(j = 0; j < pb * blockSizeN; j++)
				{
					bufTemp[j] = Bblock[pbPos + j / blockSizeK + (j % blockSizeK) * blockSizeK];
				}
			}
			
			MPI_Bcast(&bufA[datRecv], pb * blockSizeM, MPI_DOUBLE, i, rowComm);
			MPI_Bcast(bufTemp, pb * blockSizeN, MPI_DOUBLE, i, colComm);
			for(j = 0; j < pb * blockSizeN; j++)
			{
				bufB[pbPos + j / blockSizeK + (j % blockSizeK) * blockSizeK] = bufTemp[j];
			}
			
			/*printf("------------\n");
			printf("------------\n");
			print_matrixx(bufA, blockSizeK, blockSizeM);
			printf("------------\n");
			print_matrixx(bufB, blockSizeN, blockSizeK);
			printf("------------\n");
			printf("------------\n");//*/
			
			datRecv += pb * blockSizeM;
			pbPos += pb < blockSizeK ? pb: blockSizeK;//+= pb * blockSizeN;
			local_mm(blockSizeM, blockSizeN, blockSizeK, 1, bufA, blockSizeM, bufB, blockSizeK, 1, Cblock, blockSizeM);
		}
		datRecv = 0;
		if(pbPos >= pb)
			pbPos = 0;
		
		//local_mm(blockSizeM, blockSizeN, blockSizeK, 1, bufA, blockSizeM, bufB, blockSizeK, 1, Cblock, blockSizeM);
	}
	free(bufA);
	free(bufB);
	
	MPI_Barrier(MPI_COMM_WORLD);

}
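The summa routine above calls a local_mm helper that is not included in this example. Judging from the call local_mm(blockSizeM, blockSizeN, blockSizeK, 1, bufA, blockSizeM, bufB, blockSizeK, 1, Cblock, blockSizeM), it appears to be a column-major C = alpha*A*B + beta*C kernel; the sketch below is a naive implementation under that assumption, not the original code:

/* Hedged sketch: a naive column-major local_mm with the signature implied by
   the call sites above (C = alpha*A*B + beta*C). Purely local computation,
   no MPI involved. This is an assumption about the helper, not the original. */
void local_mm(int m, int n, int k,
              double alpha, const double *A, int lda,
              const double *B, int ldb,
              double beta, double *C, int ldc) {
	int i, j, l;
	for (j = 0; j < n; j++) {
		for (i = 0; i < m; i++) {
			double acc = 0.0;
			for (l = 0; l < k; l++)
				acc += A[i + l * lda] * B[l + j * ldb]; /* column-major indexing */
			C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
		}
	}
}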
Example #13
0
static int refine_fm2 (ZZ *zz,
                       HGraph *hg,
                       int p,
                       float *part_sizes,
                       Partition part,
                       PHGPartParams *hgp,
                       float bal_tol
    )
{
    int    i, j, ierr=ZOLTAN_OK, *pins[2]={NULL,NULL}, *lpins[2]={NULL,NULL};
    int    *moves=NULL, *mark=NULL, *adj=NULL, passcnt=0;
    float  *gain=NULL, *lgain=NULL;
    int    best_cutsizeat, cont, successivefails=0;
    double total_weight, weights[2], total_lweight, lweights[2], lwadjust[2],
        max_weight[2], lmax_weight[2], avail[2], gavail[2];
    int availcnt[2], gavailcnt[2];
    double targetw0, ltargetw0, minvw=DBL_MAX;
    double cutsize, best_cutsize, 
        best_limbal, imbal, limbal;
    HEAP   heap[2];
    char   *yo="refine_fm2";
    int    part_dim = (hg->VtxWeightDim ? hg->VtxWeightDim : 1);
#ifdef HANDLE_ISOLATED_VERTICES    
    int    isocnt=hg->nVtx; /* only root uses isocnt, isolated vertices
                               are kept at the end of moves array */
    int    *deg=NULL, *ldeg=NULL;
#if 0
    double best_imbal;
#endif
#endif
    PHGComm *hgc=hg->comm;
    int rootRank;
    
    struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
    int do_timing = (hgp->use_timers > 2);
    int detail_timing = (hgp->use_timers > 3);

    
    ZOLTAN_TRACE_ENTER(zz, yo);

    if (p != 2) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "p!=2 not allowed for refine_fm2.");
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_FATAL;
    }

    /* return only if globally there is no edge or vertex */
    if (!hg->dist_y[hgc->nProc_y] || hg->dist_x[hgc->nProc_x] == 0) {
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_OK;
    }


#ifdef USE_SERIAL_REFINEMENT_ON_ONE_PROC
    if (hgc->nProc==1) /* only one proc? use serial code */
        return serial_fm2 (zz, hg, p, part_sizes, part, hgp, bal_tol);
#endif

    if (do_timing) { 
        if (timer->rfrefine < 0) 
            timer->rfrefine = Zoltan_Timer_Init(zz->ZTime, 1, "Ref_P_Total");
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfrefine, hgc->Communicator);
    }
    if (detail_timing) {
        if (timer->rfpins < 0) 
            timer->rfpins = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pins");
        if (timer->rfiso < 0) 
            timer->rfiso = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_IsolatedVert");
        if (timer->rfgain < 0) 
            timer->rfgain = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Gain");
        if (timer->rfheap < 0) 
            timer->rfheap = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Heap");
        if (timer->rfpass < 0) 
            timer->rfpass = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pass");
        if (timer->rfroll < 0) 
            timer->rfroll = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Roll");
        if (timer->rfnonroot < 0) 
            timer->rfnonroot = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_NonRoot");
    }
    
    
    /* find the index of the proc in the column group with
       the most #nonzeros; it will be our root
       proc for computing moves, since it has better
       knowledge about the global hypergraph.
       We ignore the returned #pins (i) on the root */
    Zoltan_PHG_Find_Root(hg->nPins, hgc->myProc_y, hgc->col_comm, 
                         &i, &rootRank);
    
    /* Calculate the weights in each partition and total, then maxima */
    weights[0] = weights[1] = 0.0;
    lweights[0] = lweights[1] = 0.0;
    if (hg->vwgt) 
        for (i = 0; i < hg->nVtx; i++) {
            lweights[part[i]] += hg->vwgt[i*hg->VtxWeightDim];
            minvw = (minvw > hg->vwgt[i*hg->VtxWeightDim]) 
                  ? hg->vwgt[i*hg->VtxWeightDim] 
                  : minvw;
        }
    else {
        minvw = 1.0;
        for (i = 0; i < hg->nVtx; i++)
            lweights[part[i]] += 1.0;
    }

    MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    total_weight = weights[0] + weights[1];
    targetw0 = total_weight * part_sizes[0]; /* global target weight for part 0 */

    max_weight[0] = total_weight * bal_tol * part_sizes[0];
    max_weight[1] = total_weight * bal_tol * part_sizes[part_dim]; /* should be (1 - part_sizes[0]) */


    if (weights[0]==0.0) {
        ltargetw0 = targetw0 / hgc->nProc_x;
        lmax_weight[0] = max_weight[0] / hgc->nProc_x;
    } else {
        lmax_weight[0] = (weights[0]==0.0) ? 0.0 : lweights[0] +
            (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
        ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
    }
    if (weights[1]==0.0)
        lmax_weight[1] = max_weight[1] / hgc->nProc_x;
    else
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );

    total_lweight = lweights[0]+lweights[1];
    
    avail[0] = MAX(0.0, lmax_weight[0]-total_lweight);
    avail[1] = MAX(0.0, lmax_weight[1]-total_lweight);
    availcnt[0] = (avail[0] == 0) ? 1 : 0;
    availcnt[1] = (avail[1] == 0) ? 1 : 0; 
    MPI_Allreduce(avail, gavail, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    MPI_Allreduce(availcnt, gavailcnt, 2, MPI_INT, MPI_SUM, hgc->row_comm);

#ifdef _DEBUG
    if (gavailcnt[0] || gavailcnt[1])
        uprintf(hgc, "before adjustment, LMW[%.1lf, %.1lf]\n", lmax_weight[0], lmax_weight[1]);
#endif

    if (gavailcnt[0]) 
        lmax_weight[0] += gavail[0] / (double) gavailcnt[0];
    
    if (gavailcnt[1]) 
        lmax_weight[1] += gavail[1] / (double) gavailcnt[1];
    
    /* Our strategy is to stay close to the current local weight balance.
       We do not need the same local balance on each proc, as long as
       we achieve approximate global balance.                            */

#ifdef _DEBUG
    imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
    limbal = (ltargetw0==0.0) ? 0.0 : fabs(lweights[0]-ltargetw0)/ltargetw0;
    uprintf(hgc, "H(%d, %d, %d), FM2: W[%.1lf, %.1lf] MW:[%.1lf, %.1lf] I=%.3lf  LW[%.1lf, %.1lf] LMW[%.1lf, %.1lf] LI=%.3lf\n", hg->nVtx, hg->nEdge, hg->nPins, weights[0], weights[1], max_weight[0], max_weight[1], imbal, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1], limbal);
#endif

    
    if ((hg->nEdge && (!(pins[0]    = (int*) ZOLTAN_MALLOC(2 * hg->nEdge * sizeof(int)))
                      || !(lpins[0] = (int*) ZOLTAN_CALLOC(2 * hg->nEdge, sizeof(int))))) ||
        (hg->nVtx && (!(moves   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))
                     || !(lgain = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float))))))
        MEMORY_ERROR;

    if (hg->nEdge) {
        pins[1] = &(pins[0][hg->nEdge]);
        lpins[1] = &(lpins[0][hg->nEdge]);
    }

    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/
        if (hg->nVtx &&
            (!(mark     = (int*)   ZOLTAN_CALLOC(hg->nVtx, sizeof(int)))
             || !(adj   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))   
             || !(gain  = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float)))))
            MEMORY_ERROR;
        Zoltan_Heap_Init(zz, &heap[0], hg->nVtx);
        Zoltan_Heap_Init(zz, &heap[1], hg->nVtx);  
    }

    /* Initial calculation of the local pin distribution (sigma in UVC's papers)  */
    if (detail_timing)         
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);                        
    for (i = 0; i < hg->nEdge; ++i)
        for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j)
            ++(lpins[part[hg->hvertex[j]]][i]);
    if (detail_timing)         
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);                    
    

#ifdef HANDLE_ISOLATED_VERTICES        
    /* first compute vertex degrees to find any isolated vertices;
       we reuse lgain and gain as ldeg and deg. */
    if (hg->nVtx) {
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);        
        ldeg = (int *) lgain;
        deg = (int *) gain; /* null for non-root but that is fine */
        for (i = 0; i < hg->nVtx; ++i)
            ldeg[i] = hg->vindex[i+1] - hg->vindex[i];
        MPI_Reduce(ldeg, deg, hg->nVtx, MPI_INT, MPI_SUM, rootRank,
                   hg->comm->col_comm);

        if (hgc->myProc_y==rootRank) { /* root marks isolated vertices */
            for (i=0; i<hg->nVtx; ++i)
                if (!hgp->UseFixedVtx || hg->fixed_part[i]<0) {
                    if (!deg[i]) {
                        moves[--isocnt] = i;
                        part[i] = -(part[i]+1); /* remove those vertices from that part*/
                    }
                }
        }   
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);        

    }
#endif
    
    do {
        int v=1, movecnt=0, neggaincnt=0, from, to;
        int maxneggain = (hgp->fm_max_neg_move < 0) ? hg->nVtx : hgp->fm_max_neg_move;
        int notfeasible=(weights[0]>max_weight[0]) || (weights[1]>max_weight[1]);
    
        /* now compute global pin distribution */
        if (hg->nEdge) {
            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);                    
            MPI_Allreduce(lpins[0], pins[0], 2*hg->nEdge, MPI_INT, MPI_SUM, 
                          hgc->row_comm);
            if (detail_timing)         
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);                    
        }

        /* now we can compute actual cut */
        best_cutsizeat=0;
        cutsize = 0.0;
        for (i=0; i < hg->nEdge; ++i) {
            if (pins[0][i] && pins[1][i])
                cutsize += (hg->ewgt ? hg->ewgt[i] : 1.0);
        }
        MPI_Allreduce(&cutsize, &best_cutsize, 1, MPI_DOUBLE, MPI_SUM, hgc->col_comm);
        cutsize = best_cutsize;

        imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;        
        best_limbal = limbal = (ltargetw0==0.0) ? 0.0
            : fabs(lweights[0]-ltargetw0)/ltargetw0;

        /* UVCUVC: it looks like alternating the 'from' part, instead of always
           moving from the overloaded part, gives better results.
           Hence, if the imbalance is not really bad (2x worse), we use that approach  */
        if (imbal > BADBALANCE*(bal_tol-1.0) ) /* decide which way the moves will be in this pass */
            from = (weights[0] < targetw0) ? 1 : 0;
        else 
            from = passcnt % 2; 
        /* we want to be sure that everybody!!! picks the same source */
        MPI_Bcast(&from, 1, MPI_INT, 0, hgc->Communicator); 

        to = 1-from;
        
#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part, p);
        if (best_cutsize!=cutsize) {
            errexit("%s: Initial cutsize=%.2lf Verify: total=%.2lf\n", uMe(hgc), cutsize,
                    best_cutsize);
        }
        if (hgc->myProc_y==rootRank)
            for (i = 0; i< hg->nVtx; ++i)
                if (mark[i])
                    errexit("mark[%d]=%d", i, mark[i]);
        /* debuggging code ends here */
#endif

        /* compute only the gains of the vertices from 'from' part */
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfgain, hgc->Communicator);                    
        
        for (i = 0; i < hg->nVtx; ++i) {
            lgain[i] = 0.0;
            if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                for (j = hg->vindex[i]; j < hg->vindex[i+1]; j++) {
                    int edge = hg->vedge[j];
                    if ((pins[0][edge]+pins[1][edge])>1) { /* if they have at least 2 pins :) */
                        if (pins[part[i]][edge] == 1)
                            lgain[i] += (hg->ewgt ? hg->ewgt[edge] : 1.0);
                        else if (pins[1-part[i]][edge] == 0)
                            lgain[i] -= (hg->ewgt ? hg->ewgt[edge] : 1.0);
                    }
                }
        }
        /* now sum up all gains on only root proc */
        if (hg->nVtx)
            MPI_Reduce(lgain, gain, hg->nVtx, MPI_FLOAT, MPI_SUM, rootRank, 
                       hgc->col_comm);
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfgain, hgc->Communicator);                    
        

        if (hgp->output_level >= PHG_DEBUG_ALL) {
            imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
            printf("%s FM Pass %d (%d->%d) Cut=%.2f W[%5.0f, %5.0f] I= %.2f LW[%5.0f, %5.0f] LI= %.2f\n", uMe(hgc), passcnt, from, to, cutsize, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal);
        }

        if (hgc->myProc_y==rootRank) {
            /* these are the lucky ones; each proc in the column group
               could have computed the same moves concurrently, but for this
               version we do it on the root proc and broadcast */

#ifdef HANDLE_ISOLATED_VERTICES
            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);                    
            lwadjust[0] = lwadjust[1] = 0.0;
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int   u=moves[i], pno=-part[u]-1;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                if (pno<0 || pno>1)
                    errexit("heeeey pno=%d", pno);
                /* let's remove it from its part */
                lwadjust[pno] -= w;                
            }
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
            if (detail_timing)         
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);                    
#endif

            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfheap, hgc->Communicator);                    
            
            /* Initialize the heaps and fill them with the gain values */
            Zoltan_Heap_Clear(&heap[from]);  
            for (i = 0; i < hg->nVtx; ++i)
                if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                    Zoltan_Heap_Input(&heap[from], i, gain[i]);
            Zoltan_Heap_Make(&heap[from]);
            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfheap, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpass, hgc->Communicator);
            }

            while ((neggaincnt < maxneggain) && ((lweights[to]+minvw) <= lmax_weight[to]) ) {
                if (Zoltan_Heap_Empty(&heap[from])) { /* too bad it is empty */
                    v = -1;
                    break;
                }
                
                v = Zoltan_Heap_Extract_Max(&heap[from]);    
                
#ifdef _DEBUG
                if (from != part[v])
                    errexit("hooop from=%d part[%d]=%d", from, v, part[v]);
#endif

                /* Mark the vertex we picked from the heap so it is "locked".
                   For the current strategy, moving in only one direction
                   at a time, the mark information is not critical.
                   Note that the mark array is also used in the move/update
                   routine, so don't remove it! */
                ++mark[v];
                if (lweights[to]+((hg->vwgt)?hg->vwgt[v*hg->VtxWeightDim]:1.0) > lmax_weight[to]) {
#ifdef _DEBUG2                    
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] NF\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1]);
#endif
                    /* Negative value in moves array means we have examined 
                       the vertex but couldn't move it. Note offset by one,
                       otherwise zero would be ambiguous. */
                    moves[movecnt++] = -(v+1);
                    continue;
                } 

                    
                moves[movecnt] = v;
                ++neggaincnt;
                cutsize -= gain[v];

                fm2_move_vertex_oneway(v, hg, part, gain, heap, pins, lpins, weights, lweights, mark, adj);
                imbal = (targetw0==0.0) ? 0.0
                    : fabs(weights[0]-targetw0)/targetw0;
                limbal = (ltargetw0==0.0) ? 0.0
                    : fabs(lweights[0]-ltargetw0)/ltargetw0;

                if (notfeasible || (cutsize<best_cutsize) ||
                                   (cutsize==best_cutsize && limbal < best_limbal)) {
#ifdef _DEBUG2                    
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d W[%4.0lf, %4.0lf] I:%.2lf LW[%4.0lf, %4.0lf] LI:%.2lf C:%.1lf<-- Best\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal, cutsize); /* after move gain is -oldgain */
#endif
                    notfeasible = weights[from]>max_weight[from];
                    best_cutsize = cutsize;
                    best_cutsizeat = movecnt+1;
                    best_limbal = limbal;
                    neggaincnt = 0;
                }
#ifdef _DEBUG2                
                else
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] %.1lf\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], cutsize);
#endif
                ++movecnt;
            }

            
            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpass, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfroll, hgc->Communicator);
            }

#ifdef _DEBUG
	    if (v<0)
                uprintf(hgc, "EOLB @ %d there was no vertex to select: v=%d\n", movecnt, v);
	    else if (neggaincnt >= maxneggain) 
                uprintf(hgc, "EOLB @ %d max neg move reached neggaincnt(%d) >= maxneggain\n", movecnt, neggaincnt, maxneggain);
	    else 
                uprintf(hgc, "EOLB @ %d balance constraint LW[%.1lf, %.1lf] and MAXW[%.1lf, %.1lf]\n", movecnt, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1]);
#endif
            
            /* roll back the moves without any improvement */
            for (i=movecnt-1; i>=best_cutsizeat; --i) {
                int v = moves[i];
                if (v<0)
                    v = -v-1;
                else /* no need to roll back global pins or weights etc.; rolling back the local ones suffices */
                    fm2_move_vertex_oneway_nonroot(v, hg, part, lpins, lweights);
                mark[v] = 0;
            }
            for (i=0; i<best_cutsizeat; ++i){
                int v = (moves[i] < 0 ) ? -moves[i] - 1 : moves[i];
                mark[v] = 0;
            }
            if (detail_timing) 
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfroll, hgc->Communicator);            
        }

        if (detail_timing) 
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfnonroot, hgc->Communicator);            
        
        /* now root bcast moves to column procs */
        MPI_Bcast(&best_cutsizeat, 1, MPI_INT, rootRank, hgc->col_comm);
        MPI_Bcast(moves, best_cutsizeat, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) { /* now non-root does move simulation */
            for (i=0; i<best_cutsizeat; ++i) {
                int v = moves[i];
                if (v>=0)
                    fm2_move_vertex_oneway_nonroot(v, hg, part, lpins, lweights);
            }
        }
        if (detail_timing) 
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfnonroot, hgc->Communicator);            

        
#ifdef _DEBUG
        for (i = 0; i < hg->nEdge; ++i) {
            int lp[2];

            lp[0] = lp[1] = 0;
            for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j)
                ++(lp[part[hg->hvertex[j]]]);
            if ((lp[0] != lpins[0][i]) || (lp[1] != lpins[1][i]))
                errexit("for net %d -- lp=[%d, %d] lpins[%d, %d]", i, lp[0], lp[1], lpins[0][i], lpins[1][i]);
        }
#endif


#ifdef HANDLE_ISOLATED_VERTICES
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);        
        
#if 0
        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);        
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)             
            uprintf(hgc, "BEFORE ISOLATED VERTEX HANDLING WE *THINK* GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif
        
        if (hgc->myProc_y==rootRank) {
            best_limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;
            
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i], npno;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                npno = (lweights[0] < ltargetw0) ? 0 : 1;
                lweights[npno] += w;
                lwadjust[npno] += w;
                part[u] = -(npno+1); /* move to npno (which might be the same
                                        as pno, so it may not be a real move) */
            }
            limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;
#if 0           
            uprintf(hgc, "before binpacking of %d isolated vertices balance was: %.3lf now: %.3lf\n", hg->nVtx-isocnt, best_limbal, limbal);
#endif
        }

        MPI_Bcast(lwadjust, 2, MPI_DOUBLE, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) {
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
        }
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);                
#endif        
        
        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
#if 0       
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)             
            uprintf(hgc, "NEW GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif
        
        if (weights[0]==0.0) 
            ltargetw0 = lmax_weight[0] = 0.0;
        else {
            lmax_weight[0] = lweights[0] +
                (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
            ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
        }
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );
        
        cont = 0;
        MPI_Allreduce(&best_cutsizeat, &cont, 1, MPI_INT, MPI_LOR, hgc->row_comm);

        /* since we're only moving in one direction, make sure two successive
           passes didn't produce any improvement before terminating */
        if (!cont)
            ++successivefails; 
        else
            successivefails = 0; 
#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part, p);
        imbal = (targetw0 == 0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        printf("%s End of Pass %d Comp.Cut=%.2lf RealCut=%.2lf W[%5.0lf, %5.0lf] Imbal=%.2lf\n", uMe(hgc), passcnt, cutsize, best_cutsize, weights[0], weights[1], imbal);
        /* debuggging code ends here */
#endif
    } while (successivefails<2 &&  (++passcnt < hgp->fm_loop_limit));


#ifdef HANDLE_ISOLATED_VERTICES
    if (detail_timing)         
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);            
    /* now the root sends the final part numbers of the isolated vertices, if any */
    MPI_Bcast(&isocnt, 1, MPI_INT, rootRank, hgc->col_comm);
    if (isocnt<hg->nVtx) {
        deg = (int *) lgain; /* we'll use for part no's of isolated vertices */
        if (hgc->myProc_y==rootRank) 
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i];
                deg[i] = part[u] = -part[u]-1; 
            }
            
        MPI_Bcast(&moves[isocnt], hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        MPI_Bcast(&deg[isocnt], hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) 
            for (i=isocnt; i < hg->nVtx; ++i)  /* go over isolated vertices */
                part[moves[i]] = deg[i];
    }
    if (detail_timing)         
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);            
#endif
    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/        
        Zoltan_Multifree(__FILE__,__LINE__, 3, &mark, &adj, &gain);
        Zoltan_Heap_Free(&heap[0]);
        Zoltan_Heap_Free(&heap[1]);        
    }
    
 End:    
    Zoltan_Multifree(__FILE__, __LINE__, 4, &pins[0], &lpins[0], &moves, &lgain);

    if (do_timing) 
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfrefine, hgc->Communicator);
    
    
    ZOLTAN_TRACE_EXIT(zz, yo);
    return ierr;
}
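For readers tracing the gain arithmetic in refine_fm2: the lgain loop followed by the MPI_Reduce onto the root accumulates the classic FM gain per vertex. The sketch below restates that gain for a single vertex once the global pin counts are available; it is an illustration mirroring the loop above (using the HGraph and Partition types from the Zoltan headers), not part of Zoltan itself:

/* Illustrative sketch only (not Zoltan code): FM gain of moving vertex v out
   of its current part, given global pin counts pins[0..1][0..nEdge-1].
   A net contributes +weight if v is its only pin in v's part (the net becomes
   uncut after the move) and -weight if the net has no pin in the other part
   (the net becomes cut). Single-pin nets are ignored. */
static float fm2_gain_of_vertex(int v, const HGraph *hg, Partition part,
                                int *pins[2])
{
    float g = 0.0f;
    int j;
    for (j = hg->vindex[v]; j < hg->vindex[v+1]; ++j) {
        int   e = hg->vedge[j];
        float w = hg->ewgt ? hg->ewgt[e] : 1.0f;
        if (pins[0][e] + pins[1][e] > 1) {
            if (pins[part[v]][e] == 1)
                g += w;                   /* net becomes uncut */
            else if (pins[1-part[v]][e] == 0)
                g -= w;                   /* net becomes cut   */
        }
    }
    return g;
}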
Example #14
0
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, 
         Num_procsy;      /* number of ranks in each coord direction             */
  int    Num_groupsx, 
         Num_groupsy;     /* number of blocks in each coord direction            */
  int    my_group;        /* sequence number of shared memory block              */
  int    my_group_IDx,
         my_group_IDy;    /* coordinates of block within block grid              */
  int    group_size;      /* number of ranks in shared memory group              */
  int    group_sizex,
         group_sizey;     /* number of ranks in block in each coord direction    */
  int    my_ID;           /* MPI rank                                            */
  int    my_global_IDx, 
         my_global_IDy;   /* coordinates of rank in overall rank grid            */
  int    my_local_IDx, 
         my_local_IDy;    /* coordinates of rank within shared memory block      */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  int    local_nbr[4];    /* list of synchronizing local neighbors               */
  int    num_local_nbrs;  /* number of synchronizing local neighbors             */
  int    dummy;
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  long   n, width, height;/* linear global and block grid dimension              */
  int    width_rank, 
         height_rank;     /* linear local dimension                              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart_rank, 
         iend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    jstart_rank, 
         jend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    istart, iend;    /* bounds of grid block containing tile                */
  int    jstart, jend;    /* bounds of grid block containing tile                */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;  /* value to be matched by computed norm                */
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime; 
  int    stencil_size;    /* number of points in stencil                         */
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */
  MPI_Status  status[8];  /* corresponding statuses                              */
  MPI_Win shm_win_in;     /* shared memory window object for IN array            */
  MPI_Win shm_win_out;    /* shared memory window object for OUT array           */
  MPI_Comm shm_comm_prep; /* preparatory shared memory communicator              */
  MPI_Comm shm_comm;      /* Shared Memory Communicator                          */
  int shm_procs;          /* # of ranks in shared memory domain                  */
  int shm_ID;             /* MPI rank in shared memory domain                    */
  MPI_Aint size_in;       /* size of the IN array in shared memory window        */
  MPI_Aint size_out;      /* size of the OUT array in shared memory window       */
  int size_mul;           /* one for shm_comm root, zero for the other ranks     */
  int disp_unit;          /* ignored                                             */
 
  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM stencil execution on 2D grid\n");

#ifndef STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif
    
    if (argc != 4){
      printf("Usage: %s  <#ranks per coherence domain><# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations  = atoi(*++argv); 
    if (iterations < 0){
      printf("ERROR: iterations must be >= 0 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atol(*++argv);
    long nsquare = n * n;
    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);

  MPI_Bcast(&n,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now. The 
     decomposition needs to be such that shared memory groups can evenly
     tessellate the rank grid
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) {
        if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) {
          group_sizey=group_size/group_sizex;
          break;
        }
      }
      if (!(Num_procsy%group_sizey)) break;
    }
  }      


  if (my_ID == root) {
    printf("Number of ranks                 = %d\n", Num_procs);
    printf("Grid size                       = %ld\n", n);
    printf("Radius of stencil               = %d\n", RADIUS);
    printf("Tiles in x/y-direction          = %d/%d\n", Num_procsx, Num_procsy);
    printf("Tiles per shared memory domain  = %d\n", group_size);
    printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex,  group_sizey);
    printf("Type of stencil                 = star\n");
#ifdef LOCAL_BARRIER_SYNCH
    printf("Local synchronization           = barrier\n");
#else
    printf("Local synchronization           = point to point\n");
#endif
#ifdef DOUBLE
    printf("Data type                       = double precision\n");
#else
    printf("Data type                       = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations            = %d\n", iterations);
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that an SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);
  
  Num_groupsx = Num_procsx/group_sizex;
  Num_groupsy = Num_procsy/group_sizey;

  my_group = my_ID/group_size;
  my_group_IDx = my_group%Num_groupsx;
  my_group_IDy = my_group/Num_groupsx;
  my_local_IDx = my_ID%group_sizex;
  my_local_IDy = (my_ID%group_size)/group_sizex;
  my_global_IDx = my_group_IDx*group_sizex+my_local_IDx;
  my_global_IDy = my_group_IDy*group_sizey+my_local_IDy;

  /* set all neighboring ranks to -1 (no communication with those ranks) */
  left_nbr = right_nbr = top_nbr = bottom_nbr = -1;
  /* keep track of local neighbors for local synchronization             */
  num_local_nbrs = 0;

  if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) {
    right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1;
  }
  if (my_local_IDx != group_sizex-1) {
    local_nbr[num_local_nbrs++] = shm_ID + 1;
  }

  if (my_local_IDx == 0 && my_group_IDx != 0) {
    left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1;
  }
  if (my_local_IDx != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - 1;
  }

  if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) {
    top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx;
  }
  if (my_local_IDy != group_sizey-1) {
    local_nbr[num_local_nbrs++] = shm_ID + group_sizex;
  }

  if (my_local_IDy == 0 && my_group_IDy != 0) {
    bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx;
  }
  if (my_local_IDy != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - group_sizex;
  }

  /* compute amount of space required for input and solution arrays for the block,
     and also compute index sets                                                  */
  
  width = n/Num_groupsx;
  leftover = n%Num_groupsx;
  if (my_group_IDx<leftover) {
    istart = (width+1) * my_group_IDx; 
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_group_IDx-leftover);
    iend = istart + width - 1;
  }
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_groupsy;
  leftover = n%Num_groupsy;
  if (my_group_IDy<leftover) {
    jstart = (height+1) * my_group_IDy; 
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_group_IDy-leftover);
    jend = jstart + height - 1;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n",
           my_ID, width, height);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  total_length_out = width*height*sizeof(DTYPE);

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  size_in= total_length_in*size_mul; 
  MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &in, &shm_win_in);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in);
  MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in);
  if (in == NULL){
    printf("Error allocating space for input array by group %d\n",my_group);
    error = 1;
  }
  bail_out(error);

  size_out= total_length_out*size_mul;
  MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &out, &shm_win_out);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out);
  MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out);
  if (out == NULL){
    printf("Error allocating space for output array by group %d\n", my_group);
    error = 1;
  }
  bail_out(error);

  /* determine index set assigned to each rank                         */

  width_rank = width/group_sizex;
  leftover = width%group_sizex;
  if (my_local_IDx<leftover) {
    istart_rank = (width_rank+1) * my_local_IDx; 
    iend_rank = istart_rank + width_rank;
  }
  else {
    istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover);
    iend_rank = istart_rank + width_rank - 1;
  }
  istart_rank += istart;
  iend_rank += istart;
  width_rank = iend_rank - istart_rank + 1;   

  height_rank = height/group_sizey;
  leftover = height%group_sizey;
  if (my_local_IDy<leftover) {
    jstart_rank = (height_rank+1) * my_local_IDy; 
    jend_rank = jstart_rank + height_rank;
  }
  else {
    jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover);
    jend_rank = jstart_rank + height_rank - 1;
  }
  jstart_rank+=jstart;
  jend_rank+=jstart;
  height_rank = jend_rank - jstart_rank + 1;

  if (height_rank*width_rank==0) {
    error = 1;
    printf("Rank %d has no work to do\n", my_ID);
  }
  bail_out(error);

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*width_rank);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width_rank;
  bottom_buf_out = top_buf_out + 2*RADIUS*width_rank;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width_rank;
 
  right_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*height_rank);
  if (!right_buf_out) { 
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height_rank;
  left_buf_out   = right_buf_out + 2*RADIUS*height_rank;
  left_buf_in    = right_buf_out + 3*RADIUS*height_rank;

  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* initialize the input and output arrays                                     */
  for (j=jstart_rank; j<=jend_rank; j++) for (i=istart_rank; i<=iend_rank; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* LOAD/STORE FENCE */
  MPI_Win_sync(shm_win_in);
  MPI_Win_sync(shm_win_out);
  MPI_Barrier(shm_comm); 

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (top_nbr != -1) {
      MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, 
                MPI_COMM_WORLD, &(request[0]));
    }

    if (bottom_nbr != -1) {
      MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, 
                MPI_COMM_WORLD, &(request[3]));
      for (kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101,
 	  MPI_COMM_WORLD, &(request[2]));
      }

    if (top_nbr != -1) {
      MPI_Wait(&(request[0]), &(status[0]));
      MPI_Wait(&(request[1]), &(status[1]));
      for (kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = top_buf_in[kk++];
      }
    }

    if (bottom_nbr != -1) {    
      MPI_Wait(&(request[2]), &(status[2]));
      MPI_Wait(&(request[3]), &(status[3]));
      for (kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = bottom_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (right_nbr != -1) {
      MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank-RADIUS+1; i<=iend_rank; i++) {
        right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, 
                MPI_COMM_WORLD, &(request[0+4]));
    }

    if (left_nbr != -1) {
      MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, 
                MPI_COMM_WORLD, &(request[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank; i<=istart_rank+RADIUS-1; i++) {
        left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }

    if (right_nbr != -1) {
      MPI_Wait(&(request[0+4]), &(status[0+4]));
      MPI_Wait(&(request[1+4]), &(status[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank+1; i<=iend_rank+RADIUS; i++) {
        IN(i,j) = right_buf_in[kk++];
      }
    }

    if (left_nbr != -1) {
      MPI_Wait(&(request[2+4]), &(status[2+4]));
      MPI_Wait(&(request[3+4]), &(status[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank-RADIUS; i<=istart_rank-1; i++) {
        IN(i,j) = left_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* Apply the stencil operator */
    for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
      for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_out);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it
#else
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif

    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart_rank; j<=jend_rank; j++) 
    for (i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0;

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it
#else
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif
 
  } /* end of iterations                                                   */
 
  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);
  
  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
  for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
    for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }
 
  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
  /* verify correctness                                                          */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);
 
  MPI_Win_unlock_all(shm_win_in);
  MPI_Win_unlock_all(shm_win_out);
  MPI_Win_free(&shm_win_in);
  MPI_Win_free(&shm_win_out);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input array              */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 
  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
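The stencil kernel above depends on IN, OUT, and WEIGHT macros (as well as DTYPE, COEFX/COEFY, MPI_DTYPE, wtime, and bail_out) that are defined elsewhere in the Parallel Research Kernels sources and are not part of this listing. As a hedged sketch only, the index macros plausibly map a global index (i,j) into the locally stored block, whose input array carries a RADIUS-wide ghost border:

/* Plausible forms only (assumptions, not copied from the PRK headers):
   a global index (i,j) is offset by the block origin (istart, jstart);
   the input array has a RADIUS-wide ghost region, the output does not. */
#define IN(i,j)       in[ (i)-istart + RADIUS + ((j)-jstart + RADIUS)*(width + 2*RADIUS) ]
#define OUT(i,j)      out[ (i)-istart + ((j)-jstart)*width ]
#define WEIGHT(ii,jj) weight[ (ii)+RADIUS ][ (jj)+RADIUS ]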
Example #15
0
int chaco_dist_graph(
  MPI_Comm comm,		/* MPI Communicator */
  PARIO_INFO_PTR pio_info,      /* Parallel IO info */
  int     host_proc,		/* processor where all the data is initially */
  int     *gnvtxs,		/* number of vertices in global graph */
  int     *nvtxs,		/* number of vertices in local graph */
  int     **xadj,		/* start of edge list for each vertex */
  int     **adjncy,		/* edge list data */
  int     *vwgt_dim,            /* number of weights per vertex */
  float   **vwgts,		/* vertex weight list data */
  int     *ewgt_dim,            /* number of weights per edge */
  float   **ewgts,		/* edge weight list data */
  int     *ndim,                /* dimension of the geometry */
  float   **x,                  /* x-coordinates of the vertices */
  float   **y,                  /* y-coordinates of the vertices */
  float   **z,                  /* z-coordinates of the vertices */
  short   **assignments         /* assignments from Chaco file; may be NULL */
)
{
  const char *yo = "chaco_dist_graph";
  int nprocs, myproc, i, j, k, n, p, nedges, nsend, max_nvtxs, v, adj_cnt;
  int offset, use_graph, nvtx_edges;
  int *old_xadj = NULL, *old_adjncy = NULL, *size = NULL;
  int *send_xadj = NULL, *send_adjncy = NULL;
  int *vtx_list = NULL;
  float *old_x = NULL, *old_y = NULL, *old_z = NULL;
  float *send_x = NULL, *send_y = NULL, *send_z = NULL;
  float *old_vwgts = NULL, *old_ewgts = NULL;
  float *send_vwgts = NULL, *send_ewgts = NULL;
  MPI_Status status;

  /* Determine number of processors and my rank. */
  MPI_Comm_size (comm, &nprocs );
  MPI_Comm_rank (comm, &myproc );

  DEBUG_TRACE_START(myproc, yo);

  /* Initialize */
  use_graph = (*xadj  != NULL);
 
  /* Handle serial case and return. */
  if (nprocs == 1) {
    /* Set values expected to be returned by this function. */
    /* All array pointers are unchanged.                    */
    *gnvtxs = *nvtxs;
    /* Initialize distribution, so other routines using it work. */
    ch_dist_init(nprocs, *gnvtxs, pio_info, assignments, host_proc, comm);

    return 1;
  }

  /* Broadcast to all procs */
  MPI_Bcast( vwgt_dim, 1, MPI_INT, host_proc, comm);
  MPI_Bcast( ewgt_dim, 1, MPI_INT, host_proc, comm);
  MPI_Bcast( &use_graph, 1, MPI_INT, host_proc, comm);
  MPI_Bcast( ndim, 1, MPI_INT, host_proc, comm);
  MPI_Bcast( nvtxs, 1, MPI_INT, host_proc, comm);
  *gnvtxs = *nvtxs;

  /* Initialize the chaco distribution on all processors */
  ch_dist_init(nprocs, *gnvtxs, pio_info, assignments, host_proc, comm);
  
  /* Store pointers to original data */
  if (myproc == host_proc) {
    old_xadj   = *xadj;
    *xadj      = NULL;
    old_adjncy = *adjncy;
    *adjncy    = NULL;
    old_x      = *x;
    old_y      = *y;
    old_z      = *z;
  }

  /* Allocate space for new distributed graph data */
  n = *nvtxs = ch_dist_num_vtx(myproc, *assignments);

  if (use_graph) {
    *xadj = (int *) malloc((n+1)*sizeof(int));
    if (*xadj == NULL) {
      Gen_Error(0, "fatal: insufficient memory");
      return 0;
    }
  }
  if (*vwgt_dim){
    old_vwgts = *vwgts;
    *vwgts = NULL;
    if (n > 0) {
      *vwgts = (float *) malloc(n*(*vwgt_dim)*sizeof(float));
      if (*vwgts == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
    }
  }
  if (*ndim > 0) {
    *x = (float *)  malloc(n*sizeof(float));
    if (*ndim > 1) {
      *y = (float *)  malloc(n*sizeof(float));
      if (*ndim > 2) {
        *z = (float *)  malloc(n*sizeof(float));
      }
    }
  }


  /* 
   * Distribute vertex data (xadj, coordinates, etc ) to all procs 
   */

  if (myproc == host_proc){

    /* Allocate space for send buffers  (size = max num vtx per proc ) */
    max_nvtxs = ch_dist_max_num_vtx(*assignments);
    if (use_graph) {
      send_xadj = (int *) malloc((max_nvtxs+1)*sizeof(int));
      if (send_xadj == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
    }
    if (*vwgt_dim) {
      send_vwgts = (float *) malloc(max_nvtxs*(*vwgt_dim)*sizeof(float));
      if (send_vwgts == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
    }
    if (*ndim > 0) {
      send_x = (float *) malloc(max_nvtxs*sizeof(float));
      if (send_x == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
      if (*ndim > 1) {
        send_y = (float *) malloc(max_nvtxs*sizeof(float));
        if (send_y == NULL) {
          Gen_Error(0, "fatal: insufficient memory");
          return 0;
        }
        if (*ndim > 2) {
          send_z = (float *) malloc(max_nvtxs*sizeof(float));
          if (send_z == NULL) {
            Gen_Error(0, "fatal: insufficient memory");
            return 0;
          }
        }
      }
    }

    /* Allocate space for list of vertices on a given processor */
    vtx_list = (int *) malloc(max_nvtxs*sizeof(int));
    if (vtx_list == NULL) {
      Gen_Error(0, "fatal: insufficient memory");
      return 0;
    }

    /* Allocate array to accumulate number of edges to be sent to each proc. */
    if (use_graph) {
      size = (int *) malloc(nprocs*sizeof(int));
      if (size == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
    }

    /* For each processor, gather its vertex information and send it. */
    for (p = 0; p < nprocs; p++){
      if (use_graph) size[p] = 0;

      /* Get list of vertices to be assigned to processor p */
      ch_dist_vtx_list(vtx_list, &nsend, p, *assignments);

      if (p == myproc){

        /* Loop over vertices assigned to myproc; copy the vertex */
        /* data into local arrays.                                */

        if (use_graph) (*xadj)[0] = 0;
        for (i = 0; i < nsend; i++) {
          v = vtx_list[i];
          if (use_graph) {
            size[p] += old_xadj[v+1]-old_xadj[v];
            (*xadj)[i+1] = (*xadj)[i] + old_xadj[v+1] - old_xadj[v];
          }

          if (*vwgt_dim){
            for (j=0; j<*vwgt_dim; j++)
              (*vwgts)[i*(*vwgt_dim)+j] = old_vwgts[v*(*vwgt_dim)+j];
          }

          if (*ndim > 0) {
            (*x)[i] = old_x[v];
            if (*ndim > 1) {
              (*y)[i] = old_y[v];
              if (*ndim > 2) {
                (*z)[i] = old_z[v];
              }
            }
          }
        }
      }
      else {

        /* Loop over vertices assigned to proc p to gather */
        /* vertex data into send buffers                   */

        if (use_graph) send_xadj[0] = 0;
        for (i = 0; i < nsend; i++) {
          v = vtx_list[i];
          if (use_graph) {
            size[p] += old_xadj[v+1]-old_xadj[v];
            send_xadj[i+1] = send_xadj[i] + old_xadj[v+1] - old_xadj[v];
          }
          if (*vwgt_dim){
            for (j=0; j<*vwgt_dim; j++)
              send_vwgts[i*(*vwgt_dim)+j] = old_vwgts[v*(*vwgt_dim)+j];
          }
          if (*ndim > 0) {
            send_x[i] = old_x[v];
            if (*ndim > 1) {
              send_y[i] = old_y[v];
              if (*ndim > 2) {
                send_z[i] = old_z[v];
              }
            }
          }
        }

        /* Send vertex data to proc p. */
        if (use_graph)
          MPI_Send(send_xadj, nsend+1, MPI_INT, p, 1, comm);
        if (*vwgt_dim)
          MPI_Send(send_vwgts, nsend*(*vwgt_dim), MPI_FLOAT, p, 2, comm);
        if (*ndim > 0) {
          MPI_Send(send_x, nsend, MPI_FLOAT, p, 3, comm);
          if (*ndim > 1) {
            MPI_Send(send_y, nsend, MPI_FLOAT, p, 4, comm);
            if (*ndim > 2) {
              MPI_Send(send_z, nsend, MPI_FLOAT, p, 5, comm);
            }
          }
        }
      }
    }
    safe_free((void **)(void *) &send_xadj);
    safe_free((void **)(void *) &send_vwgts);
    safe_free((void **)(void *) &send_x);
    safe_free((void **)(void *) &send_y);
    safe_free((void **)(void *) &send_z);
  }
  else {
    /* host_proc != myproc; receive vertex data from host_proc */
    if (use_graph)
      MPI_Recv (*xadj, (*nvtxs)+1, MPI_INT, host_proc, 1, comm, &status);
    if (*vwgt_dim)
      MPI_Recv (*vwgts, (*nvtxs)*(*vwgt_dim), MPI_FLOAT, host_proc, 2, comm, &status);
    if (*ndim > 0) {
      MPI_Recv(*x, *nvtxs,  MPI_FLOAT, host_proc, 3, comm, &status);
      if (*ndim > 1) {
        MPI_Recv(*y, *nvtxs, MPI_FLOAT, host_proc, 4, comm, &status);
        if (*ndim > 2) {
          MPI_Recv(*z, *nvtxs, MPI_FLOAT, host_proc, 5, comm, &status);
        }
      }
    }
  }


  /* 
   * Distribute edge data to all procs 
   */

  if (use_graph) {

    if (*ewgt_dim) {
      old_ewgts = *ewgts;
      *ewgts = NULL;
    }

    /* Allocate space for edge data */
    nedges = (*xadj)[ *nvtxs];
    if (nedges > 0) {
      *adjncy = (int *) malloc(nedges * sizeof (int));
      if (*adjncy == NULL) {
        Gen_Error(0, "fatal: insufficient memory");
        return 0;
      }
      if (*ewgt_dim){
        *ewgts = (float *) malloc(nedges*(*ewgt_dim) * sizeof (float));
        if (*ewgts == NULL) {
          Gen_Error(0, "fatal: insufficient memory");
          return 0;
        }
      }
    }

    /* Gather and send/receive edge data */

    if (myproc == host_proc){

      /* For each processor, gather its edge data and send it. */
      for (p = 0; p < nprocs; p++){
        if (size[p] == 0) continue;

        /* Get list of vertices to be assigned to processor p */
        ch_dist_vtx_list(vtx_list, &nsend, p, *assignments);

        adj_cnt = 0;
        if (p == myproc) {

          /* Loop over vertices assigned to myproc; copy the edge */
          /* data into local arrays.                              */

          for (i = 0; i < nsend; i++) {
            v = vtx_list[i];
            offset = old_xadj[v];
            nvtx_edges = old_xadj[v+1] - old_xadj[v];
            for (j = 0; j < nvtx_edges; j++) {
              (*adjncy)[adj_cnt] = old_adjncy[offset+j];
              if (*ewgt_dim){ 
                for (k=0; k<*ewgt_dim; k++)
                  (*ewgts)[adj_cnt*(*ewgt_dim)+k] = old_ewgts[(offset+j)*(*ewgt_dim)+k];
              }
              adj_cnt++;
            }
          }
        }
        else { /* p != myproc */

          /* allocate send buffers; size = num edges to send to proc p */
          nvtx_edges = 0;
          for (i = 0; i < nsend; i++) {
            v = vtx_list[i];
            nvtx_edges += old_xadj[v+1] - old_xadj[v];
          }
          send_adjncy = (int *) malloc(nvtx_edges * sizeof(int));
          if (send_adjncy == NULL) {
            Gen_Error(0, "fatal: insufficient memory");
            return 0;
          }
          if (*ewgt_dim) {
            send_ewgts = (float *) malloc(nvtx_edges*(*ewgt_dim) * sizeof(float));
            if (send_ewgts == NULL) {
              Gen_Error(0, "fatal: insufficient memory");
              return 0;
            }
          }

          /* Loop over vertices assigned to proc p to gather */
          /* edge data into send buffers                     */

          for (i = 0; i < nsend; i++) {
            v = vtx_list[i];
            offset = old_xadj[v];
            nvtx_edges = old_xadj[v+1] - old_xadj[v];
            for (j = 0; j < nvtx_edges; j++) {
              send_adjncy[adj_cnt] = old_adjncy[offset+j];
              if (*ewgt_dim){
                for (k=0; k<*ewgt_dim; k++)
                  send_ewgts[adj_cnt*(*ewgt_dim)+k] = old_ewgts[(offset+j)*(*ewgt_dim)+k];
              }
              adj_cnt++;
            }
          }
          /* Send edge data to proc p. */
          MPI_Send(send_adjncy, size[p], MPI_INT, p, 6, comm);
          if (*ewgt_dim)
            MPI_Send(send_ewgts, size[p]*(*ewgt_dim), MPI_FLOAT, p, 7, comm);
          safe_free((void **)(void *) &send_adjncy);
          safe_free((void **)(void *) &send_ewgts);
        }
      }
    }
    else {
      /* host_proc != myproc; receive edge data from host_proc */
      if (nedges > 0) {
        MPI_Recv (*adjncy, nedges, MPI_INT, host_proc, 6, comm, &status);
        if (*ewgt_dim)
          MPI_Recv (*ewgts, nedges*(*ewgt_dim), MPI_FLOAT, host_proc, 7, comm, &status);
      }
    }
  }

  /* Free space on host proc */
  if (myproc == host_proc){
    safe_free((void **)(void *) &old_xadj);
    safe_free((void **)(void *) &old_adjncy);
    safe_free((void **)(void *) &old_vwgts);
    safe_free((void **)(void *) &old_ewgts);

    safe_free((void **)(void *) &old_x);
    safe_free((void **)(void *) &old_y);
    safe_free((void **)(void *) &old_z);
    safe_free((void **)(void *) &vtx_list);
  }
  if (size != NULL) safe_free((void **)(void *) &size);
   
  DEBUG_TRACE_END(myproc, yo);
  return 1;
}
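
The routine above follows a host-scatter pattern: the host rank packs one buffer per destination and ships it with plain point-to-point sends under fixed tags, while every other rank posts the matching receives. The stand-alone sketch below is not part of the original code; the per-rank item counts and payload are made up purely to show the pattern in its minimal form.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
  int myproc, nprocs, host_proc = 0;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  int nmine = myproc + 1;                      /* rank p owns p+1 items (made up) */
  int *mine = (int *) malloc(nmine * sizeof(int));

  if (myproc == host_proc) {
    /* host fills and sends each rank's slice, one message per rank */
    for (int p = 0; p < nprocs; p++) {
      int n = p + 1;
      int *buf = (int *) malloc(n * sizeof(int));
      for (int i = 0; i < n; i++) buf[i] = p;  /* payload: the owner's rank */
      if (p == host_proc) {
        for (int i = 0; i < n; i++) mine[i] = buf[i];  /* local copy, no MPI */
      }
      else {
        MPI_Send(buf, n, MPI_INT, p, 1, MPI_COMM_WORLD);
      }
      free(buf);
    }
  }
  else {
    MPI_Status status;
    MPI_Recv(mine, nmine, MPI_INT, host_proc, 1, MPI_COMM_WORLD, &status);
  }

  printf("rank %d received %d items, first = %d\n", myproc, nmine, mine[0]);
  free(mine);
  MPI_Finalize();
  return 0;
}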
Beispiel #16
0
/*
 * Test main function
 */
int test_noise(Test_time_result_type *times, int mes_length, int num_repeats, int num_noise_repeats, int loading, int num_noise_procs )
{
	int* mode_array=NULL;
	init_test_data( &td );
	
	int proc1, proc2;
	MPI_Status status;
	MPI_Request send_request;
	MPI_Request recv_request;
	
	MPI_Request* requests_noise=NULL;
	MPI_Status*  statuses_noise=NULL; 
	
	int sync_sum;

	int i, j, k, l;
	px_my_time_type time_beg,time_end;
	px_my_time_type sum;
	px_my_time_type st_deviation;
	
	int flag;
	int work_flag=1;

	int command[2];
				
	int remote_proc;

	/*
	 * Try to allocate enough memory; if any allocation fails, return -1.
	 * requests_noise holds room for both the send and the receive request
	 * of every noise process.
	 */
	requests_noise=(MPI_Request *)malloc(2*num_noise_procs*sizeof(MPI_Request));
	if(requests_noise == NULL )
	{
		return -1;
	}

	statuses_noise=(MPI_Status *)malloc(2*num_noise_procs*sizeof(MPI_Status));
	if(statuses_noise == NULL )
	{
		return -1;
	}

	mode_array=(int *)malloc(comm_size*sizeof(int));
	if(mode_array==NULL)
	{
		return -1;
	}

	if ( !alloc_test_data( &td, mes_length, num_repeats, loading, num_noise_procs ) )
	{
		return -1;
	}

	/*
	 * OK, let's begin the test part
	 */
	srand( (unsigned)time( NULL ) );
	
	for(i=0; i<comm_size; i++)
	for(j=0; j<num_repeats; j++)
	{
		td.tmp_results[i][j] = 0;
	}

	if(comm_rank==0)
	{
		/* Uncomment to debug 
		printf("HELLO! I'm 0, press any key\n");
		getchar();
		*/

		for(proc1=0;proc1<comm_size; proc1++)
		for(proc2=0;proc2<comm_size; proc2++)
		{
			flag=init_mode_array(proc1,proc2,num_noise_procs,comm_size,mode_array);
			if(flag)
			{
				return -1;
			}
			
			for(i=0;i<num_repeats;i++)
			{

				MPI_Bcast( mode_array, comm_size, MPI_INT, 0, MPI_COMM_WORLD );
				
				command[0]=i; /* Iteration number */
				command[1]=proc2; /* Leader: the process whose message-passing durations are stored */

				if(proc1!=0)
				{
					MPI_Send(&command,2,MPI_INT,proc1,1,MPI_COMM_WORLD);
				}
				if((proc2!=0)&&(proc2!=proc1))
				{
					MPI_Send(&command,2,MPI_INT,proc2,1,MPI_COMM_WORLD);
				}
				
				/*
				 *
				 * Process 0 takes part in the goal (measured) message exchange
				 *
				 */
				if(mode_array[0]==MODE_GOAL_MESSAGES)
				{
					if(proc1==0)
					{
						remote_proc=proc2;
					}
					else
					{
						remote_proc=proc1;
					}

					time_beg=px_my_cpu_time();

						MPI_Isend( td.send_data[remote_proc], mes_length, MPI_BYTE, remote_proc, 0, MPI_COMM_WORLD, &send_request);
						MPI_Irecv( td.recv_data[remote_proc], mes_length, MPI_BYTE, remote_proc, 0, MPI_COMM_WORLD, &recv_request);
						MPI_Wait( &send_request, &status );
						MPI_Wait( &recv_request, &status );

					time_end = px_my_cpu_time();
					
					/*
					 *
					 * command[0] -- current iteration number in the message-passing repeats
					 * command[1] -- process leader in the pair
					 *
					 */
					if(proc2==0)
					{
						td.tmp_results[proc1][i] = (px_my_time_type)(time_end - time_beg);
					}					
				}
				

				/*
				 *
				 * Process 0 takes part in the noise message exchange
				 *
				 */
				if(mode_array[0]==MODE_NOISE_MESSAGES)
				{
						for( j = 0; j < num_noise_repeats; j++ )
						{
							k=0;
							for( l = 1; l < comm_size; l++ )
							{
								if( mode_array[l]==MODE_NOISE_MESSAGES )
								{
									MPI_Isend( td.send_data_noise[l], loading, MPI_BYTE, l, 0, MPI_COMM_WORLD, &requests_noise[k] );
									MPI_Irecv( td.recv_data_noise[l], loading, MPI_BYTE, l, 0, MPI_COMM_WORLD, &requests_noise[k+1]);
									k+=2;
								}
						   }

						   MPI_Waitall(k,requests_noise,statuses_noise);
						}
				
				}


				/*
				 *
				 * This reduce is used for process synchronization: every process
				 * sends its rank to process 0.
				 *
				 */
				MPI_Reduce(&comm_rank,&sync_sum,1,MPI_INT,MPI_SUM,0,MPI_COMM_WORLD);

			} /* end for (num_repeats) */
			
		} /* End for proc1,proc2 */
		
		/*
		 *
		 * Finishing work 
		 *
		 */
		for(i=0;i<comm_size;i++)
		{
			mode_array[i]=MODE_FINISH_WORK;
		}
			
		MPI_Bcast( mode_array, comm_size, MPI_INT, 0, MPI_COMM_WORLD );
		MPI_Reduce(&comm_rank,&sync_sum,1,MPI_INT,MPI_SUM,0,MPI_COMM_WORLD);



	} /* end if(comm_rank==0) */
	else
	{
		while(work_flag)
		{
		 	MPI_Bcast( mode_array, comm_size, MPI_INT, 0, MPI_COMM_WORLD );
		 	switch(mode_array[comm_rank])
		 	{
				case MODE_GOAL_MESSAGES:
					
					MPI_Recv(&command,2,MPI_INT,0,1,MPI_COMM_WORLD,&status);
					
					remote_proc=comm_rank;					
					for(i=0;i<comm_size;i++)
					{
						if((mode_array[i]==MODE_GOAL_MESSAGES)&&(i!=comm_rank))
						{
							remote_proc=i;
							break;
						}						
					}

					time_beg=px_my_cpu_time();

						MPI_Isend( td.send_data[remote_proc], mes_length, MPI_BYTE, remote_proc, 0, MPI_COMM_WORLD, &send_request);
						MPI_Irecv( td.recv_data[remote_proc], mes_length, MPI_BYTE, remote_proc, 0, MPI_COMM_WORLD, &recv_request);
						MPI_Wait( &send_request, &status );
						MPI_Wait( &recv_request, &status );

					time_end = px_my_cpu_time();
					
					/*
					 *
					 * command[0] -- current iteration number in the message-passing repeats
					 * command[1] -- process leader in the pair
					 *
					 */
					if(comm_rank==command[1])
					{
						td.tmp_results[remote_proc][command[0]] = (px_my_time_type)(time_end - time_beg);
					}
					
				break;
				case MODE_NOISE_MESSAGES:
						for( i = 0; i < num_noise_repeats; i++ )
						{
							k=0;
							for( j = 0; j < comm_size; j++ )
							{
								if( (j != comm_rank) && (mode_array[j] == MODE_NOISE_MESSAGES ) )
								{
									MPI_Isend( td.send_data_noise[j], loading, MPI_BYTE, j, 0, MPI_COMM_WORLD, &requests_noise[k] );
									MPI_Irecv( td.recv_data_noise[j], loading, MPI_BYTE, j, 0, MPI_COMM_WORLD, &requests_noise[k+1]);
									k+=2;
								}
						   }

						   MPI_Waitall(k,requests_noise,statuses_noise);
						}
				
				break;
				case MODE_FINISH_WORK:
					work_flag=0;
				break;
		 	}
			
			MPI_Reduce(&comm_rank,&sync_sum,1,MPI_INT,MPI_SUM,0,MPI_COMM_WORLD);

		} /* end while work_flag */ 	
	}	/* end else if(comm_rank==0) */

	/*
	 * Reduce the raw timings to average, standard deviation, median and minimum
	 */
	for( i = 0; i < comm_size; i++ )
	{
		sum = 0;
		for( j = 0; j < num_repeats; j++ )
		{
			sum += td.tmp_results[i][j];
		}
		times[i].average=(sum/(double)num_repeats);
 			
 		st_deviation=0;
 		for(j=0;j<num_repeats;j++)
 		{
  		 	st_deviation+=(td.tmp_results[i][j]-times[i].average)*(td.tmp_results[i][j]-times[i].average);
		}
 		st_deviation/=(double)(num_repeats);
 		times[i].deviation=sqrt(st_deviation);
 		
		/*
		 *
		 * The comparison function my_time_cmp is declared in 'network_test.h'
		 * and implemented in 'network_test.cpp'.
		 *
		 */
 		qsort(td.tmp_results[i], num_repeats, sizeof(px_my_time_type), my_time_cmp );
 		times[i].median=td.tmp_results[i][num_repeats/2]; 	
 		
 		times[i].min=td.tmp_results[i][0]; 	
	}

	/*
	 * Free memory
	 */
	free( requests_noise );
	free( statuses_noise );
	free( mode_array );

	free_test_data( &td );

	return 0;
}
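
The quantity stored in tmp_results above is the wall time of one nonblocking exchange: both partners post MPI_Isend and MPI_Irecv and then wait for both requests. A minimal stand-alone sketch of just that kernel, between ranks 0 and 1, is given below; the message length is made up and px_my_cpu_time is replaced by MPI_Wtime, so this only approximates the original measurement.

#include <mpi.h>
#include <stdio.h>
#include <string.h>

#define MSG_LEN 1024   /* made-up message length */

int main(int argc, char **argv)
{
	int rank, size;
	char sendbuf[MSG_LEN], recvbuf[MSG_LEN];
	MPI_Request sreq, rreq;
	MPI_Status status;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	memset(sendbuf, rank, MSG_LEN);

	if (size >= 2 && rank < 2) {
		int peer = 1 - rank;
		double t0 = MPI_Wtime();

		/* the timed round trip: post both transfers, then wait for both */
		MPI_Isend(sendbuf, MSG_LEN, MPI_BYTE, peer, 0, MPI_COMM_WORLD, &sreq);
		MPI_Irecv(recvbuf, MSG_LEN, MPI_BYTE, peer, 0, MPI_COMM_WORLD, &rreq);
		MPI_Wait(&sreq, &status);
		MPI_Wait(&rreq, &status);

		printf("rank %d: exchange took %g s\n", rank, MPI_Wtime() - t0);
	}

	MPI_Finalize();
	return 0;
}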
Beispiel #17
0
void distribute_cells ( int myid, int np, int nspt, int dim, int *nbpaths )
{
   int nbcell,*cell,nbsols,i,j,k,left[np],L,R,*m,sn,mysolnum;
   int count,labSize,cnt,dest;
   MPI_Status status;
   MPI_Comm com = MPI_COMM_WORLD;
   int length[nspt],*labels ;
   int fail,A[2],n;
   double normal[dim],sol[2*(dim-1)+5],*c;
   lisStack *s;
   int *a,*b;
   
   n=dim-1;A[0]=n;
   if(myid == 0)
   { 
      fail=celcon_number_of_cells(&nbcell);  /* get the number of cells */
      cell=(int*)calloc(nbcell,sizeof(int));
      for(i=0; i<nbcell; i++)
         fail = celcon_mixed_volume(i+1,&cell[i]);

      nbsols=0;
      for(i=0; i<nbcell; i++) nbsols=nbsols+cell[i];   
      if(v>0) printf("The number cells are %d\n",nbcell);
      if(v>0) print_cell(nbcell,cell);
      if(v>0) printf("The total solutions are %d\n",nbsols);
   }
   MPI_Bcast(&nbsols,1,MPI_INT,0,com);
   
   if(myid == 0)
   {
      if(v>0) printf("\n");
      left[0] = 0;
      for(i=1; i<np; i++)
         if(i <= (nbsols%(np-1))) 
            left[i] = nbsols/(np-1)+1; 
         else
            left[i] = nbsols/(np-1);
      if(v>0) printf("left:");
      if(v>0) Print_Integer_Array(np,left);
   }

   MPI_Scatter(left,1,MPI_INT,&mysolnum,1,MPI_INT,0,com);

   if(myid == 0)
   {
      fail = celcon_number_of_points_in_cell(1,nspt,length);
      labSize=0;
      for(i=0; i<nspt; i++) labSize = labSize+length[i];
      labSize = 1+nspt+labSize;
   }
   MPI_Bcast(&labSize,1,MPI_INT,0,MPI_COMM_WORLD);
   labels = (int*)calloc(labSize,sizeof(int));
   m=(int*)calloc(3,sizeof(int));

   if(myid==0)
   {
      L=1; R=np-1;
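      /* L and R walk the worker quotas in left[] from the front and the back;
         each cell's path count is carved off below until every worker has
         been assigned its full quota. */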
      for(i=0; i<nbcell; i++)
      { 
         m[0] = i+1;
         m[2] = labSize;
         celcon_retrieve_mixed_cell(dim,nspt,i+1,labels,normal);
         
         sn=1;
         while(cell[i]!=0)
         { 
            if(cell[i]>=left[L])
            {   
               m[1] = left[L]; m[2] = sn;      
	       MPI_Send(&m[1],2,MPI_INT,L,SEND_CELL,com);
               /*
               if(v>0)
                  printf("%2d paths from cell %d is sending to node %d\n",
                         m[1],m[0],L);
                */
               MPI_Send(labels,labSize,MPI_INT,L,SEND_SUPP,com);
               MPI_Send(normal,dim,MPI_DOUBLE,L,SEND_NORMAL,com) ;
               cell[i] = cell[i]-left[L]; sn = sn+left[L];
               L++;  
            }
            else if(cell[i]>=left[R])
            { 
               m[1] = left[R]; m[2] = sn;      
               MPI_Send(&m[1],2,MPI_INT,R,SEND_CELL,com);
               /*
               if(v>0)
                  printf("%2d paths from cell %d is sending to node %d\n",
                         m[1],m[0],R);
               */
               MPI_Send(labels,labSize,MPI_INT,R,SEND_SUPP,com);
               MPI_Send(normal,dim,MPI_DOUBLE,R,SEND_NORMAL,com) ;
               cell[i] = cell[i]-left[R]; sn = sn+left[R];
               R--;   
            }
            else
            {              
               m[1] = cell[i]; m[2] = sn;     
               MPI_Send(&m[1],2,MPI_INT,R,SEND_CELL,com);
               /*
               if(v>0)
                  printf("%2d paths from cell %d is sending to node %d\n",
                         m[1],m[0],R);
               */
               MPI_Send(labels,labSize,MPI_INT,R,SEND_SUPP,com);
               MPI_Send(normal,dim,MPI_DOUBLE,R,SEND_NORMAL,com);
               left[R]=left[R]-cell[i]; sn = sn+cell[i];
               cell[i]=0;  
            }
         }
      }
      if(v>0) printf("****************************************************\n");
      printf("writing random coefficient system and its solutions to file\n");
      fail = celcon_write_random_coefficient_system();
      fail = solcon_write_solution_dimensions_to_defined_output_file(nbsols,n);
      cnt = 0;
      for(k=1; k<=nbsols; k++)
      {
         MPI_Recv(&A[1],1,MPI_INT,MPI_ANY_SOURCE,SEND_MUL,
                  MPI_COMM_WORLD,&status);
         MPI_Recv(sol,2*n+5,MPI_DOUBLE,MPI_ANY_SOURCE,SEND_SOL,
                  MPI_COMM_WORLD,&status);
         fail = solcon_write_next_solution_to_defined_output_file
                  (&cnt,n,A[1],sol);
      }
      *nbpaths = nbsols;

   } /* myid=0 finish */
   else
   {  
      *nbpaths = mysolnum;
      if(v>0) printf("Node %d has %d paths\n",myid,mysolnum);
      s=(lisStack*)calloc(1,sizeof(lisStack));
      ls_init(s);
      count = 0; sn = 0;
      while(count < mysolnum)
      {  
         MPI_Recv(&m[1],2,MPI_INT,0,SEND_CELL,com,&status);
         sn++;
         m[0] = sn;
         ls_push(s,m);
         count = count+m[1];
         /*
     	 if(v>0) printf("Node %d is receving %2d paths from cell %d\n",
		        myid,m[1],m[0]);
         */
	 MPI_Recv(labels,labSize,MPI_INT,0,SEND_SUPP,com,&status) ;
	 MPI_Recv(normal,dim,MPI_DOUBLE,0,SEND_NORMAL,com,&status) ;
      	 fail = celcon_append_mixed_cell(dim,nspt,labSize,labels,normal);
      }
      fail = celcon_create_polyhedral_homotopy();
      
      for(i=1; i<=sn; i++)
      {
         m = ls_cur(s);
         fail = celcon_solve_start_system(m[0],&R);

         if(fail == 1)
         {
            printf("Solving start system failed.\n");
            printf("Node %d skips cell %d with volume %d...\n",myid,m[0],R);
         }
         else
         {
    /* printf("found %d start solutions from cell %d\n",R,m[0]); */

            fail=celcon_mixed_volume(m[0],&L);

    /* if(R==L)  printf("#start solutions equals mixed volume %d, OK\n",L);
       else  printf("#start solutions not equals mixed volume %d!!!, \n",L);
    */
            for(j=m[2]; j<m[1]+m[2]; j++)
            {          
               fail = celcon_track_solution_path(m[0],j,0);
               fail = solcon_clear_solutions();
               fail = celcon_copy_target_solution_to_container(m[0],j-m[2]+1);
               fail = solcon_retrieve_solution(n,1,&A[1],sol);
               MPI_Send(&A[1],1,MPI_INT,0,SEND_MUL,MPI_COMM_WORLD);
               MPI_Send(sol,2*n+5,MPI_DOUBLE,0,SEND_SOL,MPI_COMM_WORLD);
            }
         }
         ls_pop(s);
      }
   } /* end else */

 /*  MPI_Barrier(com);  */  
   if(myid == 0)
      free(cell);
   else
      free(s);
   free(labels);
   free(m);
}
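
The scheduling in distribute_cells is plain bookkeeping, independent of MPI: nbsols paths are split into per-worker quotas left[], and each cell's path count is then carved off from the left (L) and right (R) ends of that quota list. The stand-alone sketch below reproduces only that bookkeeping with made-up cell sizes and prints which worker receives how many paths from which cell.

#include <stdio.h>

int main(void)
{
   int cell[] = { 7, 3, 9, 5 };      /* made-up path counts per mixed cell */
   int nbcell = 4, np = 4;           /* one manager plus np-1 = 3 workers  */
   int left[4], nbsols = 0;
   int i, L, R;

   for(i=0; i<nbcell; i++) nbsols += cell[i];

   left[0] = 0;                      /* the manager takes no paths */
   for(i=1; i<np; i++)
      if(i <= (nbsols%(np-1)))
         left[i] = nbsols/(np-1)+1;
      else
         left[i] = nbsols/(np-1);

   L = 1; R = np-1;
   for(i=0; i<nbcell; i++)
      while(cell[i] != 0)
      {
         if(cell[i] >= left[L])      /* cell covers L's whole quota */
         {
            printf("cell %d -> worker %d : %d paths\n", i+1, L, left[L]);
            cell[i] -= left[L]; L++;
         }
         else if(cell[i] >= left[R]) /* or covers R's whole quota   */
         {
            printf("cell %d -> worker %d : %d paths\n", i+1, R, left[R]);
            cell[i] -= left[R]; R--;
         }
         else                        /* remainder shrinks R's quota */
         {
            printf("cell %d -> worker %d : %d paths\n", i+1, R, cell[i]);
            left[R] -= cell[i]; cell[i] = 0;
         }
      }
   return 0;
}

With these made-up counts (24 paths over 3 workers) every worker ends up with exactly 8 paths, mirroring how the real code fills one node's quota before moving on to the next.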
Beispiel #18
0
int main(int argc, char **argv)
{
    int *writebuf, *readbuf, i, mynod, nprocs, len, err;
    char *filename;
    int errs=0, toterrs;
    MPI_Datatype newtype;
    MPI_File fh;
    MPI_Status status;
    MPI_Info info;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mynod);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

/* process 0 takes the file name as a command-line argument and
   broadcasts it to other processes */
    if (!mynod) {
	i = 1;
	while ((i < argc) && strcmp("-fname", *argv)) {
	    i++;
	    argv++;
	}
	if (i >= argc) {
	    fprintf(stderr, "\n*#  Usage: coll_test -fname filename\n\n");
	    MPI_Abort(MPI_COMM_WORLD, 1);
	}
	argv++;
	len = strlen(*argv);
	filename = (char *) malloc(len+1);
	strcpy(filename, *argv);
	MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Bcast(filename, len+1, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
	MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
	filename = (char *) malloc(len+1);
	MPI_Bcast(filename, len+1, MPI_CHAR, 0, MPI_COMM_WORLD);
    }

    writebuf = (int *) malloc(BUFSIZE*sizeof(int));
    readbuf = (int *) malloc(BUFSIZE*sizeof(int));

/* test atomicity of contiguous accesses */

/* initialize file to all zeros */
    if (!mynod) {
	MPI_File_delete(filename, MPI_INFO_NULL);
	MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_CREATE |
             MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
	for (i=0; i<BUFSIZE; i++) writebuf[i] = 0;
	MPI_File_write(fh, writebuf, BUFSIZE, MPI_INT, &status);
	MPI_File_close(&fh);
#if VERBOSE
	fprintf(stderr, "\ntesting contiguous accesses\n");
#endif
    }
    MPI_Barrier(MPI_COMM_WORLD);

    for (i=0; i<BUFSIZE; i++) writebuf[i] = 10;
    for (i=0; i<BUFSIZE; i++) readbuf[i] = 20;

    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE |
             MPI_MODE_RDWR, MPI_INFO_NULL, &fh);

/* set atomicity to true */
    err = MPI_File_set_atomicity(fh, 1);
    if (err != MPI_SUCCESS) {
	fprintf(stderr, "Atomic mode not supported on this file system.\n");fflush(stderr);
	MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Barrier(MPI_COMM_WORLD);

/* process 0 writes and others concurrently read. In atomic mode,
   the data read must be either all old values or all new values; nothing
   in between. */

    if (!mynod) MPI_File_write(fh, writebuf, BUFSIZE, MPI_INT, &status);
    else {
	err = MPI_File_read(fh, readbuf, BUFSIZE, MPI_INT, &status);
	if (err == MPI_SUCCESS) {
	    if (readbuf[0] == 0) { /* the rest must also be 0 */
		for (i=1; i<BUFSIZE; i++)
		    if (readbuf[i] != 0) {
			errs++;
			fprintf(stderr, "Process %d: readbuf[%d] is %d, should be 0\n", mynod, i, readbuf[i]);
			MPI_Abort(MPI_COMM_WORLD, 1);
		    }
	    }
	    else if (readbuf[0] == 10) { /* the rest must also be 10 */
		for (i=1; i<BUFSIZE; i++)
		    if (readbuf[i] != 10) {
			errs++;
			fprintf(stderr, "Process %d: readbuf[%d] is %d, should be 10\n", mynod, i, readbuf[i]);
			MPI_Abort(MPI_COMM_WORLD, 1);
		    }
	    }
	    else {
		errs++;
		fprintf(stderr, "Process %d: readbuf[0] is %d, should be either 0 or 10\n", mynod, readbuf[0]);
	    }
	}
    }

    MPI_File_close(&fh);

    MPI_Barrier(MPI_COMM_WORLD);


/* repeat the same test with a noncontiguous filetype */

    MPI_Type_vector(BUFSIZE, 1, 2, MPI_INT, &newtype);
    MPI_Type_commit(&newtype);
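    /* newtype selects every other int: BUFSIZE blocks of one int with a
       stride of two ints, so the views set below make each access cover a
       strided, noncontiguous file region. */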

    MPI_Info_create(&info);
    /* I am setting these info values for testing purposes only. It is
       better to use the default values in practice. */
    MPI_Info_set(info, "ind_rd_buffer_size", "1209");
    MPI_Info_set(info, "ind_wr_buffer_size", "1107");

    if (!mynod) {
	MPI_File_delete(filename, MPI_INFO_NULL);
	MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_CREATE |
             MPI_MODE_RDWR, info, &fh);
	for (i=0; i<BUFSIZE; i++) writebuf[i] = 0;
	MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
	MPI_File_write(fh, writebuf, BUFSIZE, MPI_INT, &status);
	MPI_File_close(&fh);
#if VERBOSE
	fprintf(stderr, "\ntesting noncontiguous accesses\n");
#endif
    }
    MPI_Barrier(MPI_COMM_WORLD);

    for (i=0; i<BUFSIZE; i++) writebuf[i] = 10;
    for (i=0; i<BUFSIZE; i++) readbuf[i] = 20;

    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE |
             MPI_MODE_RDWR, info, &fh);
    MPI_File_set_atomicity(fh, 1);
    MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
    MPI_Barrier(MPI_COMM_WORLD);

    if (!mynod) MPI_File_write(fh, writebuf, BUFSIZE, MPI_INT, &status);
    else {
	err = MPI_File_read(fh, readbuf, BUFSIZE, MPI_INT, &status);
	if (err == MPI_SUCCESS) {
	    if (readbuf[0] == 0) {
		for (i=1; i<BUFSIZE; i++)
		    if (readbuf[i] != 0) {
			errs++;
			fprintf(stderr, "Process %d: readbuf[%d] is %d, should be 0\n", mynod, i, readbuf[i]);
			MPI_Abort(MPI_COMM_WORLD, 1);
		    }
	    }
	    else if (readbuf[0] == 10) {
		for (i=1; i<BUFSIZE; i++)
		    if (readbuf[i] != 10) {
			errs++;
			fprintf(stderr, "Process %d: readbuf[%d] is %d, should be 10\n", mynod, i, readbuf[i]);
			MPI_Abort(MPI_COMM_WORLD, 1);
		    }
	    }
	    else {
		errs++;
		fprintf(stderr, "Process %d: readbuf[0] is %d, should be either 0 or 10\n", mynod, readbuf[0]);
	    }
	}
    }

    MPI_File_close(&fh);

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
    if (mynod == 0) {
	if( toterrs > 0) {
	    fprintf( stderr, "Found %d errors\n", toterrs );
	}
	else {
	    fprintf( stdout, " No Errors\n" );
	}
    }
    MPI_Type_free(&newtype);
    MPI_Info_free(&info);
    free(writebuf);
    free(readbuf);
    free(filename);

    MPI_Finalize();
    return 0;
}
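
The second half of the test combines two MPI-IO features: atomic access mode via MPI_File_set_atomicity and a strided file view built from MPI_Type_vector. The stand-alone sketch below shows only that combination; the file name and element count are made up, and the return value of the atomicity call is checked the same way the test does, since not every file system supports atomic mode.

#include <mpi.h>
#include <stdio.h>

#define COUNT 16   /* made-up element count */

int main(int argc, char **argv)
{
    int rank, i, buf[COUNT];
    MPI_File fh;
    MPI_Datatype filetype;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    for (i = 0; i < COUNT; i++) buf[i] = rank;

    /* every 2nd int of the file belongs to the view: blocklen 1, stride 2 */
    MPI_Type_vector(COUNT, 1, 2, MPI_INT, &filetype);
    MPI_Type_commit(&filetype);

    MPI_File_open(MPI_COMM_WORLD, "view_sketch.dat",
                  MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
    if (MPI_File_set_atomicity(fh, 1) != MPI_SUCCESS && rank == 0)
        fprintf(stderr, "atomic mode not supported on this file system\n");
    MPI_File_set_view(fh, 0, MPI_INT, filetype, "native", MPI_INFO_NULL);

    /* all ranks deliberately write over the same strided region */
    MPI_File_write(fh, buf, COUNT, MPI_INT, &status);
    MPI_File_close(&fh);

    MPI_Type_free(&filetype);
    MPI_Finalize();
    return 0;
}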
Beispiel #19
0
int main(int argc, char **argv) {
    int           procid, nproc, i, j, my_nelem;
    int           pollint = 0;
    double        time;
    MPI_Win       llist_win;
    llist_ptr_t   head_ptr, tail_ptr;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);

    /* Process 0 creates the head node */
    if (procid == 0)
        head_ptr.disp = alloc_elem(procid, llist_win);

    /* Broadcast the head pointer to everyone */
    head_ptr.rank = 0;
    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
    tail_ptr = head_ptr;

    /* All processes append NUM_ELEMS elements to the list; rank 0 has already
     * appended an element. */
    if (procid == 0)
        i = 1;
    else
        i = 0;
    my_nelem = NUM_ELEMS/nproc;
    if (procid < NUM_ELEMS % nproc)
        my_nelem++;

    MPI_Barrier(MPI_COMM_WORLD);
    time = MPI_Wtime();

    for ( ; i < my_nelem; i++) {
        llist_ptr_t new_elem_ptr;
        int success = 0;

        /* Create a new list element and register it with the window */
        new_elem_ptr.rank = procid;
        new_elem_ptr.disp = alloc_elem(procid, llist_win);

        /* Append the new node to the list.  This might take multiple attempts if
           others have already appended and our tail pointer is stale. */
        do {
            int flag;

            /* The tail is at my left neighbor, append my element. */
            if (tail_ptr.rank == (procid + nproc-1) % nproc)
            {
                if (verbose)
                    printf("%d: Appending to <%d, %p>\n", procid, tail_ptr.rank, (void*) tail_ptr.disp);

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
#if USE_ACC
                MPI_Accumulate(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
                               MPI_BYTE, MPI_REPLACE, llist_win);
#else
                MPI_Put(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
                        MPI_BYTE, llist_win);
#endif
                MPI_Win_unlock(tail_ptr.rank, llist_win);

                success = 1;
                tail_ptr = new_elem_ptr;
            }

            /* Otherwise, chase the tail. */
            else
            {
                llist_ptr_t next_tail_ptr;

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
#if USE_ACC
                MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr,
                                    sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                                    (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
                                    sizeof(llist_ptr_t), MPI_BYTE, MPI_NO_OP, llist_win);
#else
                MPI_Get(&next_tail_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
                        sizeof(llist_ptr_t), MPI_BYTE, llist_win);
#endif
                MPI_Win_unlock(tail_ptr.rank, llist_win);

                if (next_tail_ptr.rank != nil.rank) {
                    if (verbose)
                        printf("%d: Chasing to <%d, %p>\n", procid, next_tail_ptr.rank, (void*) next_tail_ptr.disp);
                    tail_ptr = next_tail_ptr;
                    pollint = MAX(MIN_NPROBE, pollint/2);
                }
                else {
                    for (j = 0; j < pollint; j++)
                        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);

                    pollint = MIN(MAX_NPROBE, pollint*2);
                }
            }
        } while (!success);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    time = MPI_Wtime() - time;

    /* Traverse the list and verify that all processes inserted exactly the correct
       number of elements. */
    if (procid == 0) {
        int  errors    = 0;
        int *counts, count = 0;

        counts = (int*) malloc(sizeof(int) * nproc);
        assert(counts != NULL);

        for (i = 0; i < nproc; i++)
            counts[i] = 0;

        tail_ptr = head_ptr;

        MPI_Win_lock_all(0, llist_win);

        /* Walk the list and tally up the number of elements inserted by each rank */
        while (tail_ptr.disp != nil.disp) {
            llist_elem_t elem;

            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);

            MPI_Win_flush(tail_ptr.rank, llist_win);

            tail_ptr = elem.next;

            assert(elem.value >= 0 && elem.value < nproc);
            counts[elem.value]++;
            count++;

            if (verbose) {
                int last_elem = tail_ptr.disp == nil.disp;
                printf("%2d%s", elem.value, last_elem ? "" : " -> ");
                if (count % ELEM_PER_ROW == 0 && !last_elem)
                    printf("\n");
            }
        }

        MPI_Win_unlock_all(llist_win);

        if (verbose)
          printf("\n\n");

        /* Verify the counts we collected */
        for (i = 0; i < nproc; i++) {
            int expected;

            expected = NUM_ELEMS/nproc;
            if (i < NUM_ELEMS % nproc)
                expected++;

            if (counts[i] != expected) {
                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
                errors++;
            }
        }

        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
        free(counts);
    }

    if (print_perf) {
        double max_time;

        MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); /* slowest rank's wall time */

        if (procid == 0) {
            printf("Total time = %0.2f sec, elem/sec = %0.2f, sec/elem = %0.2f usec\n", max_time, NUM_ELEMS/max_time, max_time/NUM_ELEMS*1.0e6);
        }
    }

    MPI_Win_free(&llist_win);

    /* Free all the elements in the list */
    for ( ; my_elems_count > 0; my_elems_count--)
        MPI_Free_mem(my_elems[my_elems_count-1]);

    MPI_Finalize();
    return 0;
}
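
The list elements above live in a dynamically created window; alloc_elem, which is defined elsewhere in the test, is assumed to allocate an element, attach it to llist_win with MPI_Win_attach and return its absolute displacement. The stand-alone sketch below shows that publish-and-read pattern under the same assumption, with each rank exposing a single int instead of a list element.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, *elem;
    MPI_Aint disp;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /* each rank exposes one int through the dynamic window */
    MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &elem);
    *elem = rank;
    MPI_Win_attach(win, elem, sizeof(int));
    MPI_Get_address(elem, &disp);

    /* rank 0 publishes its displacement; the others read it remotely */
    MPI_Bcast(&disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
    if (rank != 0) {
        int val;
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
        MPI_Get(&val, 1, MPI_INT, 0, disp, 1, MPI_INT, win);
        MPI_Win_unlock(0, win);
        printf("rank %d read %d from rank 0\n", rank, val);
    }

    /* make sure all reads are done before rank 0 detaches its memory */
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Win_detach(win, elem);
    MPI_Free_mem(elem);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}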