int numProcsFails(MPI_Comm mcw){
    int rank, ret, numFailures = 0, flag;
    MPI_Group fGroup;
    MPI_Errhandler newEh;
    MPI_Comm dupComm;

    // Error handler
    MPI_Comm_create_errhandler(mpiErrorHandler, &newEh);

    MPI_Comm_rank(mcw, &rank);

    // Set error handler for communicator
    MPI_Comm_set_errhandler(mcw, newEh);

    // Target function
    if(MPI_SUCCESS != (ret = MPI_Comm_dup(mcw, &dupComm))) {
    //if(MPI_SUCCESS != (ret = MPI_Barrier(mcw))) { // MPI_Comm_dup or MPI_Barrier
       OMPI_Comm_failure_ack(mcw);
       OMPI_Comm_failure_get_acked(mcw, &fGroup);
       // Get the number of failures
       MPI_Group_size(fGroup, &numFailures);
    }// end of "MPI_Comm_dup failure"

    OMPI_Comm_agree(mcw, &flag);
    // Memory release
    if(numFailures > 0)
       MPI_Group_free(&fGroup);
    MPI_Errhandler_free(&newEh);

    return numFailures;
}//numProcsFails()
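
These ULFM fault-tolerance examples register a callback named mpiErrorHandler that the excerpt never shows. The MPI standard fixes the callback type (MPI_Comm_errhandler_function); the body below is a minimal assumed sketch, not the original handler:

#include <mpi.h>
#include <stdio.h>

// Minimal sketch of a communicator error handler. The signature is fixed by
// the MPI standard; the body is an assumption, for illustration only.
static void mpiErrorHandler(MPI_Comm *comm, int *errCode, ...)
{
    char errString[MPI_MAX_ERROR_STRING];
    int errStringLen = 0;
    MPI_Error_string(*errCode, errString, &errStringLen);
    fprintf(stderr, "MPI error reported: %s\n", errString);
    // Return rather than abort, so the caller can ack/agree/revoke and
    // repair the communicator.
}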
Example #2
int main(int argc, char *argv[])
{
    int color, key, ret;
    MPI_Errhandler new_eh;
    MPI_Comm comm_subset;

    /*
     * Startup MPI
     */
    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_mcw_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_mcw_size);

    if( mpi_mcw_size < 2 ) {
        printf("Error: Must use at least 2 processes for this test\n");
        MPI_Finalize();
        return -1;
    }

    /*
     * Create a new error handler for MPI_COMM_WORLD
     * This overrides the default MPI_ERRORS_ARE_FATAL so that ranks in this
     * communicator will not automatically abort if a failure occurs.
     */
    MPI_Comm_create_errhandler(mpi_error_handler, &new_eh);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, new_eh);
    signal(SIGUSR2,  signal_handler);

    iterative_solver(MPI_COMM_WORLD);

    MPI_Finalize();

    return 0;
}
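
Example #2 relies on file-scope declarations that the excerpt omits. A plausible assumed set, just to make the fragment self-contained:

#include <mpi.h>
#include <signal.h>

// Assumed globals and prototypes; the real definitions are not shown above.
static int mpi_mcw_rank = -1;
static int mpi_mcw_size = -1;
void mpi_error_handler(MPI_Comm *comm, int *errcode, ...); // an MPI_Comm_errhandler_function
void signal_handler(int sig);
void iterative_solver(MPI_Comm comm);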
Example #3
int MPI_Errhandler_create(MPI_Handler_function *function,
                          MPI_Errhandler *errhandler)
{
    /* This function is deprecated -- just turn around and call the real
       function */

    return MPI_Comm_create_errhandler(function, errhandler);
}
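
Migrating off the deprecated entry point is a pure rename; the argument list is unchanged. (MPI_Errhandler_create was deprecated in MPI-2.0 and removed in MPI-3.0.) With a hypothetical handler my_handler:

MPI_Errhandler eh;
/* deprecated, removed in MPI-3.0: MPI_Errhandler_create(my_handler, &eh); */
MPI_Comm_create_errhandler(my_handler, &eh);  /* current replacement */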
Example #4
int main(int argc, char** argv)
{
    Dune::MPIHelper::instance(argc, argv);

#if defined(HAVE_MPI) && HAVE_MPI
    MPI_Errhandler errhandler;
    MPI_Comm_create_errhandler(MPI_err_handler, &errhandler);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, errhandler);
#endif // HAVE_MPI

    return boost::unit_test::unit_test_main(&init_unit_test_func,
                                            argc, argv);
}
Example #5
void mpi_comm_create_errhandler_f(ompi_errhandler_fortran_handler_fn_t *function,
                                  MPI_Fint *errhandler, MPI_Fint *ierr)
{
    MPI_Errhandler c_errhandler;

    /* See the note in src/mpi/f77/prototypes_mpi.h about the use of
       (void*) for function pointers in this function */

    *ierr = OMPI_INT_2_FINT(
                 MPI_Comm_create_errhandler((MPI_Comm_errhandler_fn*)function,
                                             &c_errhandler));
    if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) {
        *errhandler = MPI_Errhandler_c2f(c_errhandler);
    }
}
Example #6
int main (int argc, char **argv)
{
    MPI_Errhandler errh;
    int wrank;
    MPI_Init (&argc, &argv);
    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
    MPI_Comm_create_errhandler((MPI_Comm_errhandler_function*)errf, &errh);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, errh);
    MPI_Comm_set_errhandler(MPI_COMM_SELF, errh);
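    /* Freeing immediately after attaching is safe: MPI_Errhandler_free only
       marks the handler for deallocation, and each communicator keeps its
       own reference until the communicator itself is freed. */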
    MPI_Errhandler_free(&errh);
    MPI_Finalize();
    /* Test harness requirement is that only one process write No Errors */
    if (wrank == 0) 
      printf(" No Errors\n");
    return 0;
}
Example #7
int main(int argc, char *argv[]) {
    int rank, size, i;
    int sum = 0, val = 1;
    int errs = 0;
    MPI_Errhandler errhandler;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size < 4) {
        fprintf(stderr, "Must run with at least 4 processes.\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_dup(MPI_COMM_WORLD, &comm_all);

    MPI_Comm_create_errhandler(&error_handler, &errhandler);
    MPI_Comm_set_errhandler(comm_all, errhandler);

    for (i = 0; i < 10; ++i) {
        MPI_Comm_size(comm_all, &size);
        sum = 0;
        if (i == 5 && rank == 1) {
            exit(1);
        } else if (i != 5) {
            MPI_Allreduce(&val, &sum, 1, MPI_INT, MPI_SUM, comm_all);
            if (sum != size && rank == 0) {
                errs++;
                fprintf(stderr, "Incorrect answer: %d != %d\n", sum, size);
            }
        }
    }

    if (0 == rank && errs) {
        fprintf(stdout, " Found %d errors\n", errs);
    } else if (0 == rank) {
        fprintf(stdout, " No errors\n");
    }

    MPI_Comm_free(&comm_all);
    MPI_Errhandler_free(&errhandler);

    MPI_Finalize();

    return 0;
}
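
Example #7 references a file-scope communicator and handler that the excerpt omits; minimal assumed declarations:

// Assumed declarations; in the original test the handler is presumably
// expected to repair or shrink comm_all after rank 1 exits at iteration 5.
static MPI_Comm comm_all = MPI_COMM_NULL;
void error_handler(MPI_Comm *comm, int *errcode, ...);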
Example #8
int sci_mpi_init(char *fname, void* pvApiCtx)
{
    int flag;

    CheckInputArgument(pvApiCtx, 0, 0);
    CheckOutputArgument(pvApiCtx, 1, 1);
    mpi_init_internal();
    MPI_Initialized(&flag);
    if (!flag)
    {
        /* MPI Not yet initialized */
        MPI_Init(NULL, NULL);
        MPI_Comm_create_errhandler(MPIErrHandler, &errhdl);
    }
    AssignOutputVariable(pvApiCtx, 1) = 0;
    ReturnArguments(pvApiCtx);
    return 0;
}
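
Example #8 likewise uses a handler and a global handle declared elsewhere in the Scilab gateway; assumed declarations:

// Assumed declarations; errhdl must be file-scope so the handle outlives
// this gateway call.
static MPI_Errhandler errhdl;
void MPIErrHandler(MPI_Comm *comm, int *errcode, ...);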
Example #9
LibMeshInit::LibMeshInit (int argc, const char* const* argv,
                          MPI_Comm COMM_WORLD_IN)
#endif // closes a signature-selection #if not shown in this excerpt
{
  // should _not_ be initialized already.
  libmesh_assert (!libMesh::initialized());

  // Build a command-line parser.
  command_line.reset (new GetPot (argc, argv));

  // Disable performance logging upon request
  {
    if (libMesh::on_command_line ("--disable-perflog"))
      libMesh::perflog.disable_logging();
  }

  // Build a task scheduler
  {
    // Get the requested number of threads, defaults to 1 to avoid MPI and
    // multithreading competition.  If you would like to use MPI and multithreading
    // at the same time then (n_mpi_processes_per_node)x(n_threads) should be the
    //  number of processing cores per node.
    std::vector<std::string> n_threads(2);
    n_threads[0] = "--n_threads";
    n_threads[1] = "--n-threads";
    libMesh::libMeshPrivateData::_n_threads =
      libMesh::command_line_value (n_threads, 1);

    // Set the number of OpenMP threads to the same as the number of threads libMesh is going to use
#ifdef LIBMESH_HAVE_OPENMP
    omp_set_num_threads(libMesh::libMeshPrivateData::_n_threads);
#endif

    task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads()));
  }

  // Construct singletons who may be at risk of the
  // "static initialization order fiasco"
  Singleton::setup();

  // Make sure the construction worked
  libmesh_assert(remote_elem);

#if defined(LIBMESH_HAVE_MPI)

  // Allow the user to bypass MPI initialization
  if (!libMesh::on_command_line ("--disable-mpi"))
    {
      // Check whether the calling program has already initialized
      // MPI, and avoid duplicate Init/Finalize
      int flag;
      MPI_Initialized (&flag);

      if (!flag)
        {
#if MPI_VERSION > 1
          int mpi_thread_provided;
          const int mpi_thread_requested = libMesh::n_threads() > 1 ?
                                           MPI_THREAD_FUNNELED :
                                           MPI_THREAD_SINGLE;

          MPI_Init_thread (&argc, const_cast<char***>(&argv),
                           mpi_thread_requested, &mpi_thread_provided);

          if ((libMesh::n_threads() > 1) &&
              (mpi_thread_provided < MPI_THREAD_FUNNELED))
            {
              libmesh_warning("Warning: MPI failed to guarantee MPI_THREAD_FUNNELED\n" << 
                              "for a threaded run.\n" <<
                              "Be sure your library is funneled-thread-safe..." <<
                               std::endl);

              // Ideally, if an MPI stack tells us it's unsafe for us
              // to use threads, we shouldn't use threads.
              // In practice, we've encountered one MPI stack (an
              // mvapich2 configuration) that returned
              // MPI_THREAD_SINGLE as a proper warning, two stacks
              // that handle MPI_THREAD_FUNNELED properly, and two
              // current stacks plus a couple old stacks that return
              // MPI_THREAD_SINGLE but support libMesh threaded runs
              // anyway.

              // libMesh::libMeshPrivateData::_n_threads = 1;
              // task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads()));
            }
#else
          if (libMesh::libMeshPrivateData::_n_threads > 1)
            {
              libmesh_warning("Warning: using MPI1 for threaded code.\n" <<
                              "Be sure your library is funneled-thread-safe..." <<
                              std::endl);
            }

          MPI_Init (&argc, const_cast<char***>(&argv));
#endif
          libmesh_initialized_mpi = true;
        }

      // Duplicate the input communicator for internal use
      // And get a Parallel::Communicator copy too, to use
      // as a default for that API
      this->_comm = COMM_WORLD_IN;

      libMesh::GLOBAL_COMM_WORLD = COMM_WORLD_IN;

#ifndef LIBMESH_DISABLE_COMMWORLD
      libMesh::COMM_WORLD = COMM_WORLD_IN;
      Parallel::Communicator_World = COMM_WORLD_IN;
#endif

      //MPI_Comm_set_name not supported in at least SGI MPT's MPI implementation
      //MPI_Comm_set_name (libMesh::COMM_WORLD, "libMesh::COMM_WORLD");

      libMeshPrivateData::_processor_id =
        libmesh_cast_int<processor_id_type>(this->comm().rank());
      libMeshPrivateData::_n_processors =
        libmesh_cast_int<processor_id_type>(this->comm().size());

      // Set up an MPI error handler if requested.  This helps us get
      // into a debugger with a proper stack when an MPI error occurs.
      if (libMesh::on_command_line ("--handle-mpi-errors"))
        {
#if MPI_VERSION > 1
          MPI_Comm_create_errhandler(libMesh_MPI_Handler, &libmesh_errhandler);
          MPI_Comm_set_errhandler(libMesh::GLOBAL_COMM_WORLD, libmesh_errhandler);
          MPI_Comm_set_errhandler(MPI_COMM_WORLD, libmesh_errhandler);
#else
          MPI_Errhandler_create(libMesh_MPI_Handler, &libmesh_errhandler);
          MPI_Errhandler_set(libMesh::GLOBAL_COMM_WORLD, libmesh_errhandler);
          MPI_Errhandler_set(MPI_COMM_WORLD, libmesh_errhandler);
#endif // #if MPI_VERSION > 1
        }
    }

  // Could we have gotten bad values from the above calls?
  libmesh_assert_greater (libMeshPrivateData::_n_processors, 0);

  // The libmesh_cast_int already tested _processor_id>=0
  // libmesh_assert_greater_equal (libMeshPrivateData::_processor_id, 0);

  // Let's be sure we properly initialize on every processor at once:
  libmesh_parallel_only(this->comm());

#endif

#if defined(LIBMESH_HAVE_PETSC)

  // Allow the user to bypass PETSc initialization
  if (!libMesh::on_command_line ("--disable-petsc")

#if defined(LIBMESH_HAVE_MPI)
      // If the user bypassed MPI, we'd better be safe and assume that
      // PETSc was built to require it; otherwise PETSc initialization
      // dies.
      && !libMesh::on_command_line ("--disable-mpi")
#endif
      )
    {
      int ierr=0;

      PETSC_COMM_WORLD = libMesh::GLOBAL_COMM_WORLD;

      // Check whether the calling program has already initialized
      // PETSc, and avoid duplicate Initialize/Finalize
      PetscBool petsc_already_initialized;
      ierr = PetscInitialized(&petsc_already_initialized);
      CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
      if (petsc_already_initialized != PETSC_TRUE)
        libmesh_initialized_petsc = true;
# if defined(LIBMESH_HAVE_SLEPC)

      // If SLEPc allows us to check whether the calling program
      // has already initialized it, we do that, and avoid
      // duplicate Initialize/Finalize.
      // We assume that SLEPc will handle PETSc appropriately,
      // which it does in the versions we've checked.
#  if !SLEPC_VERSION_LESS_THAN(2,3,3)
      if (!SlepcInitializeCalled)
#  endif
        {
          ierr = SlepcInitialize  (&argc, const_cast<char***>(&argv), NULL, NULL);
          CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
          libmesh_initialized_slepc = true;
        }
# else
      if (libmesh_initialized_petsc)
        {
          ierr = PetscInitialize (&argc, const_cast<char***>(&argv), NULL, NULL);
          CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
        }
# endif
    }
#endif

  // Re-parse the command-line arguments.  Note that PETSc and MPI
  // initialization above may have removed command line arguments
  // that are not relevant to this application in the above calls.
  // We don't want a false-positive by detecting those arguments.
  command_line->parse_command_line (argc, argv);

  // The following line is an optimization when simultaneous
  // C and C++ style access to output streams is not required.
  // The amount of benefit which occurs is probably implementation
  // defined, and may be nothing.  On the other hand, I have seen
  // some IO tests where IO performance improves by a factor of two.
  if (!libMesh::on_command_line ("--sync-with-stdio"))
    std::ios::sync_with_stdio(false);

  // Honor the --separate-libmeshout command-line option.
  // When this is specified, the library uses an independent ostream
  // for libMesh::out/libMesh::err messages, and
  // std::cout and std::cerr are untouched by any other options
  if (libMesh::on_command_line ("--separate-libmeshout"))
    {
      // Redirect.  We'll share streambufs with cout/cerr for now, but
      // presumably anyone using this option will want to replace the
      // bufs later.
      std::ostream* newout = new std::ostream(std::cout.rdbuf());
      libMesh::out = *newout;
      std::ostream* newerr = new std::ostream(std::cerr.rdbuf());
      libMesh::err = *newerr;
    }

  // Honor the --redirect-stdout command-line option.
  // When this is specified each processor sends
  // libMesh::out/libMesh::err messages to
  // stdout.processor.####
  if (libMesh::on_command_line ("--redirect-stdout"))
    {
      std::ostringstream filename;
      filename << "stdout.processor." << libMesh::global_processor_id();
      _ofstream.reset (new std::ofstream (filename.str().c_str()));
      // Redirect, saving the original streambufs!
      out_buf = libMesh::out.rdbuf (_ofstream->rdbuf());
      err_buf = libMesh::err.rdbuf (_ofstream->rdbuf());
    }

  // redirect libMesh::out to nothing on all
  // other processors unless explicitly told
  // not to via the --keep-cout command-line argument.
  if (libMesh::global_processor_id() != 0)
    if (!libMesh::on_command_line ("--keep-cout"))
      libMesh::out.rdbuf (NULL);

  // Check command line to override printing
  // of reference count information.
  if(libMesh::on_command_line("--disable-refcount-printing") )
    ReferenceCounter::disable_print_counter_info();

#ifdef LIBMESH_ENABLE_EXCEPTIONS
  // Set our terminate handler to write stack traces in the event of a
  // crash
  old_terminate_handler = std::set_terminate(libmesh_terminate_handler);
#endif


  if (libMesh::on_command_line("--enable-fpe"))
    libMesh::enableFPE(true);

  // The library is now ready for use
  libMeshPrivateData::_is_initialized = true;


  // Make sure these work.  Library methods
  // depend on these being implemented properly,
  // so this is a good time to test them!
  libmesh_assert (libMesh::initialized());
  libmesh_assert (!libMesh::closed());
}
Example #10
mpi_error_handler::mpi_error_handler()
{
   MPI_Comm_create_errhandler( lmcfd_mpi_error_handler_fn, &handler_handle_ );
   MPI_Comm_set_errhandler(MPI_COMM_WORLD, handler_handle_);
}
Example #11
PetscErrorCode  PetscOptionsCheckInitial_Private(void)
{
  char              string[64],mname[PETSC_MAX_PATH_LEN],*f;
  MPI_Comm          comm = PETSC_COMM_WORLD;
  PetscBool         flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag;
  PetscErrorCode    ierr;
  PetscReal         si;
  PetscInt          intensity;
  int               i;
  PetscMPIInt       rank;
  char              version[256];
#if !defined(PETSC_HAVE_THREADSAFETY)
  PetscReal         logthreshold;
#endif
#if defined(PETSC_USE_LOG)
  PetscViewerFormat format;
  PetscBool         flg4 = PETSC_FALSE;
#endif
  
  PetscFunctionBegin;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);

#if !defined(PETSC_HAVE_THREADSAFETY)
  /*
      Setup the memory management; support for tracing malloc() usage
  */
  ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr);
  logthreshold = 0.0;
  ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr);
  if (flg1) flg3 = PETSC_TRUE;
#if defined(PETSC_USE_DEBUG)
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr);
  if ((!flg2 || flg1) && !petscsetmallocvisited) {
    if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) {
      /* turn off default -malloc if valgrind is being used */
      ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);}
#endif
  if (flg3) {
    ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  if (flg1 && !PETSC_RUNNING_ON_VALGRIND) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocSetDumpLog();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
#endif

  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {
    flg1 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr);
  }
  if (flg1) {
    ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_USE_LOG)
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr);
#endif

  /*
      Set the display variable for graphics
  */
  ierr = PetscSetDisplay();CHKERRQ(ierr);

  /*
      Print the PETSc version information
  */
  ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {

    /*
       Print "higher-level" package version message
    */
    if (PetscExternalVersionFunction) {
      ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr);
    }

    ierr = PetscGetVersion(version,256);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. \n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
  }

  /*
       Print "higher-level" package help message
  */
  if (flg3) {
    if (PetscExternalHelpFunction) {
      ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr);
    }
  }

  /*
      Setup the error handling
  */
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);}
  ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr);
  if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);}

  /*
      Setup debugger information
  */
  ierr = PetscSetDefaultDebugger();CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) {
    MPI_Errhandler err_handler;

    ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
    ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr);
    ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr);
  }
  ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); }
  ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr);
  if (flg1 || flg2) {
    PetscMPIInt    size;
    PetscInt       lsize,*nodes;
    MPI_Errhandler err_handler;
    /*
       we have to make sure that all processors have opened
       connections to all other processors, otherwise once the
       debugger has started it is likely to receive a SIGUSR1
       and kill the program.
    */
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size > 2) {
      PetscMPIInt dummy = 0;
      MPI_Status  status;
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr);
        }
      }
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr);
        }
      }
    }
    /* check if this processor node should be in debugger */
    ierr  = PetscMalloc1(size,&nodes);CHKERRQ(ierr);
    lsize = size;
    ierr  = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr);
    if (flag) {
      for (i=0; i<lsize; i++) {
        if (nodes[i] == rank) { flag = PETSC_FALSE; break; }
      }
    }
    if (!flag) {
      ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
      ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
      if (flg1) {
        ierr = PetscAttachDebugger();CHKERRQ(ierr);
      } else {
        ierr = PetscStopForDebugger();CHKERRQ(ierr);
      }
      ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr);
      ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    }
    ierr = PetscFree(nodes);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr);
  if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);}

  /*
        Setup profiling and logging
  */
#if defined(PETSC_USE_INFO)
  {
    char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0;
    ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr);
    if (flg1 && logname[0]) {
      ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr);
    } else if (flg1) {
      ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr);
    }
  }
#endif
#if defined(PETSC_USE_LOG)
  mname[0] = 0;
  ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0]) {
      ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr);
    } else {
      ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_MPE)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr);
  if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);}
#endif
  flg1 = PETSC_FALSE;
  flg3 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (flg1)                      { ierr = PetscLogAllBegin();CHKERRQ(ierr); }
  else if (flg3)                 { ierr = PetscLogDefaultBegin();CHKERRQ(ierr);}

  ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr);
  if (flg1) {
    char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN];
    FILE *file;
    if (mname[0]) {
      sprintf(name,"%s.%d",mname,rank);
      ierr = PetscFixFilename(name,fname);CHKERRQ(ierr);
      file = fopen(fname,"w");
      if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname);
    } else file = PETSC_STDOUT;
    ierr = PetscLogTraceBegin(file);CHKERRQ(ierr);
  }

  ierr   = PetscOptionsGetViewer(PETSC_COMM_WORLD,NULL,"-log_view",NULL,&format,&flg4);CHKERRQ(ierr);
  if (flg4) {
    if (format == PETSC_VIEWER_ASCII_XML){
      ierr = PetscLogNestedBegin();CHKERRQ(ierr);
    } else {
      ierr = PetscLogDefaultBegin();CHKERRQ(ierr);
    }
  }
#endif

  ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr);
  if (flg1) {
    struct cudaDeviceProp prop;
    int                   devCount;
    int                   device;
    cudaError_t           err = cudaSuccess;

    err = cudaGetDeviceCount(&devCount);
    if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));
    for (device = 0; device < devCount; ++device) {
      err = cudaGetDeviceProperties(&prop, device);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err));
      ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr);
    }
  }
  {
    int size;
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size>1) {
      int         devCount, device, rank;
      cudaError_t err = cudaSuccess;

      /* check to see if we force multiple ranks to hit the same GPU */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      } else {
        /* we're not using the same GPU on multiple MPI processes, so try to
           allocate different GPUs to different processes */

        /* First get the device count */
        err   = cudaGetDeviceCount(&devCount);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));

        /* next determine the rank and then set the device via a mod */
        ierr   = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
        device = rank % devCount;
        err    = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess
       multiple devices may try to set the flags on the same device. So long as one of them succeeds, things
       are ok. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    } else {
      int         device;
      cudaError_t err = cudaSuccess;

      /* the code below works for serial GPU simulations */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... here, we error check. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    }
  }
#endif


  /*
       Print basic help message
  */
  ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       only when run in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start the debugger in new xterm\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       unless noxterm is given\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start all processes in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"    emacs jumps to error file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"                      waits the delay for you to attach\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"           note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
    ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPE)
    ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr);
  }

#if defined(PETSC_HAVE_POPEN)
  {
  char machine[128];
  ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr);
  }
  }
#endif

  ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSleep(si);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr);
    if (f) {
      ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr);
    }
  }

#if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (!flg3) {
  ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr);
  }
#endif
#if defined(PETSC_HAVE_CUSP)
  ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscCUSPSynchronize = flg3;
#elif defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscViennaCLSynchronize = flg3;
#endif
  PetscFunctionReturn(0);
}
Example #12
MPI_Comm communicatorReconstruct(MPI_Comm myCommWorld, int childFlag, int * listFails, int * numFails,
                                 int * numNodeFails, int sumPrevNumNodeFails, int argc, char ** argv, int verbosity) {
    int i, ret, rank, nprocs, oldRank = 0, totFails = 0, * failedList, flag;
    int iterCounter = 0, failure = 0, recvVal[2], length;
    MPI_Status mpiStatus;
    MPI_Comm parent, mcw;
    MPI_Comm dupComm, tempIntracomm, unorderIntracomm;
    MPI_Errhandler newEh;
    double startTime = 0.0, endTime;
    char hostName[MPI_MAX_PROCESSOR_NAME];

    // Error handler
    MPI_Comm_create_errhandler(mpiErrorHandler, &newEh);

    MPI_Comm_get_parent(&parent);

    MPI_Comm_rank(myCommWorld, &rank);
    if(MPI_COMM_NULL == parent && childFlag == 0 && rank == 0)
        startTime = MPI_Wtime();

    do {
        failure = 0;
        ret = MPI_SUCCESS;
        /*
        if(childFlag == 0 && MPI_COMM_NULL != parent){
           parent = MPI_COMM_NULL;
        }
        */
        // Parent part
        if(MPI_COMM_NULL == parent) {
            if(iterCounter == 0)
                mcw = myCommWorld;
            // Set error handler for communicator
            MPI_Comm_set_errhandler(mcw, newEh);

            // World information
            MPI_Comm_rank(mcw, &rank);
            MPI_Comm_size(mcw, &nprocs);
            // Synchronize; sometimes hangs without this
#ifdef HANG_ON_REMOVE
            //MPI_Barrier(mcw);
            OMPI_Comm_agree(mcw, &flag); // since MPI_Barrier sometimes hangs
#endif

            // Target function
            //if(MPI_SUCCESS != (ret = MPI_Barrier(mcw))){
            if(MPI_SUCCESS != (ret = MPI_Comm_dup(mcw, &dupComm))) {
                if(verbosity > 0 && rank == 0)
                    printf("[????? Process %d (nprocs %d)] MPI_Comm_dup (parent): "
                           "Unsuccessful (due to process failure) OK\n", rank, nprocs);

                // Revoke the communicator
                if(MPI_SUCCESS != (OMPI_Comm_revoke(mcw))) {
                    if(rank == 0)
                        printf("[Process %d (nprocs %d)] Iteration %d: OMPI_Comm_revoke "
                               "(parent): Error!\n", rank, nprocs,  iterCounter);
                }
                else {
                    if(verbosity > 1 && rank == 0)
                        printf("[Process %d (nprocs %d)] Iteration %d: OMPI_Comm_revoke "
                               "(parent): SUCCESS\n", rank, nprocs, iterCounter);
                }

                // Call repair with the split world
                totFails = numProcsFails(mcw);
                failedList = (int *) malloc(totFails*sizeof(int));
                repairComm(&mcw, &tempIntracomm, iterCounter, failedList, numFails, numNodeFails,
                           sumPrevNumNodeFails, argc, argv, verbosity);

                // Assign list of failed processes
                #pragma omp parallel for default(shared)
                for(i = 0; i < *numFails; i++)
                    listFails[i] = failedList[i];

                // Free memory
                free(failedList);

                // Operation failed: retry
                failure = 1;
            } //end of "if MPI_Barrier/MPI_Comm_dup fails"
            else {
                if(verbosity > 0 && rank == 0)
                    printf("[..... Process %d (nprocs %d)] Iteration %d: MPI_Comm_dup "
                           "(parent): SUCCESS\n", rank, nprocs, iterCounter);

                // Operation success: breaking iteration
                failure = 0;
            }
        } // end of "parent"
        // Child part
        else {
            MPI_Comm_set_errhandler(parent, newEh);
            // Synchronize; sometimes hangs without this.
            // The placement here, and using the intercommunicator parent
            // (not the intracommunicator), is important.
#ifdef HANG_ON_REMOVE
            //MPI_Barrier(parent);
            OMPI_Comm_agree(parent, &flag); // since MPI_Barrier sometimes hangs
#endif

            MPI_Comm_rank(parent, &rank);
            MPI_Comm_size(parent, &nprocs);

            if(verbosity > 0 && rank == 0) {
                MPI_Get_processor_name(hostName, &length);
                printf("[Process %d, nprocs = %d] created on host %s (child)\n",
                       rank, nprocs, hostName);
            }

            if(MPI_SUCCESS != (MPI_Intercomm_merge(parent, true, &unorderIntracomm))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Intercomm_merge (child): Error!\n",
                           rank, iterCounter);
            }
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Intercomm_merge (child): SUCCESS\n",
                           rank, iterCounter);
            }
            // Receive failed ranks and number of fails from process 0 of parent
            if(MPI_SUCCESS != (MPI_Recv(&recvVal, 2, MPI_INT, 0, MERGE_TAG,
                                        unorderIntracomm, &mpiStatus))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Recv1 (child): Error!\n",
                           rank, iterCounter);
            }
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Recv1 (child): SUCCESS\n",
                           rank, iterCounter);
                oldRank = recvVal[0];
                *numFails = recvVal[1];
            }

            // Split the communicator to order the ranks.
            // No ordering is maintained here; the actual ordering is done on
            // the parent side. This split only supports the parent side.
            if(MPI_SUCCESS != (MPI_Comm_split(unorderIntracomm, 0, oldRank, &tempIntracomm))) {
                if(rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Comm_split (child): Error!\n",
                           rank, iterCounter);
            }
            else {
                if(verbosity > 1 && rank == 0)
                    printf("[Process %d] Iteration %d: MPI_Comm_split (child): SUCCESS\n",
                           rank, iterCounter);
            }

            // Operation on parent failed: retry
            ret = (!MPI_SUCCESS);
            failure = 1;

            // Free memory
            MPI_Comm_free(&unorderIntracomm);
            MPI_Comm_free(&parent);
        }// end of "child"

        // Reset comm world
        if(ret != MPI_SUCCESS)
            mcw = tempIntracomm;

        // Reset parent value for parent
        if(parent == MPI_COMM_NULL && ret != MPI_SUCCESS)
            parent = mcw;

        // Reset parent value of child and make the operation collective
        if(MPI_SUCCESS != ret && MPI_COMM_NULL != parent)
            parent = MPI_COMM_NULL;
        iterCounter++;
    } while(failure > 1); // replace 'failure > 1' with 'failure' for fault-tolerant recovery

    if(MPI_COMM_NULL == parent && childFlag == 0 && rank == 0) {
        endTime = MPI_Wtime();
        printf("[%d]----- Reconstructing failed communicator (including failed list creation) "
               "takes %0.6f Sec (MPI_Wtime) -----\n", rank, endTime - startTime);
    }

    // Memory release
    MPI_Errhandler_free(&newEh);

    return mcw;
}//communicatorReconstruct()
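
A call site for communicatorReconstruct might look as follows; the buffer size and verbosity level are assumptions, not part of the original code:

// Hypothetical usage sketch, inside main(argc, argv); 64 is an assumed bound.
int listFails[64], numFails = 0, numNodeFails = 0;
MPI_Comm world = communicatorReconstruct(MPI_COMM_WORLD, 0 /* childFlag */,
                                         listFails, &numFails, &numNodeFails,
                                         0 /* sumPrevNumNodeFails */,
                                         argc, argv, 1 /* verbosity */);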
Example #13
/**
 * @brief Sets up libcircle, calls work loop function
 *
 * - Main worker function. This function:
 *     -# Initializes MPI
 *     -# Initializes internal libcircle data structures
 *     -# Calls libcircle's main work loop function.
 *     -# Checkpoints if CIRCLE_abort has been called by a rank.
 */
int8_t CIRCLE_worker()
{
    int rank = -1;
    int size = -1;
    int i = -1;

    /* Holds all worker state */
    CIRCLE_state_st local_state;
    CIRCLE_state_st* sptr = &local_state;

    /* Holds all mpi state */
    CIRCLE_mpi_state_st mpi_s;
    local_state.mpi_state_st = &mpi_s;

    /* Provides an interface to the queue. */
    queue_handle.enqueue = &CIRCLE_enqueue;
    queue_handle.dequeue = &CIRCLE_dequeue;
    queue_handle.local_queue_size = &CIRCLE_local_queue_size;

    MPI_Comm_size(*CIRCLE_INPUT_ST.work_comm, &size);
    sptr->size = size;
    CIRCLE_init_local_state(sptr, size);
    MPI_Errhandler circle_err;
    MPI_Comm_create_errhandler(CIRCLE_MPI_error_handler, &circle_err);
    MPI_Comm_set_errhandler(*mpi_s.work_comm, circle_err);
    rank = CIRCLE_global_rank;
    local_state.rank = rank;
    local_state.token_partner_recv = (rank - 1 + size) % size;
    local_state.token_partner_send = (rank + 1 + size) % size;

    /* randomize the first task we will request work from */
    local_state.seed = (unsigned) rank;
    CIRCLE_get_next_proc(&local_state);

    /* Initial local state */
    local_objects_processed = 0;
    total_objects_processed = 0;

    /* Master rank starts out with the initial data creation */
    size_t array_elems = (size_t) size;
    uint32_t* total_objects_processed_array = (uint32_t*) calloc(array_elems, sizeof(uint32_t));
    uint32_t* total_work_requests_array = (uint32_t*) calloc(array_elems, sizeof(uint32_t));
    uint32_t* total_no_work_received_array = (uint32_t*) calloc(array_elems, sizeof(uint32_t));

    if(CIRCLE_INPUT_ST.options & CIRCLE_SPLIT_EQUAL) {
        LOG(CIRCLE_LOG_DBG, "Using equalized load splitting.");
    }

    if(CIRCLE_INPUT_ST.options & CIRCLE_SPLIT_RANDOM) {
        LOG(CIRCLE_LOG_DBG, "Using randomized load splitting.");
    }

    /* start the termination token on rank 0 */
    if(rank == 0) {
        local_state.have_token = 1;
    }

    /* start by adding work to queue by calling create_cb,
     * only invoke on master unless CREATE_GLOBAL is set */
    if(rank == 0 || CIRCLE_INPUT_ST.options & CIRCLE_CREATE_GLOBAL) {
        (*(CIRCLE_INPUT_ST.create_cb))(&queue_handle);
    }

    CIRCLE_work_loop(sptr, &queue_handle);
    CIRCLE_cleanup_mpi_messages(sptr);

    if(CIRCLE_ABORT_FLAG) {
        CIRCLE_checkpoint();
    }


    MPI_Gather(&local_objects_processed, 1, MPI_INT, \
               &total_objects_processed_array[0], 1, MPI_INT, 0, \
               *mpi_s.work_comm);
    MPI_Gather(&local_work_requested, 1, MPI_INT, \
               &total_work_requests_array[0], 1, MPI_INT, 0, \
               *mpi_s.work_comm);
    MPI_Gather(&local_no_work_received, 1, MPI_INT, \
               &total_no_work_received_array[0], 1, MPI_INT, 0, \
               *mpi_s.work_comm);
    MPI_Reduce(&local_objects_processed, &total_objects_processed, 1, \
               MPI_INT, MPI_SUM, 0, *mpi_s.work_comm);
    MPI_Reduce(&local_hop_bytes, &total_hop_bytes, 1, \
               MPI_INT, MPI_SUM, 0, *mpi_s.work_comm);

    if(rank == 0) {
        for(i = 0; i < size; i++) {
            LOG(CIRCLE_LOG_INFO, "Rank %d\tObjects Processed %d\t%0.3lf%%", i, \
                total_objects_processed_array[i], \
                (double)total_objects_processed_array[i] / \
                (double)total_objects_processed * 100.0);
            LOG(CIRCLE_LOG_INFO, "Rank %d\tWork requests: %d", i, total_work_requests_array[i]);
            LOG(CIRCLE_LOG_INFO, "Rank %d\tNo work replies: %d", i, total_no_work_received_array[i]);
        }

        LOG(CIRCLE_LOG_INFO, \
            "Total Objects Processed: %d", total_objects_processed);
        LOG(CIRCLE_LOG_INFO, \
            "Total hop-bytes: %"PRIu64, total_hop_bytes);
        LOG(CIRCLE_LOG_INFO, \
            "Hop-bytes per file: %f", (float)total_hop_bytes / (float)total_objects_processed);
    }

    /* free memory */
    CIRCLE_free(&total_no_work_received_array);
    CIRCLE_free(&total_work_requests_array);
    CIRCLE_free(&total_objects_processed_array);
    CIRCLE_finalize_local_state(sptr);

    return 0;
}
Example #14
LibMeshInit::LibMeshInit (int argc, const char * const * argv,
                          MPI_Comm COMM_WORLD_IN)
#endif // closes a signature-selection #if not shown in this excerpt
{
  // should _not_ be initialized already.
  libmesh_assert (!libMesh::initialized());

  // Build a command-line parser.
  command_line.reset (new GetPot (argc, argv));

  // Disable performance logging upon request
  {
    if (libMesh::on_command_line ("--disable-perflog"))
      libMesh::perflog.disable_logging();
  }

  // Build a task scheduler
  {
    // Get the requested number of threads, defaults to 1 to avoid MPI and
    // multithreading competition.  If you would like to use MPI and multithreading
    // at the same time then (n_mpi_processes_per_node)x(n_threads) should be the
    //  number of processing cores per node.
    std::vector<std::string> n_threads(2);
    n_threads[0] = "--n_threads";
    n_threads[1] = "--n-threads";
    libMesh::libMeshPrivateData::_n_threads =
      libMesh::command_line_value (n_threads, 1);

    // If there's no threading model active, force _n_threads==1
#if !LIBMESH_USING_THREADS
    if (libMesh::libMeshPrivateData::_n_threads != 1)
      {
        libMesh::libMeshPrivateData::_n_threads = 1;
        libmesh_warning("Warning: You requested --n-threads>1 but no threading model is active!\n"
                        << "Forcing --n-threads==1 instead!");
      }
#endif

    // Set the number of OpenMP threads to the same as the number of threads libMesh is going to use
#ifdef LIBMESH_HAVE_OPENMP
    omp_set_num_threads(libMesh::libMeshPrivateData::_n_threads);
#endif

    task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads()));
  }

  // Construct singletons who may be at risk of the
  // "static initialization order fiasco"
  Singleton::setup();

  // Make sure the construction worked
  libmesh_assert(remote_elem);

#if defined(LIBMESH_HAVE_MPI)

  // Allow the user to bypass MPI initialization
  if (!libMesh::on_command_line ("--disable-mpi"))
    {
      // Check whether the calling program has already initialized
      // MPI, and avoid duplicate Init/Finalize
      int flag;
      libmesh_call_mpi(MPI_Initialized (&flag));

      if (!flag)
        {
          int mpi_thread_provided;
          const int mpi_thread_requested = libMesh::n_threads() > 1 ?
            MPI_THREAD_FUNNELED :
            MPI_THREAD_SINGLE;

          libmesh_call_mpi
            (MPI_Init_thread (&argc, const_cast<char ***>(&argv),
                              mpi_thread_requested, &mpi_thread_provided));

          if ((libMesh::n_threads() > 1) &&
              (mpi_thread_provided < MPI_THREAD_FUNNELED))
            {
              libmesh_warning("Warning: MPI failed to guarantee MPI_THREAD_FUNNELED\n"
                              << "for a threaded run.\n"
                              << "Be sure your library is funneled-thread-safe..."
                              << std::endl);

              // Ideally, if an MPI stack tells us it's unsafe for us
              // to use threads, we shouldn't use threads.
              // In practice, we've encountered one MPI stack (an
              // mvapich2 configuration) that returned
              // MPI_THREAD_SINGLE as a proper warning, two stacks
              // that handle MPI_THREAD_FUNNELED properly, and two
              // current stacks plus a couple old stacks that return
              // MPI_THREAD_SINGLE but support libMesh threaded runs
              // anyway.

              // libMesh::libMeshPrivateData::_n_threads = 1;
              // task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads()));
            }
          libmesh_initialized_mpi = true;
        }

      // Duplicate the input communicator for internal use
      // And get a Parallel::Communicator copy too, to use
      // as a default for that API
      this->_comm = COMM_WORLD_IN;

      libMesh::GLOBAL_COMM_WORLD = COMM_WORLD_IN;

      //MPI_Comm_set_name not supported in at least SGI MPT's MPI implementation
      //MPI_Comm_set_name (libMesh::COMM_WORLD, "libMesh::COMM_WORLD");

      libMeshPrivateData::_processor_id =
        cast_int<processor_id_type>(this->comm().rank());
      libMeshPrivateData::_n_processors =
        cast_int<processor_id_type>(this->comm().size());

      // Set up an MPI error handler if requested.  This helps us get
      // into a debugger with a proper stack when an MPI error occurs.
      if (libMesh::on_command_line ("--handle-mpi-errors"))
        {
          libmesh_call_mpi
            (MPI_Comm_create_errhandler(libMesh_MPI_Handler, &libmesh_errhandler));
          libmesh_call_mpi
            (MPI_Comm_set_errhandler(libMesh::GLOBAL_COMM_WORLD, libmesh_errhandler));
          libmesh_call_mpi
            (MPI_Comm_set_errhandler(MPI_COMM_WORLD, libmesh_errhandler));
        }
    }

  // Could we have gotten bad values from the above calls?
  libmesh_assert_greater (libMeshPrivateData::_n_processors, 0);

  // The cast_int already tested _processor_id>=0
  // libmesh_assert_greater_equal (libMeshPrivateData::_processor_id, 0);

  // Let's be sure we properly initialize on every processor at once:
  libmesh_parallel_only(this->comm());

#endif

#if defined(LIBMESH_HAVE_PETSC)

  // Allow the user to bypass PETSc initialization
  if (!libMesh::on_command_line ("--disable-petsc")

#if defined(LIBMESH_HAVE_MPI)
      // If the user bypassed MPI, we'd better be safe and assume that
      // PETSc was built to require it; otherwise PETSc initialization
      // dies.
      && !libMesh::on_command_line ("--disable-mpi")
#endif
      )
    {
      int ierr=0;

      PETSC_COMM_WORLD = libMesh::GLOBAL_COMM_WORLD;

      // Check whether the calling program has already initialized
      // PETSc, and avoid duplicate Initialize/Finalize
      PetscBool petsc_already_initialized;
      ierr = PetscInitialized(&petsc_already_initialized);
      CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
      if (petsc_already_initialized != PETSC_TRUE)
        libmesh_initialized_petsc = true;
# if defined(LIBMESH_HAVE_SLEPC)

      // If SLEPc allows us to check whether the calling program
      // has already initialized it, we do that, and avoid
      // duplicate Initialize/Finalize.
      // We assume that SLEPc will handle PETSc appropriately,
      // which it does in the versions we've checked.
      if (!SlepcInitializeCalled)
        {
          ierr = SlepcInitialize  (&argc, const_cast<char ***>(&argv), nullptr, nullptr);
          CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
          libmesh_initialized_slepc = true;
        }
# else
      if (libmesh_initialized_petsc)
        {
          ierr = PetscInitialize (&argc, const_cast<char ***>(&argv), nullptr, nullptr);
          CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
        }
# endif
#if !PETSC_RELEASE_LESS_THAN(3,3,0)
      // Register the reference implementation of DMlibMesh
#if PETSC_RELEASE_LESS_THAN(3,4,0)
      ierr = DMRegister(DMLIBMESH, PETSC_NULL, "DMCreate_libMesh", DMCreate_libMesh); CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
#else
      ierr = DMRegister(DMLIBMESH, DMCreate_libMesh); CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr);
#endif

#endif
    }
#endif

#if defined(LIBMESH_HAVE_MPI) && defined(LIBMESH_HAVE_VTK)
  // Do MPI initialization for VTK.
  _vtk_mpi_controller = vtkMPIController::New();
  _vtk_mpi_controller->Initialize(&argc, const_cast<char ***>(&argv), /*initialized_externally=*/1);
  _vtk_mpi_controller->SetGlobalController(_vtk_mpi_controller);
#endif

  // Re-parse the command-line arguments.  Note that PETSc and MPI
  // initialization above may have removed command line arguments
  // that are not relevant to this application in the above calls.
  // We don't want a false-positive by detecting those arguments.
  //
  // Note: this seems overly paranoid/like it should be unnecessary,
  // plus we were doing it wrong for many years and not clearing the
  // existing GetPot object before re-parsing the command line, so all
  // the command line arguments appeared twice in the GetPot object...
  command_line.reset (new GetPot (argc, argv));

  // The following line is an optimization when simultaneous
  // C and C++ style access to output streams is not required.
  // The amount of benefit which occurs is probably implementation
  // defined, and may be nothing.  On the other hand, I have seen
  // some IO tests where IO performance improves by a factor of two.
  if (!libMesh::on_command_line ("--sync-with-stdio"))
    std::ios::sync_with_stdio(false);

  // Honor the --separate-libmeshout command-line option.
  // When this is specified, the library uses an independent ostream
  // for libMesh::out/libMesh::err messages, and
  // std::cout and std::cerr are untouched by any other options
  if (libMesh::on_command_line ("--separate-libmeshout"))
    {
      // Redirect.  We'll share streambufs with cout/cerr for now, but
      // presumably anyone using this option will want to replace the
      // bufs later.
      std::ostream * newout = new std::ostream(std::cout.rdbuf());
      libMesh::out = *newout;
      std::ostream * newerr = new std::ostream(std::cerr.rdbuf());
      libMesh::err = *newerr;
    }

  // Process command line arguments for redirecting stdout/stderr.
  bool
    cmdline_has_redirect_stdout = libMesh::on_command_line ("--redirect-stdout"),
    cmdline_has_redirect_output = libMesh::on_command_line ("--redirect-output");

  // The --redirect-stdout command-line option has been deprecated in
  // favor of "--redirect-output basename".
  if (cmdline_has_redirect_stdout)
    libmesh_warning("The --redirect-stdout command line option has been deprecated. "
                    "Use '--redirect-output basename' instead.");

  // Honor the "--redirect-stdout" and "--redirect-output basename"
  // command-line options.  When one of these is specified, each
  // processor sends libMesh::out/libMesh::err messages to
  // stdout.processor.#### (default) or basename.processor.####.
  if (cmdline_has_redirect_stdout || cmdline_has_redirect_output)
    {
      std::string basename = "stdout";

      // Look for following argument if using new API
      if (cmdline_has_redirect_output)
        {
          // Set the cursor to the correct location in the list of command line arguments.
          command_line->search(1, "--redirect-output");

          // Get the next option on the command line as a string.
          std::string next_string = "";
          next_string = command_line->next(next_string);

          // If the next string starts with a dash, we assume it's
          // another flag and not a file basename requested by the
          // user.
          if (next_string.size() > 0 && next_string.find_first_of("-") != 0)
            basename = next_string;
        }

      std::ostringstream filename;
      filename << basename << ".processor." << libMesh::global_processor_id();
      _ofstream.reset (new std::ofstream (filename.str().c_str()));

      // Redirect, saving the original streambufs!
      out_buf = libMesh::out.rdbuf (_ofstream->rdbuf());
      err_buf = libMesh::err.rdbuf (_ofstream->rdbuf());
    }

  // redirect libMesh::out to nothing on all
  // other processors unless explicitly told
  // not to via the --keep-cout command-line argument.
  if (libMesh::global_processor_id() != 0)
    if (!libMesh::on_command_line ("--keep-cout"))
      libMesh::out.rdbuf (nullptr);

  // Similarly, the user can request to drop cerr on all non-0 ranks.
  // By default, errors are printed on all ranks, but this can lead to
  // interleaved/unpredictable outputs when doing parallel regression
  // testing, which this option is designed to support.
  if (libMesh::global_processor_id() != 0)
    if (libMesh::on_command_line ("--drop-cerr"))
      libMesh::err.rdbuf (nullptr);

  // Check command line to override printing
  // of reference count information.
  if (libMesh::on_command_line("--disable-refcount-printing"))
    ReferenceCounter::disable_print_counter_info();

#ifdef LIBMESH_ENABLE_EXCEPTIONS
  // Set our terminate handler to write stack traces in the event of a
  // crash
  old_terminate_handler = std::set_terminate(libmesh_terminate_handler);
#endif


  if (libMesh::on_command_line("--enable-fpe"))
    libMesh::enableFPE(true);

  if (libMesh::on_command_line("--enable-segv"))
    libMesh::enableSEGV(true);

  // The library is now ready for use
  libMeshPrivateData::_is_initialized = true;


  // Make sure these work.  Library methods
  // depend on these being implemented properly,
  // so this is a good time to test them!
  libmesh_assert (libMesh::initialized());
  libmesh_assert (!libMesh::closed());
}