int gasneti_bootstrapInit_mpi(int *argc, char ***argv,
                              gasnet_node_t *nodes, gasnet_node_t *mynode) {
  MPI_Group world;
  int err;

  /* Call MPI_Init exactly once */
  err = MPI_Initialized(&gasnetc_mpi_preinitialized);
  if (MPI_SUCCESS != err) return GASNET_ERR_NOT_INIT;
  if (!gasnetc_mpi_preinitialized) {
    err = MPI_Init(argc, argv);
    if (MPI_SUCCESS != err) return GASNET_ERR_NOT_INIT;
  }

  /* Create private communicator */
  err = MPI_Comm_group(MPI_COMM_WORLD, &world);
  gasneti_assert(err == MPI_SUCCESS);
  err = MPI_Comm_create(MPI_COMM_WORLD, world, &gasnetc_mpi_comm);
  gasneti_assert(err == MPI_SUCCESS);
  err = MPI_Group_free(&world);
  gasneti_assert(err == MPI_SUCCESS);

  /* Get size and rank */
  err = MPI_Comm_size(gasnetc_mpi_comm, &gasnetc_mpi_size);
  gasneti_assert(err == MPI_SUCCESS);
  *nodes = gasnetc_mpi_size;
  if ((int)(*nodes) != gasnetc_mpi_size) *nodes = 0; /* Overflow! */

  err = MPI_Comm_rank(gasnetc_mpi_comm, &gasnetc_mpi_rank);
  gasneti_assert(err == MPI_SUCCESS);
  *mynode = gasnetc_mpi_rank;

  gasneti_setupGlobalEnvironment(*nodes, *mynode,
                                 &gasneti_bootstrapExchange_mpi,
                                 &gasneti_bootstrapBroadcast_mpi);
  return GASNET_OK;
}
std::ostream& pout()
{
#ifdef CH_MPI
  // the common case is s_pout_open == true, which just returns s_pout
  if ( ! s_pout_open )
  {
    // the uncommon case: the file isn't opened, MPI may not be
    // initialized, and the basename may not have been set
    int flag_i, flag_f;
    MPI_Initialized(&flag_i);
    MPI_Finalized(&flag_f);
    // app hasn't set a basename yet, so set the default
    if ( ! s_pout_init )
    {
      s_pout_basename = "pout" ;
      s_pout_init = true ;
    }
    // if MPI is not initialized, we can't open the file so return cout
    if ( ! flag_i || flag_f )
    {
      return std::cout; // MPI hasn't been started yet, or has ended....
    }
    // MPI is initialized, so the file must not be, so open it
    setFileName() ;
    openFile() ;
    // finally, in case the open failed, return cout
    if ( ! s_pout_open )
    {
      return std::cout ;
    }
  }
  return s_pout ;
#else
  return std::cout;
#endif
}
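A minimal usage sketch for the parallel stream above, assuming pout() is visible through its declaring header (the header name and the helper function below are illustrative): callers stream to pout() exactly as they would to std::cout, and each rank's output goes to its own per-rank file once MPI is running, falling back to std::cout otherwise.

#include <ostream>

std::ostream& pout();   // normally provided by the header that declares the function above

void report_residual(int step, double residual)
{
    // Safe to call before MPI_Init, between Init/Finalize, and after Finalize:
    // pout() returns std::cout whenever the per-rank file cannot be used.
    pout() << "step " << step << ": residual = " << residual << std::endl;
}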
/** Finalize DALEC.  Must be called before MPI is finalized.  DALEC calls are
  * not valid after finalization.  Collective on world group.
  *
  * @return Zero on success
  */
int PDALEC_Finalize(void)
{
    int dalec_alive = atomic_fetch_sub_explicit(&(DALECI_GLOBAL_STATE.alive),
                                                1, memory_order_seq_cst);
    if (dalec_alive == 1) {
        /* Check for MPI initialization */
        int mpi_is_init, mpi_is_fin;
        MPI_Initialized(&mpi_is_init);
        MPI_Finalized(&mpi_is_fin);

        /* Free communicator if possible and return */
        if (!mpi_is_init || mpi_is_fin) {
            DALECI_Warning("MPI must be active when calling DALEC_Finalize");
            return DALEC_ERROR_MPI_USAGE;
        } else {
            int rc = MPI_Comm_free(&DALECI_GLOBAL_STATE.mpi_comm);
            return DALECI_Check_MPI("DALEC_Finalize", "MPI_Comm_free", rc);
        }
    } else {
        /* Library is still active. */
        return DALEC_SUCCESS;
    }
}
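A hedged shutdown-ordering sketch for the contract stated above (DALEC must be finalized while MPI is still active). The dalec.h header name and the DALEC_Initialize counterpart are assumptions; only DALEC_Finalize's requirement is taken from the comment.

#include <mpi.h>

int DALEC_Initialize(void);   /* assumed counterpart, normally declared in the DALEC header */
int DALEC_Finalize(void);

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    DALEC_Initialize();
    /* ... application work using DALEC ... */
    /* Finalize DALEC first: it frees its duplicated communicator,
       which requires MPI to be initialized and not yet finalized. */
    DALEC_Finalize();
    MPI_Finalize();
    return 0;
}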
int MpiCommunicator::init( int minId, long thecomm_ ) { VT_FUNC_I( "MpiCommunicator::init" ); assert( sizeof(thecomm_) >= sizeof(MPI_Comm) ); MPI_Comm thecomm = (MPI_Comm)thecomm_; // turn wait mode on for intel mpi if possible // this should greatly improve performance for intel mpi PAL_SetEnvVar( "I_MPI_WAIT_MODE", "enable", 0); int flag; MPI_Initialized( &flag ); if ( ! flag ) { int p; //!! FIXME passing NULL ptr breaks mvapich1 mpi implementation MPI_Init_thread( 0, NULL, MPI_THREAD_MULTIPLE, &p ); if( p != MPI_THREAD_MULTIPLE ) { // can't use Speaker yet, need Channels to be inited std::cerr << "[CnC] Warning: not MPI_THREAD_MULTIPLE (" << MPI_THREAD_MULTIPLE << "), but " << p << std::endl; } } else if( thecomm == 0 ) { CNC_ABORT( "Process has already been initialized" ); } MPI_Comm myComm = MPI_COMM_WORLD; int rank; MPI_Comm parentComm; if( thecomm == 0 ) { MPI_Comm_get_parent( &parentComm ); } else { m_customComm = true; m_exit0CallOk = false; myComm = thecomm; } MPI_Comm_rank( myComm, &rank ); // father of all checks if he's requested to spawn processes: if ( rank == 0 && parentComm == MPI_COMM_NULL ) { // Ok, let's spawn the clients. // I need some information for the startup. // 1. Name of the executable (default is the current exe) const char * _tmp = getenv( "CNC_MPI_SPAWN" ); if ( _tmp ) { int nClientsToSpawn = atol( _tmp ); _tmp = getenv( "CNC_MPI_EXECUTABLE" ); std::string clientExe( _tmp ? _tmp : "" ); if( clientExe.empty() ) clientExe = PAL_GetProgname(); CNC_ASSERT( ! clientExe.empty() ); // 3. Special setting for MPI_Info: hosts const char * clientHost = getenv( "CNC_MPI_HOSTS" ); // Prepare MPI_Info object: MPI_Info clientInfo = MPI_INFO_NULL; if ( clientHost ) { MPI_Info_create( &clientInfo ); if ( clientHost ) { MPI_Info_set( clientInfo, const_cast< char * >( "host" ), const_cast< char * >( clientHost ) ); // can't use Speaker yet, need Channels to be inited std::cerr << "[CnC " << rank << "] Set MPI_Info_set( \"host\", \"" << clientHost << "\" )\n"; } } // Now spawn the client processes: // can't use Speaker yet, need Channels to be inited std::cerr << "[CnC " << rank << "] Spawning " << nClientsToSpawn << " MPI processes" << std::endl; int* errCodes = new int[nClientsToSpawn]; MPI_Comm interComm; int err = MPI_Comm_spawn( const_cast< char * >( clientExe.c_str() ), MPI_ARGV_NULL, nClientsToSpawn, clientInfo, 0, MPI_COMM_WORLD, &interComm, errCodes ); delete [] errCodes; if ( err ) { // can't use Speaker yet, need Channels to be inited std::cerr << "[CnC " << rank << "] Error in MPI_Comm_spawn. Skipping process spawning"; } else { MPI_Intercomm_merge( interComm, 0, &myComm ); } } // else { // No process spawning // MPI-1 situation: all clients to be started by mpiexec // myComm = MPI_COMM_WORLD; //} } if ( thecomm == 0 && parentComm != MPI_COMM_NULL ) { // I am a child. Build intra-comm to the parent. MPI_Intercomm_merge( parentComm, 1, &myComm ); } MPI_Comm_rank( myComm, &rank ); CNC_ASSERT( m_channel == NULL ); MpiChannelInterface* myChannel = new MpiChannelInterface( use_crc(), myComm ); m_channel = myChannel; int size; MPI_Comm_size( myComm, &size ); // Are we on the host or on the remote side? if ( rank == 0 ) { if( size <= 1 ) { Speaker oss( std::cerr ); oss << "Warning: no clients avabilable. Forgot to set CNC_MPI_SPAWN?"; } // ==> HOST startup: // This initializes the mpi environment in myChannel. 
MpiHostInitializer hostInitializer( *myChannel ); hostInitializer.init_mpi_comm( myComm ); } else { // ==> CLIENT startup: // This initializes the mpi environment in myChannel. MpiClientInitializer clientInitializer( *myChannel ); clientInitializer.init_mpi_comm( myComm ); } { Speaker oss( std::cerr ); oss << "MPI initialization complete (rank " << rank << ")."; } // MPI_Barrier( myComm ); // Now the mpi specific setup is finished. // Do the generic initialization stuff. GenericCommunicator::init( minId ); return 0; }
LibMeshInit::LibMeshInit (int argc, const char* const* argv, MPI_Comm COMM_WORLD_IN) #endif { // should _not_ be initialized already. libmesh_assert (!libMesh::initialized()); // Build a command-line parser. command_line.reset (new GetPot (argc, argv)); // Disable performance logging upon request { if (libMesh::on_command_line ("--disable-perflog")) libMesh::perflog.disable_logging(); } // Build a task scheduler { // Get the requested number of threads, defaults to 1 to avoid MPI and // multithreading competition. If you would like to use MPI and multithreading // at the same time then (n_mpi_processes_per_node)x(n_threads) should be the // number of processing cores per node. std::vector<std::string> n_threads(2); n_threads[0] = "--n_threads"; n_threads[1] = "--n-threads"; libMesh::libMeshPrivateData::_n_threads = libMesh::command_line_value (n_threads, 1); // Set the number of OpenMP threads to the same as the number of threads libMesh is going to use #ifdef LIBMESH_HAVE_OPENMP omp_set_num_threads(libMesh::libMeshPrivateData::_n_threads); #endif task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads())); } // Construct singletons who may be at risk of the // "static initialization order fiasco" Singleton::setup(); // Make sure the construction worked libmesh_assert(remote_elem); #if defined(LIBMESH_HAVE_MPI) // Allow the user to bypass MPI initialization if (!libMesh::on_command_line ("--disable-mpi")) { // Check whether the calling program has already initialized // MPI, and avoid duplicate Init/Finalize int flag; MPI_Initialized (&flag); if (!flag) { #if MPI_VERSION > 1 int mpi_thread_provided; const int mpi_thread_requested = libMesh::n_threads() > 1 ? MPI_THREAD_FUNNELED : MPI_THREAD_SINGLE; MPI_Init_thread (&argc, const_cast<char***>(&argv), mpi_thread_requested, &mpi_thread_provided); if ((libMesh::n_threads() > 1) && (mpi_thread_provided < MPI_THREAD_FUNNELED)) { libmesh_warning("Warning: MPI failed to guarantee MPI_THREAD_FUNNELED\n" << "for a threaded run.\n" << "Be sure your library is funneled-thread-safe..." << std::endl); // Ideally, if an MPI stack tells us it's unsafe for us // to use threads, we shouldn't use threads. // In practice, we've encountered one MPI stack (an // mvapich2 configuration) that returned // MPI_THREAD_SINGLE as a proper warning, two stacks // that handle MPI_THREAD_FUNNELED properly, and two // current stacks plus a couple old stacks that return // MPI_THREAD_SINGLE but support libMesh threaded runs // anyway. // libMesh::libMeshPrivateData::_n_threads = 1; // task_scheduler.reset (new Threads::task_scheduler_init(libMesh::n_threads())); } #else if (libMesh::libMeshPrivateData::_n_threads > 1) { libmesh_warning("Warning: using MPI1 for threaded code.\n" << "Be sure your library is funneled-thread-safe..." 
<< std::endl); } MPI_Init (&argc, const_cast<char***>(&argv)); #endif libmesh_initialized_mpi = true; } // Duplicate the input communicator for internal use // And get a Parallel::Communicator copy too, to use // as a default for that API this->_comm = COMM_WORLD_IN; libMesh::GLOBAL_COMM_WORLD = COMM_WORLD_IN; #ifndef LIBMESH_DISABLE_COMMWORLD libMesh::COMM_WORLD = COMM_WORLD_IN; Parallel::Communicator_World = COMM_WORLD_IN; #endif //MPI_Comm_set_name not supported in at least SGI MPT's MPI implementation //MPI_Comm_set_name (libMesh::COMM_WORLD, "libMesh::COMM_WORLD"); libMeshPrivateData::_processor_id = libmesh_cast_int<processor_id_type>(this->comm().rank()); libMeshPrivateData::_n_processors = libmesh_cast_int<processor_id_type>(this->comm().size()); // Set up an MPI error handler if requested. This helps us get // into a debugger with a proper stack when an MPI error occurs. if (libMesh::on_command_line ("--handle-mpi-errors")) { #if MPI_VERSION > 1 MPI_Comm_create_errhandler(libMesh_MPI_Handler, &libmesh_errhandler); MPI_Comm_set_errhandler(libMesh::GLOBAL_COMM_WORLD, libmesh_errhandler); MPI_Comm_set_errhandler(MPI_COMM_WORLD, libmesh_errhandler); #else MPI_Errhandler_create(libMesh_MPI_Handler, &libmesh_errhandler); MPI_Errhandler_set(libMesh::GLOBAL_COMM_WORLD, libmesh_errhandler); MPI_Errhandler_set(MPI_COMM_WORLD, libmesh_errhandler); #endif // #if MPI_VERSION > 1 } } // Could we have gotten bad values from the above calls? libmesh_assert_greater (libMeshPrivateData::_n_processors, 0); // The libmesh_cast_int already tested _processor_id>=0 // libmesh_assert_greater_equal (libMeshPrivateData::_processor_id, 0); // Let's be sure we properly initialize on every processor at once: libmesh_parallel_only(this->comm()); #endif #if defined(LIBMESH_HAVE_PETSC) // Allow the user to bypass PETSc initialization if (!libMesh::on_command_line ("--disable-petsc") #if defined(LIBMESH_HAVE_MPI) // If the user bypassed MPI, we'd better be safe and assume that // PETSc was built to require it; otherwise PETSc initialization // dies. && !libMesh::on_command_line ("--disable-mpi") #endif ) { int ierr=0; PETSC_COMM_WORLD = libMesh::GLOBAL_COMM_WORLD; // Check whether the calling program has already initialized // PETSc, and avoid duplicate Initialize/Finalize PetscBool petsc_already_initialized; ierr = PetscInitialized(&petsc_already_initialized); CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr); if (petsc_already_initialized != PETSC_TRUE) libmesh_initialized_petsc = true; # if defined(LIBMESH_HAVE_SLEPC) // If SLEPc allows us to check whether the calling program // has already initialized it, we do that, and avoid // duplicate Initialize/Finalize. // We assume that SLEPc will handle PETSc appropriately, // which it does in the versions we've checked. # if !SLEPC_VERSION_LESS_THAN(2,3,3) if (!SlepcInitializeCalled) # endif { ierr = SlepcInitialize (&argc, const_cast<char***>(&argv), NULL, NULL); CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr); libmesh_initialized_slepc = true; } # else if (libmesh_initialized_petsc) { ierr = PetscInitialize (&argc, const_cast<char***>(&argv), NULL, NULL); CHKERRABORT(libMesh::GLOBAL_COMM_WORLD,ierr); } # endif } #endif // Re-parse the command-line arguments. Note that PETSc and MPI // initialization above may have removed command line arguments // that are not relevant to this application in the above calls. // We don't want a false-positive by detecting those arguments. 
command_line->parse_command_line (argc, argv); // The following line is an optimization when simultaneous // C and C++ style access to output streams is not required. // The amount of benefit which occurs is probably implementation // defined, and may be nothing. On the other hand, I have seen // some IO tests where IO peformance improves by a factor of two. if (!libMesh::on_command_line ("--sync-with-stdio")) std::ios::sync_with_stdio(false); // Honor the --separate-libmeshout command-line option. // When this is specified, the library uses an independent ostream // for libMesh::out/libMesh::err messages, and // std::cout and std::cerr are untouched by any other options if (libMesh::on_command_line ("--separate-libmeshout")) { // Redirect. We'll share streambufs with cout/cerr for now, but // presumably anyone using this option will want to replace the // bufs later. std::ostream* newout = new std::ostream(std::cout.rdbuf()); libMesh::out = *newout; std::ostream* newerr = new std::ostream(std::cerr.rdbuf()); libMesh::err = *newerr; } // Honor the --redirect-stdout command-line option. // When this is specified each processor sends // libMesh::out/libMesh::err messages to // stdout.processor.#### if (libMesh::on_command_line ("--redirect-stdout")) { std::ostringstream filename; filename << "stdout.processor." << libMesh::global_processor_id(); _ofstream.reset (new std::ofstream (filename.str().c_str())); // Redirect, saving the original streambufs! out_buf = libMesh::out.rdbuf (_ofstream->rdbuf()); err_buf = libMesh::err.rdbuf (_ofstream->rdbuf()); } // redirect libMesh::out to nothing on all // other processors unless explicitly told // not to via the --keep-cout command-line argument. if (libMesh::global_processor_id() != 0) if (!libMesh::on_command_line ("--keep-cout")) libMesh::out.rdbuf (NULL); // Check command line to override printing // of reference count information. if(libMesh::on_command_line("--disable-refcount-printing") ) ReferenceCounter::disable_print_counter_info(); #ifdef LIBMESH_ENABLE_EXCEPTIONS // Set our terminate handler to write stack traces in the event of a // crash old_terminate_handler = std::set_terminate(libmesh_terminate_handler); #endif if (libMesh::on_command_line("--enable-fpe")) libMesh::enableFPE(true); // The library is now ready for use libMeshPrivateData::_is_initialized = true; // Make sure these work. Library methods // depend on these being implemented properly, // so this is a good time to test them! libmesh_assert (libMesh::initialized()); libmesh_assert (!libMesh::closed()); }
int main(int argc, char *argv[]) { json_object *main_obj = json_object_new_object(); json_object *parallel_obj = json_object_new_object(); json_object *problem_obj = 0; json_object *clargs_obj = 0; MACSIO_TIMING_GroupMask_t main_grp; MACSIO_TIMING_TimerId_t main_tid; int i, argi, exercise_scr = 0; int size = 1, rank = 0; /* quick pre-scan for scr cl flag */ for (i = 0; i < argc && !exercise_scr; i++) exercise_scr = !strcmp("exercise_scr", argv[i]); #warning SHOULD WE BE USING MPI-3 API #ifdef HAVE_MPI MPI_Init(&argc, &argv); #ifdef HAVE_SCR #warning SANITY CHECK WITH MIFFPP if (exercise_scr) SCR_Init(); #endif MPI_Comm_dup(MPI_COMM_WORLD, &MACSIO_MAIN_Comm); MPI_Errhandler_set(MACSIO_MAIN_Comm, MPI_ERRORS_RETURN); MPI_Comm_size(MACSIO_MAIN_Comm, &MACSIO_MAIN_Size); MPI_Comm_rank(MACSIO_MAIN_Comm, &MACSIO_MAIN_Rank); mpi_errno = MPI_SUCCESS; #endif errno = 0; main_grp = MACSIO_TIMING_GroupMask("MACSIO main()"); main_tid = MT_StartTimer("main", main_grp, MACSIO_TIMING_ITER_AUTO); MACSIO_LOG_StdErr = MACSIO_LOG_LogInit(MACSIO_MAIN_Comm, 0, 0, 0, 0); /* Process the command line and put the results in the problem */ clargs_obj = ProcessCommandLine(argc, argv, &argi); json_object_object_add(main_obj, "clargs", clargs_obj); strncpy(MACSIO_UTILS_UnitsPrefixSystem, JsonGetStr(clargs_obj, "units_prefix_system"), sizeof(MACSIO_UTILS_UnitsPrefixSystem)); MACSIO_LOG_MainLog = MACSIO_LOG_LogInit(MACSIO_MAIN_Comm, JsonGetStr(clargs_obj, "log_file_name"), JsonGetInt(clargs_obj, "log_line_length"), JsonGetInt(clargs_obj, "log_line_cnt/0"), JsonGetInt(clargs_obj, "log_line_cnt/1")); #warning THESE INITIALIZATIONS SHOULD BE IN MACSIO_LOG MACSIO_LOG_DebugLevel = JsonGetInt(clargs_obj, "debug_level"); /* Setup parallel information */ json_object_object_add(parallel_obj, "mpi_size", json_object_new_int(MACSIO_MAIN_Size)); json_object_object_add(parallel_obj, "mpi_rank", json_object_new_int(MACSIO_MAIN_Rank)); json_object_object_add(main_obj, "parallel", parallel_obj); #warning SHOULD WE INCLUDE TOP-LEVEL INFO ON VAR NAMES AND WHETHER THEYRE RESTRICTED #warning CREATE AN IO CONTEXT OBJECT /* Acquire an I/O context handle from the plugin */ /* Do a read or write test */ if (strcmp(JsonGetStr(clargs_obj, "read_path"),"null")) main_read(argi, argc, argv, main_obj); else main_write(argi, argc, argv, main_obj); /* stop total timer */ MT_StopTimer(main_tid); /* Write timings data file if requested */ if (strlen(JsonGetStr(clargs_obj, "timings_file_name"))) write_timings_file(JsonGetStr(clargs_obj, "timings_file_name")); MACSIO_TIMING_ClearTimers(MACSIO_TIMING_ALL_GROUPS); #warning ATEXIT THESE if (json_object_put(main_obj) != 1) { MACSIO_LOG_MSG(Info, ("Unable to free main JSON object")); } MACSIO_TIMING_GroupMask(0); MACSIO_TIMING_ReduceTimers(MACSIO_MAIN_Comm, -1); json_object_apath_get_string(0,0); /* free circ cache */ MACSIO_LOG_LogFinalize(MACSIO_LOG_MainLog); MACSIO_LOG_LogFinalize(MACSIO_LOG_StdErr); #ifdef HAVE_SCR if (exercise_scr) SCR_Finalize(); #endif #ifdef HAVE_MPI { int result; if ((MPI_Initialized(&result) == MPI_SUCCESS) && result) MPI_Finalize(); } #endif #warning FIX RETVAL OF MAIN TO BE NON-ZERO WHEN ERRORS OCCUR return (0); }
void myhbwmalloc_init(void) { /* set to NULL before trying to initialize. if we return before * successful creation of the mspace, then it will still be NULL, * and we can use that in subsequent library calls to determine * that the library failed to initialize. */ myhbwmalloc_mspace = NULL; /* verbose printout? */ myhbwmalloc_verbose = 0; { char * env_char = getenv("HBWMALLOC_VERBOSE"); if (env_char != NULL) { myhbwmalloc_verbose = 1; printf("hbwmalloc: HBWMALLOC_VERBOSE set\n"); } } /* fail hard or soft? */ myhbwmalloc_hardfail = 1; { char * env_char = getenv("HBWMALLOC_SOFTFAIL"); if (env_char != NULL) { myhbwmalloc_hardfail = 0; printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n"); } } /* set the atexit handler that will destroy the mspace and free the numa allocation */ atexit(myhbwmalloc_final); /* detect and configure use of NUMA memory nodes */ { int max_possible_node = numa_max_possible_node(); int num_possible_nodes = numa_num_possible_nodes(); int max_numa_nodes = numa_max_node(); int num_configured_nodes = numa_num_configured_nodes(); int num_configured_cpus = numa_num_configured_cpus(); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_max_possible_node() = %d\n", max_possible_node); printf("hbwmalloc: numa_num_possible_nodes() = %d\n", num_possible_nodes); printf("hbwmalloc: numa_max_node() = %d\n", max_numa_nodes); printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes); printf("hbwmalloc: numa_num_configured_cpus() = %d\n", num_configured_cpus); } /* FIXME this is a hack. assumes HBW is only numa node 1. */ if (num_configured_nodes <= 2) { myhbwmalloc_numa_node = num_configured_nodes-1; } else { fprintf(stderr,"hbwmalloc: we support only 2 numa nodes, not %d\n", num_configured_nodes); } if (myhbwmalloc_verbose) { for (int i=0; i<num_configured_nodes; i++) { unsigned max_numa_cpus = numa_num_configured_cpus(); struct bitmask * mask = numa_bitmask_alloc( max_numa_cpus ); int rc = numa_node_to_cpus(i, mask); if (rc != 0) { fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n"); } else { printf("hbwmalloc: numa node %d cpu mask:", i); for (unsigned j=0; j<max_numa_cpus; j++) { int bit = numa_bitmask_isbitset(mask,j); printf(" %d", bit); } printf("\n"); } numa_bitmask_free(mask); } fflush(stdout); } } #if 0 /* unused */ /* see if the user specifies a slab size */ size_t slab_size_requested = 0; { char * env_char = getenv("HBWMALLOC_BYTES"); if (env_char!=NULL) { long units = 1L; if ( NULL != strstr(env_char,"G") ) units = 1000000000L; else if ( NULL != strstr(env_char,"M") ) units = 1000000L; else if ( NULL != strstr(env_char,"K") ) units = 1000L; else units = 1L; int num_count = strspn(env_char, "0123456789"); memset( &env_char[num_count], ' ', strlen(env_char)-num_count); slab_size_requested = units * atol(env_char); } if (myhbwmalloc_verbose) { printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested); } } #endif /* see what libnuma says is available */ size_t myhbwmalloc_slab_size; { int node = myhbwmalloc_numa_node; long long freemem; long long maxmem = numa_node_size64(node, &freemem); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n", maxmem, freemem, node); } myhbwmalloc_slab_size = freemem; } /* assume threads, disable if MPI knows otherwise, then allow user to override. 
*/
    int multithreaded = 1;
#ifdef HAVE_MPI
    {
        int nprocs = 1;   /* default when MPI is not active */
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

            /* give equal portion to every MPI process */
            myhbwmalloc_slab_size /= nprocs;

            /* if the user initializes MPI with MPI_Init or
             * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
             * are no threads at all, which means we can skip the
             * malloc mspace lock.
             *
             * if the user lies to MPI, they deserve any bad thing
             * that comes of it. */
            int provided;
            MPI_Query_thread(&provided);
            multithreaded = (provided != MPI_THREAD_SINGLE);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %zu\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char * env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode( myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab==NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n", myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128*sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;

        myhbwmalloc_mspace = create_mspace_with_base( myhbwmalloc_slab, myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n", myhbwmalloc_slab_size);
        }
    }
}
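A hedged usage sketch for the allocator set up above: once myhbwmalloc_init() has created the mspace, allocations can be served through dlmalloc's mspace API (create_mspace_with_base is already used above, and mspace_malloc/mspace_free belong to the same API). The hbw_malloc_sketch wrapper name and the fallback policy are illustrative assumptions, not the library's actual entry points.

#include <stdlib.h>

void *hbw_malloc_sketch(size_t bytes)
{
    if (myhbwmalloc_mspace == NULL) {
        /* initialization failed (or soft-fail mode): fall back to the normal heap,
           as the NULL mspace is the documented failure indicator above */
        return malloc(bytes);
    }
    return mspace_malloc(myhbwmalloc_mspace, bytes);
}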
void mpi_initialized_(int* flag, int* ierr)
{
    *ierr = MPI_Initialized(flag);
}
LIS_INT lis_initialize(LIS_INT* argc, char** argv[]) { LIS_ARGS p; LIS_INT i,nprocs; LIS_DEBUG_FUNC_IN; /* lis_memory_init();*/ #ifdef USE_MPI MPI_Initialized(&lis_mpi_initialized); if (!lis_mpi_initialized) MPI_Init(argc, argv); #ifdef USE_QUAD_PRECISION MPI_Type_contiguous( LIS_MPI_MSCALAR_LEN, MPI_DOUBLE, &LIS_MPI_MSCALAR ); MPI_Type_commit( &LIS_MPI_MSCALAR ); MPI_Op_create((MPI_User_function *)lis_mpi_msum, LIS_TRUE, &LIS_MPI_MSUM); #endif #endif #ifdef _OPENMP nprocs = omp_get_max_threads(); #endif lis_arg2args(*argc,*argv,&cmd_args); p = cmd_args->next; while( p!=cmd_args ) { for(i=0;i<LIS_INIT_OPTIONS_LEN;i++) { if( strcmp(p->arg1, LIS_INIT_OPTNAME[i])==0 ) { switch( LIS_INIT_OPTACT[i] ) { case LIS_INIT_OPTIONS_OMPNUMTHREADS: #ifdef _LONGLONG sscanf(p->arg2, "%lld", &nprocs); #else sscanf(p->arg2, "%d", &nprocs); #endif break; } } } p = p->next; } #ifdef _OPENMP omp_set_num_threads(nprocs); lis_vec_tmp = (LIS_SCALAR *)lis_malloc( nprocs*LIS_VEC_TMP_PADD*sizeof(LIS_QUAD),"lis_initialize::lis_vec_tmp" ); if( lis_vec_tmp==NULL ) { LIS_SETERR_MEM(nprocs*LIS_VEC_TMP_PADD*sizeof(LIS_QUAD)); return LIS_ERR_OUT_OF_MEMORY; } #endif #ifdef USE_QUAD_PRECISION lis_quad_scalar_tmp = (LIS_SCALAR *)lis_malloc( LIS_QUAD_SCALAR_SIZE*sizeof(LIS_SCALAR),"lis_initialize::lis_quad_scalar_tmp" ); if( lis_quad_scalar_tmp==NULL ) { LIS_SETERR_MEM(LIS_QUAD_SCALAR_SIZE*sizeof(LIS_SCALAR)); return LIS_OUT_OF_MEMORY; } lis_quad_x87_fpu_init(&lis_x87_fpu_cw); #endif for(i=1;i<*argc;i++) { if( strncmp(argv[0][i], "-help", 5)==0 ) { /* lis_display();*/ CHKERR(1); } else if( strncmp(argv[0][i], "-ver", 4)==0 ) { lis_version(); CHKERR(1); } } LIS_DEBUG_FUNC_OUT; return LIS_SUCCESS; }
int A1D_Initialize() { #ifdef DMAPPD_USES_MPI int mpi_initialized, mpi_provided; int mpi_status = MPI_SUCCESS; int namelen; char procname[MPI_MAX_PROCESSOR_NAME]; #endif #ifdef __CRAYXE int pmi_status = PMI_SUCCESS; int nodeid = -1; rca_mesh_coord_t rca_xyz; dmapp_return_t dmapp_status = DMAPP_RC_SUCCESS; dmapp_rma_attrs_ext_t dmapp_config_in, dmapp_config_out; dmapp_jobinfo_t dmapp_info; dmapp_pe_t dmapp_rank = -1; int dmapp_size = -1; #endif int sheapflag = 0; #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"entering A1D_Initialize() \n"); #endif #ifdef DMAPPD_USES_MPI /*************************************************** * * configure MPI * ***************************************************/ /* MPI has to be Initialized for this implementation to work */ MPI_Initialized(&mpi_initialized); assert(mpi_initialized==1); /* MPI has to tolerate threads because A1 supports them */ MPI_Query_thread(&mpi_provided); //assert(mpi_provided>MPI_THREAD_SINGLE); /* have to use our own communicator for collectives to be proper */ mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD); assert(mpi_status==0); /* get my MPI rank */ mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&mpi_rank); assert(mpi_status==0); /* get MPI world size */ mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size); assert(mpi_status==0); /* in a perfect world, this would provide topology information like BG */ MPI_Get_processor_name( procname, &namelen ); printf( "%d: MPI_Get_processor_name = %s\n" , mpi_rank, procname ); fflush( stdout ); /* barrier to make sure MPI is ready everywhere */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); #endif #ifdef __CRAYXE /*************************************************** * * query topology * ***************************************************/ PMI_Get_nid( mpi_rank, &nodeid ); assert(pmi_status==PMI_SUCCESS); rca_get_meshcoord((uint16_t)nodeid, &rca_xyz); printf("%d: rca_get_meshcoord returns (%2u,%2u,%2u)\n", mpi_rank, rca_xyz.mesh_x, rca_xyz.mesh_y, rca_xyz.mesh_z ); #endif #ifdef __CRAYXE /*************************************************** * * configure DMAPP * ***************************************************/ dmapp_config_in.max_outstanding_nb = DMAPP_DEF_OUTSTANDING_NB; /* 512 */ dmapp_config_in.offload_threshold = DMAPP_OFFLOAD_THRESHOLD; /* 4096 */ #ifdef DETERMINISTIC_ROUTING dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; #else dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; #endif dmapp_config_in.max_concurrency = 1; /* not thread-safe */ #ifdef FLUSH_IMPLEMENTED dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_RELAXED; #else dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_STRICT; #endif dmapp_status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out ); assert(dmapp_status==DMAPP_RC_SUCCESS); #ifndef FLUSH_IMPLEMENTED /* without strict PI ordering, we have to flush remote stores with a get packet to force global visibility */ assert( dmapp_config_out.PI_ordering == DMAPP_PI_ORDERING_STRICT); #endif dmapp_status = dmapp_get_jobinfo(&dmapp_info); assert(dmapp_status==DMAPP_RC_SUCCESS); dmapp_rank = dmapp_info.pe; dmapp_size = dmapp_info.npes; A1D_Sheap_desc = dmapp_info.sheap_seg; /* make sure PMI and DMAPP agree */ assert(mpi_rank==dmapp_rank); assert(mpi_size==dmapp_size); #endif /*************************************************** * * setup protocols * 
***************************************************/ #ifdef FLUSH_IMPLEMENTED /* allocate Put list */ A1D_Put_flush_list = malloc( mpi_size * sizeof(int32_t) ); assert(A1D_Put_flush_list != NULL); #endif #ifdef __CRAYXE A1D_Acc_lock = dmapp_sheap_malloc( sizeof(int64_t) ); #endif A1D_Allreduce_issame64((size_t)A1D_Acc_lock, &sheapflag); assert(sheapflag==1); #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"exiting A1D_Initialize() \n"); #endif return(0); }
static bool initialized()
{
    int ini;
    MPI_Initialized(&ini);
    return ini != 0;
}
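A short usage sketch for a guard like the one above, assuming the common library pattern of initializing MPI only when the embedding application has not already done so; library_setup is an illustrative name, not taken from the snippet.

#include <mpi.h>

static bool initialized()
{
    int ini;
    MPI_Initialized(&ini);
    return ini != 0;
}

void library_setup(int *argc, char ***argv)
{
    // Initialize MPI only if the host application has not done so already;
    // by convention, whoever calls MPI_Init is responsible for MPI_Finalize.
    if (!initialized())
        MPI_Init(argc, argv);
}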
int main(int argc, char **argv) { double fret; /* Frank energy */ double start, end; /* for timing */ double **grid; /* 2D grid */ int **lock; /* locked cells */ double **fgrid; /* fullgrid */ int **flock; /* locked cells (fullgrid) */ char fname[128] = "log"; char gname[128] = "grid"; char suffix[128]; int iter = 0; long maxiter; int flag, rank, np; /* MPI variables */ float t0; /* starting "temperature" */ FILE *log_fp; t_par par[2]; int sep, ba; MPI_Init(&argc, &argv); MPI_Initialized(&flag); if (flag != 1) MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); if (argc < 11 || argc > 12) { if (rank == 0) printf("Usage: <%s> <x length> <y length> <monte carlo steps> <temp> <major axis> <minor axis> <align> <theta> <separation> <boundary angle> [id]\n", argv[0]); MPI_Finalize(); exit(1); } nx = atoi(argv[1]); ny = atoi(argv[2]); maxiter = atoi(argv[3]); t0 = atof(argv[4]); par[0].major = par[1].major = atoi(argv[5]); par[0].minor = par[1].minor = atoi(argv[6]); if (strcmp(argv[7], "para") != 0 && strcmp(argv[7], "perp") != 0) { if (rank == 0) printf("Alignment must be para or perp\n"); MPI_Finalize(); exit(1); } strcpy(par[0].align, argv[7]); strcpy(par[1].align, argv[7]); par[0].theta = 0; par[1].theta = atof(argv[8]); sep = atoi(argv[9]); ba = atoi(argv[10]); if (argc == 12) id = atoi(argv[11]); else id = 1; par[0].cy = par[1].cy = ny / 2; par[0].cx = nx / 2 - par[0].major - sep / 2; par[1].cx = nx / 2 + par[1].major + sep / 2 - 1; sprintf(suffix, "r%dx%d_t%.0f_s%d_a%d_%d_%s", par[0].major, par[0].minor, par[1].theta, sep, ba, id, par[0].align); par[1].theta = PI * par[1].theta / 180.0; random_key key; int success = initialise(&grid, &lock, par, sep, ba, suffix, &key); int all_succeeded; MPI_Allreduce(&success, &all_succeeded, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (all_succeeded != np) { MPI_Finalize(); return 0; } strcat(gname, suffix); print(grid, gname); fret = func(grid, lock, 0); if (rank == 0) { printf("Initial Frank Energy: %f\n", fret); strcat(fname, suffix); log_fp = fopen(fname, "w"); } start = MPI_Wtime(); monte(grid, lock, maxiter, t0, log_fp, suffix, key); end = MPI_Wtime(); grid2root(grid, &fgrid); lock2root(lock, &flock); prncont(fgrid, "testgrid"); fret = func(grid, lock, 0); if (rank == 0) conjgrad(fgrid, flock, 10000, &fret, log_fp, suffix, par, sep); if (rank == 0) { printf("End Frank Energy: %f\n", fret); printf("No. iterations: %d\n", iter); printf("Time taken: %f\n", end - start); } MPI_Finalize(); return 0; }
void ComponentInterface::init_done( bool pane_changed) throw(COM_exception) { // Loop through the dataitems. if ( _status == STATUS_SHRUNK) { int max_id=0; for (Attr_map::iterator it=_attr_map.begin(); it!=_attr_map.end(); ++it) { int id = it->second->id(), ncomp=it->second->size_of_components(); max_id = std::max( max_id, id+ncomp+(ncomp>1)); } if (max_id < _last_id) _last_id = max_id; } int npanes = _pane_map.size(); std::vector< int> pane_ids; pane_ids.reserve(npanes); for (Pane_map::iterator it=_pane_map.begin(); it!=_pane_map.end(); ++it) { it->second->init_done(); pane_ids.push_back( it->second->id()); } _status = STATUS_NOCHANGE; if ( !pane_changed) { if ( npanes>int(_pane_map.size())) { throw COM_exception( COM_ERR_INIT_DONE_PANEMAP, append_frame (_name,ComponentInterface::init_done)); } return; } // communicate pane mapping int flag; MPI_Initialized( &flag); if ( _comm == MPI_COMM_NULL) flag = 0; // Compute proc_map int nprocs; if ( flag) MPI_Comm_size( _comm, &nprocs); else nprocs = 1; // Obtain the number of panes. std::vector<int> npanes_all(nprocs); if ( flag) MPI_Allgather( &npanes, 1, MPI_INT, &npanes_all[0], 1, MPI_INT, _comm); else npanes_all[0] = npanes; std::vector<int> disps(nprocs+1); disps[0]=0; for ( int i=0; i<nprocs; ++i) disps[i+1] = disps[i]+npanes_all[i]; std::vector<int> pane_ids_all( disps[nprocs]); if ( flag) MPI_Allgatherv( &pane_ids[0], npanes, MPI_INT, &pane_ids_all[0], &npanes_all[0], &disps[0], MPI_INT, _comm); else pane_ids_all = pane_ids; // Build process map _proc_map.clear(); for ( int p=0; p<nprocs; ++p) { for ( int j=disps[p], jn=disps[p+1]; j<jn; ++j) _proc_map[ pane_ids_all[j]] = p; } }
static void na_test_mpi_init(struct na_test_info *na_test_info) { int mpi_initialized = 0; int mpi_finalized = 0; na_test_info->mpi_comm = MPI_COMM_WORLD; /* default */ MPI_Initialized(&mpi_initialized); if (mpi_initialized) { NA_LOG_WARNING("MPI was already initialized"); goto done; } MPI_Finalized(&mpi_finalized); if (mpi_finalized) { NA_LOG_ERROR("MPI was already finalized"); goto done; } #ifdef NA_MPI_HAS_GNI_SETUP /* Setup GNI job before initializing MPI */ if (NA_MPI_Gni_job_setup() != NA_SUCCESS) { NA_LOG_ERROR("Could not setup GNI job"); return; } #endif if (na_test_info->listen || na_test_info->mpi_static) { int provided; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); if (provided != MPI_THREAD_MULTIPLE) { NA_LOG_ERROR("MPI_THREAD_MULTIPLE cannot be set"); } /* Only if we do static MPMD MPI */ if (na_test_info->mpi_static) { int mpi_ret, color, global_rank; MPI_Comm_rank(MPI_COMM_WORLD, &global_rank); /* Color is 1 for server, 2 for client */ color = (na_test_info->listen) ? 1 : 2; /* Assume that the application did not split MPI_COMM_WORLD already */ mpi_ret = MPI_Comm_split(MPI_COMM_WORLD, color, global_rank, &na_test_info->mpi_comm); if (mpi_ret != MPI_SUCCESS) { NA_LOG_ERROR("Could not split communicator"); } #ifdef NA_HAS_MPI /* Set init comm that will be used to setup NA MPI */ NA_MPI_Set_init_intra_comm(na_test_info->mpi_comm); #endif } } else { MPI_Init(NULL, NULL); } done: MPI_Comm_rank(na_test_info->mpi_comm, &na_test_info->mpi_comm_rank); MPI_Comm_size(na_test_info->mpi_comm, &na_test_info->mpi_comm_size); return; }
/*@C PetscInitialize - Initializes the PETSc database and MPI. PetscInitialize() calls MPI_Init() if that has yet to be called, so this routine should always be called near the beginning of your program -- usually the very first line! Collective on MPI_COMM_WORLD or PETSC_COMM_WORLD if it has been set Input Parameters: + argc - count of number of command line arguments . args - the command line arguments . file - [optional] PETSc database file, also checks ~username/.petscrc and .petscrc use NULL to not check for code specific file. Use -skip_petscrc in the code specific file to skip the .petscrc files - help - [optional] Help message to print, use NULL for no message If you wish PETSc code to run ONLY on a subcommunicator of MPI_COMM_WORLD, create that communicator first and assign it to PETSC_COMM_WORLD BEFORE calling PetscInitialize(). Thus if you are running a four process job and two processes will run PETSc and have PetscInitialize() and PetscFinalize() and two process will not, then do this. If ALL processes in the job are using PetscInitialize() and PetscFinalize() then you don't need to do this, even if different subcommunicators of the job are doing different things with PETSc. Options Database Keys: + -start_in_debugger [noxterm,dbx,xdb,gdb,...] - Starts program in debugger . -on_error_attach_debugger [noxterm,dbx,xdb,gdb,...] - Starts debugger when error detected . -on_error_emacs <machinename> causes emacsclient to jump to error file . -on_error_abort calls abort() when error detected (no traceback) . -on_error_mpiabort calls MPI_abort() when error detected . -error_output_stderr prints error messages to stderr instead of the default stdout . -error_output_none does not print the error messages (but handles errors in the same way as if this was not called) . -debugger_nodes [node1,node2,...] - Indicates nodes to start in debugger . -debugger_pause [sleeptime] (in seconds) - Pauses debugger . -stop_for_debugger - Print message on how to attach debugger manually to process and wait (-debugger_pause) seconds for attachment . -malloc - Indicates use of PETSc error-checking malloc (on by default for debug version of libraries) . -malloc no - Indicates not to use error-checking malloc . -malloc_debug - check for memory corruption at EVERY malloc or free . -malloc_test - like -malloc_dump -malloc_debug, but only active for debugging builds . -fp_trap - Stops on floating point exceptions (Note that on the IBM RS6000 this slows code by at least a factor of 10.) . -no_signal_handler - Indicates not to trap error signals . -shared_tmp - indicates /tmp directory is shared by all processors . -not_shared_tmp - each processor has own /tmp . -tmp - alternative name of /tmp directory . -get_total_flops - returns total flops done by all processors . -memory_info - Print memory usage at end of run - -server <port> - start PETSc webserver (default port is 8080) Options Database Keys for Profiling: See the <a href="../../docs/manual.pdf#nameddest=ch_profiling">profiling chapter of the users manual</a> for details. + -info <optional filename> - Prints verbose information to the screen . -info_exclude <null,vec,mat,pc,ksp,snes,ts> - Excludes some of the verbose messages . -log_sync - Log the synchronization in scatters, inner products and norms . -log_trace [filename] - Print traces of all PETSc calls to the screen (useful to determine where a program hangs without running in the debugger). See PetscLogTraceBegin(). . 
-log_summary [filename] - Prints summary of flop and timing information to screen. If the filename is specified the summary is written to the file. See PetscLogView(). . -log_summary_python [filename] - Prints data on of flop and timing usage to a file or screen. See PetscLogPrintSViewPython(). . -log_all [filename] - Logs extensive profiling information See PetscLogDump(). . -log [filename] - Logs basic profiline information See PetscLogDump(). - -log_mpe [filename] - Creates a logfile viewable by the utility Jumpshot (in MPICH distribution) Only one of -log_trace, -log_summary, -log_all, -log, or -log_mpe may be used at a time Environmental Variables: + PETSC_TMP - alternative tmp directory . PETSC_SHARED_TMP - tmp is shared by all processes . PETSC_NOT_SHARED_TMP - each process has its own private tmp . PETSC_VIEWER_SOCKET_PORT - socket number to use for socket viewer - PETSC_VIEWER_SOCKET_MACHINE - machine to use for socket viewer to connect to Level: beginner Notes: If for some reason you must call MPI_Init() separately, call it before PetscInitialize(). Fortran Version: In Fortran this routine has the format $ call PetscInitialize(file,ierr) + ierr - error return code - file - [optional] PETSc database file, also checks ~username/.petscrc and .petscrc use NULL_CHARACTER to not check for code specific file. Use -skip_petscrc in the code specific file to skip the .petscrc files Important Fortran Note: In Fortran, you MUST use NULL_CHARACTER to indicate a null character string; you CANNOT just use NULL as in the C version. See the <a href="../../docs/manual.pdf">users manual</a> for details. If your main program is C but you call Fortran code that also uses PETSc you need to call PetscInitializeFortran() soon after calling PetscInitialize(). Concepts: initializing PETSc .seealso: PetscFinalize(), PetscInitializeFortran(), PetscGetArgs(), PetscInitializeNoArguments() @*/ PetscErrorCode PetscInitialize(int *argc,char ***args,const char file[],const char help[]) { PetscErrorCode ierr; PetscMPIInt flag, size; PetscInt nodesize; PetscBool flg; char hostname[256]; PetscFunctionBegin; if (PetscInitializeCalled) PetscFunctionReturn(0); /* these must be initialized in a routine, not as a constant declaration*/ PETSC_STDOUT = stdout; PETSC_STDERR = stderr; ierr = PetscOptionsCreate();CHKERRQ(ierr); /* We initialize the program name here (before MPI_Init()) because MPICH has a bug in it that it sets args[0] on all processors to be args[0] on the first processor. */ if (argc && *argc) { ierr = PetscSetProgramName(**args);CHKERRQ(ierr); } else { ierr = PetscSetProgramName("Unknown Name");CHKERRQ(ierr); } ierr = MPI_Initialized(&flag);CHKERRQ(ierr); if (!flag) { if (PETSC_COMM_WORLD != MPI_COMM_NULL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"You cannot set PETSC_COMM_WORLD if you have not initialized MPI first"); #if defined(PETSC_HAVE_MPI_INIT_THREAD) { PetscMPIInt provided; ierr = MPI_Init_thread(argc,args,MPI_THREAD_FUNNELED,&provided);CHKERRQ(ierr); } #else ierr = MPI_Init(argc,args);CHKERRQ(ierr); #endif PetscBeganMPI = PETSC_TRUE; } if (argc && args) { PetscGlobalArgc = *argc; PetscGlobalArgs = *args; } PetscFinalizeCalled = PETSC_FALSE; if (PETSC_COMM_WORLD == MPI_COMM_NULL) PETSC_COMM_WORLD = MPI_COMM_WORLD; ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_RETURN);CHKERRQ(ierr); /* Done after init due to a bug in MPICH-GM? 
*/ ierr = PetscErrorPrintfInitialize();CHKERRQ(ierr); ierr = MPI_Comm_rank(MPI_COMM_WORLD,&PetscGlobalRank);CHKERRQ(ierr); ierr = MPI_Comm_size(MPI_COMM_WORLD,&PetscGlobalSize);CHKERRQ(ierr); MPIU_BOOL = MPI_INT; MPIU_ENUM = MPI_INT; /* Initialized the global complex variable; this is because with shared libraries the constructors for global variables are not called; at least on IRIX. */ #if defined(PETSC_HAVE_COMPLEX) { #if defined(PETSC_CLANGUAGE_CXX) PetscComplex ic(0.0,1.0); PETSC_i = ic; #elif defined(PETSC_CLANGUAGE_C) PETSC_i = _Complex_I; #endif } #if !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX) ierr = MPI_Type_contiguous(2,MPI_DOUBLE,&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr); ierr = MPI_Type_contiguous(2,MPI_FLOAT,&MPIU_C_COMPLEX);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU_C_COMPLEX);CHKERRQ(ierr); #endif #endif /* PETSC_HAVE_COMPLEX */ /* Create the PETSc MPI reduction operator that sums of the first half of the entries and maxes the second half. */ ierr = MPI_Op_create(PetscMaxSum_Local,1,&PetscMaxSum_Op);CHKERRQ(ierr); #if defined(PETSC_USE_REAL___FLOAT128) ierr = MPI_Type_contiguous(2,MPI_DOUBLE,&MPIU___FLOAT128);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU___FLOAT128);CHKERRQ(ierr); #if defined(PETSC_HAVE_COMPLEX) ierr = MPI_Type_contiguous(4,MPI_DOUBLE,&MPIU___COMPLEX128);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU___COMPLEX128);CHKERRQ(ierr); #endif ierr = MPI_Op_create(PetscMax_Local,1,&MPIU_MAX);CHKERRQ(ierr); ierr = MPI_Op_create(PetscMin_Local,1,&MPIU_MIN);CHKERRQ(ierr); #endif #if (defined(PETSC_HAVE_COMPLEX) && !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)) || defined(PETSC_USE_REAL___FLOAT128) ierr = MPI_Op_create(PetscSum_Local,1,&MPIU_SUM);CHKERRQ(ierr); #endif ierr = MPI_Type_contiguous(2,MPIU_SCALAR,&MPIU_2SCALAR);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU_2SCALAR);CHKERRQ(ierr); ierr = MPI_Op_create(PetscADMax_Local,1,&PetscADMax_Op);CHKERRQ(ierr); ierr = MPI_Op_create(PetscADMin_Local,1,&PetscADMin_Op);CHKERRQ(ierr); #if defined(PETSC_USE_64BIT_INDICES) || !defined(MPI_2INT) ierr = MPI_Type_contiguous(2,MPIU_INT,&MPIU_2INT);CHKERRQ(ierr); ierr = MPI_Type_commit(&MPIU_2INT);CHKERRQ(ierr); #endif /* Attributes to be set on PETSc communicators */ ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelCounter,&Petsc_Counter_keyval,(void*)0);CHKERRQ(ierr); ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelComm_Outer,&Petsc_InnerComm_keyval,(void*)0);CHKERRQ(ierr); ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelComm_Inner,&Petsc_OuterComm_keyval,(void*)0);CHKERRQ(ierr); /* Build the options database */ ierr = PetscOptionsInsert(argc,args,file);CHKERRQ(ierr); /* Print main application help message */ ierr = PetscOptionsHasName(NULL,"-help",&flg);CHKERRQ(ierr); if (help && flg) { ierr = PetscPrintf(PETSC_COMM_WORLD,help);CHKERRQ(ierr); } ierr = PetscOptionsCheckInitial_Private();CHKERRQ(ierr); /* SHOULD PUT IN GUARDS: Make sure logging is initialized, even if we do not print it out */ #if defined(PETSC_USE_LOG) ierr = PetscLogBegin_Private();CHKERRQ(ierr); #endif /* Load the dynamic libraries (on machines that support them), this registers all the solvers etc. 
(On non-dynamic machines this initializes the PetscDraw and PetscViewer classes) */ ierr = PetscInitialize_DynamicLibraries();CHKERRQ(ierr); ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); ierr = PetscInfo1(0,"PETSc successfully started: number of processors = %d\n",size);CHKERRQ(ierr); ierr = PetscGetHostName(hostname,256);CHKERRQ(ierr); ierr = PetscInfo1(0,"Running on machine: %s\n",hostname);CHKERRQ(ierr); ierr = PetscOptionsCheckInitial_Components();CHKERRQ(ierr); /* Check the options database for options related to the options database itself */ ierr = PetscOptionsSetFromOptions();CHKERRQ(ierr); #if defined(PETSC_USE_PETSC_MPI_EXTERNAL32) /* Tell MPI about our own data representation converter, this would/should be used if extern32 is not supported by the MPI Currently not used because it is not supported by MPICH. */ #if !defined(PETSC_WORDS_BIGENDIAN) ierr = MPI_Register_datarep((char*)"petsc",PetscDataRep_read_conv_fn,PetscDataRep_write_conv_fn,PetscDataRep_extent_fn,NULL);CHKERRQ(ierr); #endif #endif ierr = PetscOptionsGetInt(NULL,"-hmpi_spawn_size",&nodesize,&flg);CHKERRQ(ierr); if (flg) { #if defined(PETSC_HAVE_MPI_COMM_SPAWN) ierr = PetscHMPISpawn((PetscMPIInt) nodesize);CHKERRQ(ierr); /* worker nodes never return from here; they go directly to PetscEnd() */ #else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PETSc built without MPI 2 (MPI_Comm_spawn) support, use -hmpi_merge_size instead"); #endif } else { ierr = PetscOptionsGetInt(NULL,"-hmpi_merge_size",&nodesize,&flg);CHKERRQ(ierr); if (flg) { ierr = PetscHMPIMerge((PetscMPIInt) nodesize,NULL,NULL);CHKERRQ(ierr); if (PetscHMPIWorker) { /* if worker then never enter user code */ PetscInitializeCalled = PETSC_TRUE; PetscEnd(); } } } #if defined(PETSC_HAVE_CUDA) { PetscMPIInt p; for (p = 0; p < PetscGlobalSize; ++p) { if (p == PetscGlobalRank) cublasInit(); ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRQ(ierr); } } #endif ierr = PetscOptionsHasName(NULL,"-python",&flg);CHKERRQ(ierr); if (flg) { PetscInitializeCalled = PETSC_TRUE; ierr = PetscPythonInitialize(NULL,NULL);CHKERRQ(ierr); } ierr = PetscThreadCommInitializePackage();CHKERRQ(ierr); /* Setup building of stack frames for all function calls */ #if defined(PETSC_USE_DEBUG) PetscThreadLocalRegister((PetscThreadKey*)&petscstack); /* Creates petscstack_key if needed */ ierr = PetscStackCreate();CHKERRQ(ierr); #endif #if defined(PETSC_SERIALIZE_FUNCTIONS) ierr = PetscFPTCreate(10000);CHKERRQ(ierr); #endif /* Once we are completedly initialized then we can set this variables */ PetscInitializeCalled = PETSC_TRUE; PetscFunctionReturn(0); }
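A minimal, hedged driver matching the usage described in the PetscInitialize header comment above (call it as close to the top of main as possible, pass through argc/argv, and end with PetscFinalize); the help text is illustrative.

#include <petscsys.h>

static const char help[] = "Minimal PETSc program skeleton.\n";

int main(int argc, char **argv)
{
    PetscErrorCode ierr;

    /* Calls MPI_Init() internally unless the caller already initialized MPI. */
    ierr = PetscInitialize(&argc, &argv, NULL, help);
    if (ierr) return (int)ierr;

    /* ... create and use PETSc objects here ... */

    ierr = PetscFinalize();
    return (int)ierr;
}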
/* Initialize MTest, initializing MPI if necessary. Environment Variables: + MPITEST_DEBUG - If set (to any value), turns on debugging output . MPITEST_THREADLEVEL_DEFAULT - If set, use as the default "provided" level of thread support. Applies to MTest_Init but not MTest_Init_thread. - MPITEST_VERBOSE - If set to a numeric value, turns on that level of verbose output. This is used by the routine 'MTestPrintfMsg' */ void MTest_Init_thread(int *argc, char ***argv, int required, int *provided) { int flag; char *envval = 0; MPI_Initialized(&flag); if (!flag) { /* Permit an MPI that claims only MPI 1 but includes the * MPI_Init_thread routine (e.g., IBM MPI) */ #if MPI_VERSION >= 2 || defined(HAVE_MPI_INIT_THREAD) MPI_Init_thread(argc, argv, required, provided); #else MPI_Init(argc, argv); *provided = -1; #endif } /* Check for debugging control */ if (getenv("MPITEST_DEBUG")) { dbgflag = 1; MPI_Comm_rank(MPI_COMM_WORLD, &wrank); } /* Check for verbose control */ envval = getenv("MPITEST_VERBOSE"); if (envval) { char *s; long val = strtol(envval, &s, 0); if (s == envval) { /* This is the error case for strtol */ fprintf(stderr, "Warning: %s not valid for MPITEST_VERBOSE\n", envval); fflush(stderr); } else { if (val >= 0) { verbose = val; } else { fprintf(stderr, "Warning: %s not valid for MPITEST_VERBOSE\n", envval); fflush(stderr); } } } /* Check for option to return success/failure in the return value of main */ envval = getenv("MPITEST_RETURN_WITH_CODE"); if (envval) { if (strcmp(envval, "yes") == 0 || strcmp(envval, "YES") == 0 || strcmp(envval, "true") == 0 || strcmp(envval, "TRUE") == 0) { returnWithVal = 1; } else if (strcmp(envval, "no") == 0 || strcmp(envval, "NO") == 0 || strcmp(envval, "false") == 0 || strcmp(envval, "FALSE") == 0) { returnWithVal = 0; } else { fprintf(stderr, "Warning: %s not valid for MPITEST_RETURN_WITH_CODE\n", envval); fflush(stderr); } } /* Print rusage data if set */ if (getenv("MPITEST_RUSAGE")) { usageOutput = 1; } }
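A hedged sketch of a test driver built on the helper above; MTest_Finalize is part of the same MPICH-style test harness, but the header name and its exact signature here are assumptions.

#include <mpi.h>
#include "mpitest.h"   /* assumed harness header declaring the MTest_* helpers */

int main(int argc, char **argv)
{
    int errs = 0;
    int provided;

    MTest_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);

    /* ... exercise the MPI feature under test, incrementing errs on failure ... */

    /* assumed to aggregate errs across ranks and print the usual pass/fail summary */
    MTest_Finalize(errs);
    return 0;
}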
FC_FUNC( mpi_initialized , MPI_INITIALIZED )(int *flag, int *ierror) { *ierror=MPI_Initialized(flag); }
int main(int argc, char **argv) { MPI_Init(&argc, &argv); int initFlag; MPI_Initialized(&initFlag); if (!initFlag) { printf("MPI init failed\n"); return 8; } MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); int l,mm=5; int nx,ny,nz,lt,nedge; float frequency; float velmax; float dt; int ncx_shot1,ncy_shot1,ncz_shot; int ishot,ncy_shot,ncx_shot; float unit; int nxshot,nyshot,dxshot,dyshot; char infile[80],outfile[80],logfile[80],tmp[80], nodelog[84]; FILE *fin, *fout, *flog, *fnode; MPI_File mpi_flog, mpi_fout; MPI_Status mpi_status; struct timeval start,end; float all_time; float *u, *v, *w, *up, *up1, *up2, *vp, *vp1, *vp2, *wp, *wp1, *wp2, *us, *us1, *us2, *vs, *vs1, *vs2, *ws, *ws1, *ws2, *vpp, *density, *vss; float c[5][7]; float *wave; float nshot,t0,tt,c0; float dtx,dtz,dtxz,dr1,dr2,dtx4,dtz4,dtxz4; char message[100]; if(argc<4) { printf("please add 3 parameter: inpurfile, outfile, logfile\n"); exit(1); } message[99] = 0; // Avoid string buffer overrun strcpy(infile,argv[1]); strcpy(outfile,argv[2]); strcpy(logfile,argv[3]); strcpy(nodelog,logfile); strcat(nodelog, ".node"); strcpy(tmp,"date "); strncat(tmp, ">> ",3); strncat(tmp, logfile, strlen(logfile)); if (proc_rank == 0) { flog = fopen(logfile,"w"); fprintf(flog,"------------start time------------\n"); fclose(flog); system(tmp); gettimeofday(&start,NULL); } fin = fopen(infile,"r"); if(fin == NULL) { printf("file %s is not exist\n",infile); exit(2); } fscanf(fin,"nx=%d\n",&nx); fscanf(fin,"ny=%d\n",&ny); fscanf(fin,"nz=%d\n",&nz); fscanf(fin,"lt=%d\n",<); fscanf(fin,"nedge=%d\n",&nedge); fscanf(fin,"ncx_shot1=%d\n",&ncx_shot1); fscanf(fin,"ncy_shot1=%d\n",&ncy_shot1); fscanf(fin,"ncz_shot=%d\n",&ncz_shot); fscanf(fin,"nxshot=%d\n",&nxshot); fscanf(fin,"nyshot=%d\n",&nyshot); fscanf(fin,"frequency=%f\n",&frequency); fscanf(fin,"velmax=%f\n",&velmax); fscanf(fin,"dt=%f\n",&dt); fscanf(fin,"unit=%f\n",&unit); fscanf(fin,"dxshot=%d\n",&dxshot); fscanf(fin,"dyshot=%d\n",&dyshot); fclose(fin); if (proc_rank == 0) { // Master printf("\n--------workload parameter--------\n"); printf("nx=%d\n",nx); printf("ny=%d\n",ny); printf("nz=%d\n",nz); printf("lt=%d\n",lt); printf("nedge=%d\n",nedge); printf("ncx_shot1=%d\n",ncx_shot1); printf("ncy_shot1=%d\n",ncy_shot1); printf("ncz_shot=%d\n",ncz_shot); printf("nxshot=%d\n",nxshot); printf("nyshot=%d\n",nyshot); printf("frequency=%f\n",frequency); printf("velmax=%f\n",velmax); printf("dt=%f\n",dt); printf("unit=%f\n",unit); printf("dxshot=%d\n",dxshot); printf("dyshot=%d\n\n",dyshot); flog = fopen(logfile,"a"); fprintf(flog,"\n--------workload parameter--------\n"); fprintf(flog,"nx=%d\n",nx); fprintf(flog,"ny=%d\n",ny); fprintf(flog,"nz=%d\n",nz); fprintf(flog,"lt=%d\n",lt); fprintf(flog,"nedge=%d\n",nedge); fprintf(flog,"ncx_shot1=%d\n",ncx_shot1); fprintf(flog,"ncy_shot1=%d\n",ncy_shot1); fprintf(flog,"ncz_shot=%d\n",ncz_shot); fprintf(flog,"nxshot=%d\n",nxshot); fprintf(flog,"nyshot=%d\n",nyshot); fprintf(flog,"frequency=%f\n",frequency); fprintf(flog,"velmax=%f\n",velmax); fprintf(flog,"dt=%f\n",dt); fprintf(flog,"unit=%f\n",unit); fprintf(flog,"dxshot=%d\n",dxshot); fprintf(flog,"dyshot=%d\n\n",dyshot); fclose(flog); fnode = fopen(nodelog, "a"); fprintf(fnode,"World size: %d\n", world_size); fclose(fnode); } #ifdef _WITH_PHI // [Afa] It is recommended that for Intel Xeon Phi data is 64-byte aligned. 
// Upon successful completion, posix_memalign() shall return zero if (posix_memalign((void **)&u , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&v , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&w , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&up , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&up1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&up2, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vp , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vp1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vp2, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&wp , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&wp1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&wp2, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&us , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&us1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&us2, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vs , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vs1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&vs2, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&ws , 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&ws1, 64, sizeof(float)*nz*ny*nx)) return 2; if (posix_memalign((void **)&ws2, 64, sizeof(float)*nz*ny*nx)) return 2; #else u = (float*)malloc(sizeof(float)*nz*ny*nx); v = (float*)malloc(sizeof(float)*nz*ny*nx); w = (float*)malloc(sizeof(float)*nz*ny*nx); up = (float*)malloc(sizeof(float)*nz*ny*nx); up1 = (float*)malloc(sizeof(float)*nz*ny*nx); up2 = (float*)malloc(sizeof(float)*nz*ny*nx); vp = (float*)malloc(sizeof(float)*nz*ny*nx); vp1 = (float*)malloc(sizeof(float)*nz*ny*nx); vp2 = (float*)malloc(sizeof(float)*nz*ny*nx); wp = (float*)malloc(sizeof(float)*nz*ny*nx); wp1 = (float*)malloc(sizeof(float)*nz*ny*nx); wp2 = (float*)malloc(sizeof(float)*nz*ny*nx); us = (float*)malloc(sizeof(float)*nz*ny*nx); us1 = (float*)malloc(sizeof(float)*nz*ny*nx); us2 = (float*)malloc(sizeof(float)*nz*ny*nx); vs = (float*)malloc(sizeof(float)*nz*ny*nx); vs1 = (float*)malloc(sizeof(float)*nz*ny*nx); vs2 = (float*)malloc(sizeof(float)*nz*ny*nx); ws = (float*)malloc(sizeof(float)*nz*ny*nx); ws1 = (float*)malloc(sizeof(float)*nz*ny*nx); ws2 = (float*)malloc(sizeof(float)*nz*ny*nx); #endif // [Afa] Those are not offloaded to phi yet vpp = (float*)malloc(sizeof(float)*nz*ny*nx); density = (float*)malloc(sizeof(float)*nz*ny*nx); vss = (float*)malloc(sizeof(float)*nz*ny*nx); wave = (float*)malloc(sizeof(float)*lt); nshot=nxshot*nyshot; t0=1.0/frequency; // [Afa] Branch optmization // TODO: Will compiler optimize the `condition'? // i.e Can I write `for(i=0;i< (nz < 210 ? nz : 210);i++)'? int condition = nz < 210 ? nz : 210; for(int i=0; i < condition;i++) { for(int j=0;j<ny;j++) { for(int k=0;k<nx;k++) { vpp[i*ny*nx+j*nx+k]=2300.; vss[i*ny*nx+j*nx+k]=1232.; density[i*ny*nx+j*nx+k]=1.; } } } condition = nz < 260 ? 
nz : 260; for(int i=210; i < condition;i++) { for(int j=0;j<ny;j++) { for(int k=0;k<nx;k++) { vpp[i*ny*nx+j*nx+k]=2800.; vss[i*ny*nx+j*nx+k]=1509.; density[i*ny*nx+j*nx+k]=2.; } } } for(int i=260;i<nz;i++) { for(int j=0;j<ny;j++) { for(int k=0;k<nx;k++) { vpp[i*ny*nx+j*nx+k]=3500.; vss[i*ny*nx+j*nx+k]=1909.; density[i*ny*nx+j*nx+k]=2.5; } } } for(l=0;l<lt;l++) { tt=l*dt; tt=tt-t0; float sp=PIE*frequency*tt; float fx=100000.*exp(-sp*sp)*(1.-2.*sp*sp); wave[l]=fx; } // TODO: [Afa] Data produced by code below are static. See table below if(mm==5) { c0=-2.927222164; c[0][0]=1.66666665; c[1][0]=-0.23809525; c[2][0]=0.03968254; c[3][0]=-0.004960318; c[4][0]=0.0003174603; } c[0][1]=0.83333; c[1][1]=-0.2381; c[2][1]=0.0595; c[3][1]=-0.0099; c[4][1]=0.0008; for(int i=0;i<5;i++) for(int j=0;j<5;j++) c[j][2+i]=c[i][1]*c[j][1]; /* * mm == 5, c = * 1.666667 0.833330 0.694439 -0.198416 0.049583 -0.008250 0.000667 * -0.238095 -0.238100 -0.198416 0.056692 -0.014167 0.002357 -0.000190 * 0.039683 0.059500 0.049583 -0.014167 0.003540 -0.000589 0.000048 * -0.004960 -0.009900 -0.008250 0.002357 -0.000589 0.000098 -0.000008 * 0.000317 0.000800 0.000667 -0.000190 0.000048 -0.000008 0.000001 */ /* * mm != 5, c = * 0.000000 0.833330 0.694439 -0.198416 0.049583 -0.008250 0.000667 * 0.000000 -0.238100 -0.198416 0.056692 -0.014167 0.002357 -0.000190 * 0.000000 0.059500 0.049583 -0.014167 0.003540 -0.000589 0.000048 * 0.000000 -0.009900 -0.008250 0.002357 -0.000589 0.000098 -0.000008 * 0.000000 0.000800 0.000667 -0.000190 0.000048 -0.000008 0.000001 */ dtx=dt/unit; dtz=dt/unit; dtxz=dtx*dtz; dr1=dtx*dtx/2.; dr2=dtz*dtz/2.; dtx4=dtx*dtx*dtx*dtx; dtz4=dtz*dtz*dtz*dtz; dtxz4=dtx*dtx*dtz*dtz; if (proc_rank == 0) { fout = fopen(outfile, "wb"); fclose(fout); } // [Afa] Truncate file. We need a prettier way MPI_Barrier(MPI_COMM_WORLD); MPI_File_open(MPI_COMM_WORLD, outfile, MPI_MODE_WRONLY, MPI_INFO_NULL, &mpi_fout); MPI_File_open(MPI_COMM_WORLD, nodelog, MPI_MODE_WRONLY, MPI_INFO_NULL, &mpi_flog); // [Afa] *About Nodes Number* nshot (i.e nxshot * nyshot) should be multiple of node numbers, // or there will be hungry processes int loop_per_proc = ((int)nshot % world_size == 0) ? (nshot / world_size) : (nshot / world_size + 1); printf("\x1B[31mDEBUG:\x1b[39;49m World size %d, Loop per Proc %d, nshot %f, I am No. %d\n", world_size, loop_per_proc, nshot, proc_rank); // for(ishot=1;ishot<=nshot;ishot++) // [Afa] nshot is 20 in para1.in, but 200 in para2.in for (int loop_index = 0; loop_index < loop_per_proc; ++loop_index) { ishot = loop_index + proc_rank * loop_per_proc + 1; // [Afa] See commented code 2 lines above to understand this line if (ishot <= nshot) { // [Afa] ishot <= nshot printf("shot %d, process %d\n",ishot, proc_rank); snprintf(message, 29, "shot %6d, process %6d\n", ishot, proc_rank); // [Afa] Those numbers: MPI_File_seek(mpi_flog, 28 * (ishot - 1), MPI_SEEK_SET); // 28: string without '\0' MPI_File_write(mpi_flog, message, 28, MPI_CHAR, &mpi_status); // 29: with '\0' } else { printf("shot HUNGRY, process %d\n", proc_rank); snprintf(message, 29, "shot HUNGRY, process %6d\n", proc_rank); MPI_File_seek(mpi_flog, 28 * (ishot - 1), MPI_SEEK_SET); MPI_File_write(mpi_flog, message, 28, MPI_CHAR, &mpi_status); continue; } ncy_shot=ncy_shot1+(ishot/nxshot)*dyshot; ncx_shot=ncx_shot1+(ishot%nxshot)*dxshot; // [Afa] Matrix is zeroed in every loop // i.e. 
The relation between those matrices in each loop is pretty loose // Matrices not zeroed are: vpp, density, vss and wave, and they're not changed (read-only) // We only need to partially collect matrix `up' // TODO: [Afa] Get a better way to pass those pointers, and mark them as `restrict' // And WHY are they using cpp as extension? C++11 doesn't support `restrict' zero_matrices(u, w, ws2, up2, vp1, wp1, us, ws, wp, us2, us1, wp2, v, up1, nz, nx, up, ny, ws1, vs, vp2, vs1, vs2, vp); for(l=1;l<=lt;l++) { float xmax=l*dt*velmax; int nleft=ncx_shot-xmax/unit-10; int nright=ncx_shot+xmax/unit+10; int nfront=ncy_shot-xmax/unit-10; int nback=ncy_shot+xmax/unit+10; int ntop=ncz_shot-xmax/unit-10; int nbottom=ncz_shot+xmax/unit+10; if(nleft<5) nleft=5; if(nright>nx-5) nright=nx-5; if(nfront<5) nfront=5; if(nback>ny-5) nback=ny-5; if(ntop<5) ntop=5; if(nbottom>nz-5) nbottom=nz-5; ntop = ntop-1; nfront = nfront-1; nleft = nleft-1; // Although up, vp, wp, us, vs, ws are modified below, we're sure there's no race condition. // Each loop accesses a UNIQUE element in the array, and the value is not used, no need to worry about the dirty cache #pragma omp parallel for shared(u) shared(v) shared(w) shared(up1) shared(up2) shared(vp1) shared(vp2) shared(wp1) \ shared(wp2) shared(us) shared(us1) shared(us2) shared(vs) shared(vs1) shared(vs2) shared(ws) shared(ws1) shared(ws2) \ shared(vss) shared(vpp) shared(dr1) shared(dr2) shared(dtz) shared(dtx) shared(ncx_shot) shared(ncy_shot) shared(ncz_shot) \ shared(wave) for(int k=ntop;k<nbottom;k++) { for(int j=nfront;j<nback;j++) { for(int i=nleft;i<nright;i++) { float vvp2,drd1,drd2,vvs2; float px,sx; if(i==ncx_shot-1&&j==ncy_shot-1&&k==ncz_shot-1) { px=1.; sx=0.; } else { px=0.; sx=0.; } vvp2=vpp[k*ny*nx+j*nx+i]*vpp[k*ny*nx+j*nx+i]; drd1=dr1*vvp2; drd2=dr2*vvp2; vvs2=vss[k*ny*nx+j*nx+i]*vss[k*ny*nx+j*nx+i]; drd1=dr1*vvs2; drd2=dr2*vvs2; float tempux2=0.0f; float tempuy2=0.0f; float tempuz2=0.0f; float tempvx2=0.0f; float tempvy2=0.0f; float tempvz2=0.0f; float tempwx2=0.0f; float tempwy2=0.0f; float tempwz2=0.0f; float tempuxz=0.0f; float tempuxy=0.0f; float tempvyz=0.0f; float tempvxy=0.0f; float tempwxz=0.0f; float tempwyz=0.0f; // This will make the compiler do the vectorization for(int kk=1;kk<=mm;kk++) { tempux2 += c[kk-1][0]*(u[k*ny*nx+j*nx+(i+kk)]+u[k*ny*nx+j*nx+(i-kk)]); tempuy2 += c[kk-1][0]*(u[k*ny*nx+(j+kk)*nx+i]+u[k*ny*nx+(j-kk)*nx+i]); tempuz2 += c[kk-1][0]*(u[(k+kk)*ny*nx+j*nx+i]+u[(k-kk)*ny*nx+j*nx+i]); } for(int kk=1;kk<=mm;kk++) { tempvx2 += c[kk-1][0]*(v[k*ny*nx+j*nx+(i+kk)]+v[k*ny*nx+j*nx+(i-kk)]); tempvy2 += c[kk-1][0]*(v[k*ny*nx+(j+kk)*nx+i]+v[k*ny*nx+(j-kk)*nx+i]); tempvz2 += c[kk-1][0]*(v[(k+kk)*ny*nx+j*nx+i]+v[(k-kk)*ny*nx+j*nx+i]); } for(int kk=1;kk<=mm;kk++) { tempwx2 += c[kk-1][0]*(w[k*ny*nx+j*nx+(i+kk)]+w[k*ny*nx+j*nx+(i-kk)]); tempwy2 += c[kk-1][0]*(w[k*ny*nx+(j+kk)*nx+i]+w[k*ny*nx+(j-kk)*nx+i]); tempwz2 += c[kk-1][0]*(w[(k+kk)*ny*nx+j*nx+i]+w[(k-kk)*ny*nx+j*nx+i]); } //for(kk=1;kk<=mm;kk++) end tempux2=(tempux2+c0*u[k*ny*nx+j*nx+i])*vvp2*dtx*dtx; // u[k][j][i] tempuy2=(tempuy2+c0*u[k*ny*nx+j*nx+i])*vvs2*dtx*dtx; // u[k][j][i] tempuz2=(tempuz2+c0*u[k*ny*nx+j*nx+i])*vvs2*dtz*dtz; // u[k][j][i] tempvx2=(tempvx2+c0*v[k*ny*nx+j*nx+i])*vvs2*dtx*dtx; tempvy2=(tempvy2+c0*v[k*ny*nx+j*nx+i])*vvp2*dtx*dtx; tempvz2=(tempvz2+c0*v[k*ny*nx+j*nx+i])*vvs2*dtz*dtz; tempwx2=(tempwx2+c0*w[k*ny*nx+j*nx+i])*vvs2*dtx*dtx; tempwy2=(tempwy2+c0*w[k*ny*nx+j*nx+i])*vvs2*dtx*dtx; tempwz2=(tempwz2+c0*w[k*ny*nx+j*nx+i])*vvp2*dtz*dtz; // This loop is 
auto-vectorized for(int kk=1;kk<=mm;kk++) { for(int kkk=1;kkk<=mm;kkk++) { tempuxz=tempuxz+c[kkk-1][1+kk]*(u[(k+kkk)*ny*nx+j*nx+(i+kk)] -u[(k-kkk)*ny*nx+j*nx+(i+kk)] +u[(k-kkk)*ny*nx+j*nx+(i-kk)] -u[(k+kkk)*ny*nx+j*nx+(i-kk)]); // u[k+kkk][j][i+kk], u[k-kkk][j][i+kk], u[k-kkk][j][i-kk], u[k+kkk][j][i-kk] tempuxy=tempuxy+c[kkk-1][1+kk]*(u[k*ny*nx+(j+kkk)*nx+(i+kk)] -u[k*ny*nx+(j-kkk)*nx+(i+kk)] +u[k*ny*nx+(j-kkk)*nx+(i-kk)] -u[k*ny*nx+(j+kkk)*nx+(i-kk)]); tempvyz=tempvyz+c[kkk-1][1+kk]*(v[(k+kkk)*ny*nx+(j+kk)*nx+i] -v[(k-kkk)*ny*nx+(j+kk)*nx+i] +v[(k-kkk)*ny*nx+(j-kk)*nx+i] -v[(k+kkk)*ny*nx+(j-kk)*nx+i]); tempvxy=tempvxy+c[kkk-1][1+kk]*(v[k*ny*nx+(j+kkk)*nx+(i+kk)] -v[k*ny*nx+(j-kkk)*nx+(i+kk)] +v[k*ny*nx+(j-kkk)*nx+(i-kk)] -v[k*ny*nx+(j+kkk)*nx+(i-kk)]); tempwyz=tempwyz+c[kkk-1][1+kk]*(w[(k+kkk)*ny*nx+(j+kk)*nx+i] -w[(k-kkk)*ny*nx+(j+kk)*nx+i] +w[(k-kkk)*ny*nx+(j-kk)*nx+i] -w[(k+kkk)*ny*nx+(j-kk)*nx+i]); tempwxz=tempwxz+c[kkk-1][1+kk]*(w[(k+kkk)*ny*nx+j*nx+(i+kk)] -w[(k-kkk)*ny*nx+j*nx+(i+kk)] +w[(k-kkk)*ny*nx+j*nx+(i-kk)] -w[(k+kkk)*ny*nx+j*nx+(i-kk)]); } // for(kkk=1;kkk<=mm;kkk++) end } //for(kk=1;kk<=mm;kk++) end // LValues below are only changed here up[k*ny*nx+j*nx+i]=2.*up1[k*ny*nx+j*nx+i]-up2[k*ny*nx+j*nx+i] +tempux2+tempwxz*vvp2*dtz*dtx +tempvxy*vvp2*dtz*dtx; // up1[k][j][j], up2[k][j][i], up[k][j][i] vp[k*ny*nx+j*nx+i]=2.*vp1[k*ny*nx+j*nx+i]-vp2[k*ny*nx+j*nx+i] +tempvy2+tempuxy*vvp2*dtz*dtx +tempwyz*vvp2*dtz*dtx; wp[k*ny*nx+j*nx+i]=2.*wp1[k*ny*nx+j*nx+i]-wp2[k*ny*nx+j*nx+i] +tempwz2+tempuxz*vvp2*dtz*dtx +tempvyz*vvp2*dtz*dtx +px*wave[l-1]; us[k*ny*nx+j*nx+i]=2.*us1[k*ny*nx+j*nx+i]-us2[k*ny*nx+j*nx+i]+tempuy2+tempuz2 -tempvxy*vvs2*dtz*dtx-tempwxz*vvs2*dtz*dtx; vs[k*ny*nx+j*nx+i]=2.*vs1[k*ny*nx+j*nx+i]-vs2[k*ny*nx+j*nx+i]+tempvx2+tempvz2 -tempuxy*vvs2*dtz*dtx-tempwyz*vvs2*dtz*dtx; ws[k*ny*nx+j*nx+i]=2.*ws1[k*ny*nx+j*nx+i]-ws2[k*ny*nx+j*nx+i]+tempwx2+tempwy2 -tempuxz*vvs2*dtz*dtx-tempvyz*vvs2*dtz*dtx; }//for(i=nleft;i<nright;i++) end } } // Again, those are UNIQUE access. Safe to share #pragma omp parallel for shared(up) shared(us) shared(vp) shared(vs) shared(wp) shared(ws) shared(u) shared(v) shared(w) \ shared(up2) shared(up1) shared(us2) shared(us1) shared(vp2) shared(vp1) shared(wp2) shared(wp1) shared(ws2) shared(ws1) for(int k=ntop;k<nbottom;k++) for(int j=nfront;j<nback;j++) for(int i=nleft;i<nright;i++) { u[k*ny*nx+j*nx+i]=up[k*ny*nx+j*nx+i]+us[k*ny*nx+j*nx+i]; v[k*ny*nx+j*nx+i]=vp[k*ny*nx+j*nx+i]+vs[k*ny*nx+j*nx+i]; w[k*ny*nx+j*nx+i]=wp[k*ny*nx+j*nx+i]+ws[k*ny*nx+j*nx+i]; up2[k*ny*nx+j*nx+i]=up1[k*ny*nx+j*nx+i]; up1[k*ny*nx+j*nx+i]=up[k*ny*nx+j*nx+i]; us2[k*ny*nx+j*nx+i]=us1[k*ny*nx+j*nx+i]; us1[k*ny*nx+j*nx+i]=us[k*ny*nx+j*nx+i]; vp2[k*ny*nx+j*nx+i]=vp1[k*ny*nx+j*nx+i]; vp1[k*ny*nx+j*nx+i]=vp[k*ny*nx+j*nx+i]; vs2[k*ny*nx+j*nx+i]=vs1[k*ny*nx+j*nx+i]; vs1[k*ny*nx+j*nx+i]=vs[k*ny*nx+j*nx+i]; wp2[k*ny*nx+j*nx+i]=wp1[k*ny*nx+j*nx+i]; wp1[k*ny*nx+j*nx+i]=wp[k*ny*nx+j*nx+i]; ws2[k*ny*nx+j*nx+i]=ws1[k*ny*nx+j*nx+i]; ws1[k*ny*nx+j*nx+i]=ws[k*ny*nx+j*nx+i]; }//for(i=nleft;i<nright;i++) end }//for(l=1;l<=lt;l++) end // [Afa] Do we need to keep the order of data? 
// [Afa Update] Yes, we do need to KEEP THE ORDER of data // fwrite(up+169*ny*nx,sizeof(float),ny*nx,fout); // This is the original fwrite MPI_File_seek(mpi_fout, (ishot - 1) * ny * nx * sizeof(float), MPI_SEEK_SET); MPI_File_write(mpi_fout, up + 169 * ny * nx, ny * nx, MPI_FLOAT, &mpi_status); }//for(ishot=1;ishot<=nshot;ishot++) end MPI_File_close(&mpi_fout); MPI_File_close(&mpi_flog); free(u); free(v); free(w); free(up); free(up1); free(up2); free(vp); free(vp1); free(vp2); free(wp); free(wp1); free(wp2); free(us); free(us1); free(us2); free(vs); free(vs1); free(vs2); free(ws); free(ws1); free(ws2); free(vpp); free(density); free(vss); free(wave); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); if (proc_rank == 0) { gettimeofday(&end,NULL); all_time = (end.tv_sec-start.tv_sec)+(float)(end.tv_usec-start.tv_usec)/1000000.0; printf("run time:\t%f s\n",all_time); flog = fopen(logfile,"a"); fprintf(flog,"\nrun time:\t%f s\n\n",all_time); fclose(flog); flog = fopen(logfile,"a"); fprintf(flog,"------------end time------------\n"); fclose(flog); system(tmp); } // Why return 1? return 0; }
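/* [Editor's sketch, not part of the original program] The comment above notes that
 * nshot should be a multiple of the node count or some ranks go "hungry".  A hedged
 * alternative is the usual remainder-splitting distribution: the first
 * (nshot % world_size) ranks take one extra shot, so no rank idles and no rank is
 * handed a shot index past nshot.  Names such as shot_range are illustrative only. */
#include <stdio.h>

static void shot_range(int nshot, int world_size, int rank, int *first, int *count)
{
    int base  = nshot / world_size;                /* shots every rank gets       */
    int extra = nshot % world_size;                /* leftover shots              */
    *count = base + (rank < extra ? 1 : 0);        /* first `extra` ranks get +1  */
    *first = rank * base + (rank < extra ? rank : extra) + 1; /* 1-based index    */
}

int main(void)
{
    /* Example: 20 shots (para1.in) on 6 ranks -> 4,4,3,3,3,3 consecutive shots */
    for (int rank = 0; rank < 6; ++rank) {
        int first, count;
        shot_range(20, 6, rank, &first, &count);
        printf("rank %d: shots %d..%d\n", rank, first, first + count - 1);
    }
    return 0;
}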
int A1D_Initialize() { int mpi_initialized, mpi_provided; int mpi_status; int i; size_t bytes_in, bytes_out; DCMF_Result dcmf_result; DCMF_Configure_t dcmf_config; DCMF_Memregion_t local_memregion; /*************************************************** * * configure MPI * ***************************************************/ /* MPI has to be initialized for this implementation to work */ MPI_Initialized(&mpi_initialized); assert(mpi_initialized==1); /* MPI has to be thread-safe so that DCMF doesn't explode */ MPI_Query_thread(&mpi_provided); assert(mpi_provided==MPI_THREAD_MULTIPLE); /* have to use our own communicator for collectives to be proper */ mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD); assert(mpi_status==0); /* get my MPI rank */ mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&myrank); assert(mpi_status==0); /* get MPI world size */ mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size); assert(mpi_status==0); /* make sure MPI and DCMF agree */ assert(myrank==DCMF_Messager_rank()); assert(mpi_size==DCMF_Messager_size()); /* barrier before DCMF_Messager_configure to make sure MPI is ready everywhere */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); /*************************************************** * * configure DCMF * ***************************************************/ /* to be safe, but perhaps not necessary */ dcmf_config.thread_level = DCMF_THREAD_MULTIPLE; #ifdef ACCUMULATE_IMPLEMENTED /* interrupts required for accumulate only, Put/Get use DMA * if accumulate not used, MPI will query environment for DCMF_INTERRUPTS */ dcmf_config.interrupts = DCMF_INTERRUPTS_ON; #endif /* reconfigure DCMF with interrupts on */ DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Messager_configure(&dcmf_config, &dcmf_config); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* barrier after DCMF_Messager_configure to make sure everyone has the new DCMF config */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); /*************************************************** * * setup DCMF memregions * ***************************************************/ /* allocate memregion list */ A1D_Memregion_list = malloc( mpi_size * sizeof(DCMF_Memregion_t) ); assert(A1D_Memregion_list != NULL); /* allocate base pointer list */ A1D_Baseptr_list = malloc( mpi_size * sizeof(void*) ); assert(A1D_Memregion_list != NULL); /* create memregions */ bytes_in = -1; DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Memregion_create(&local_memregion,&bytes_out,bytes_in,NULL,0); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* exchange memregions because we don't use symmetry heap */ mpi_status = MPI_Allgather(&local_memregion,sizeof(DCMF_Memregion_t),MPI_BYTE, A1D_Memregion_list,sizeof(DCMF_Memregion_t),MPI_BYTE, A1D_COMM_WORLD); assert(mpi_status==0); /* destroy temporary local memregion */ DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Memregion_destroy(&local_memregion); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* check for valid memregions */ DCMF_CriticalSection_enter(0); for (i = 0; i < mpi_size; i++) { dcmf_result = DCMF_Memregion_query(&A1D_Memregion_list[i], &bytes_out, &A1D_Baseptr_list[i]); assert(dcmf_result==DCMF_SUCCESS); } DCMF_CriticalSection_exit(0); #ifdef FLUSH_IMPLEMENTED /*************************************************** * * setup flush list(s) * ***************************************************/ /* allocate Put list */ A1D_Put_flush_list = malloc( mpi_size * sizeof(int) ); assert(A1D_Put_flush_list != 
NULL); #ifdef ACCUMULATE_IMPLEMENTED /* allocate Acc list */ A1D_Send_flush_list = malloc( mpi_size * sizeof(int) ); assert(A1D_Send_flush_list != NULL); #endif #endif /*************************************************** * * define null callback * ***************************************************/ A1D_Nocallback.function = NULL; A1D_Nocallback.clientdata = NULL; return(0); }
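/* [Editor's sketch, standalone demo, not A1D code] The assertion above requires that
 * MPI already provides MPI_THREAD_MULTIPLE.  A hedged way to request and verify that
 * level in the first place is MPI_Init_thread, checking what the library actually
 * granted before relying on it. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int provided = MPI_THREAD_SINGLE;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided < MPI_THREAD_MULTIPLE) {
        /* Thread-support levels are ordered constants, so this comparison is valid. */
        fprintf(stderr, "warning: requested MPI_THREAD_MULTIPLE, got level %d\n",
                provided);
        /* A real application would fall back to funneled communication here. */
    }
    MPI_Finalize();
    return 0;
}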
/*--------------------------------------------------------------------*/ void cppinit2(int *idproc, int *nvp, int argc, char *argv[]) { /* this subroutine initializes parallel processing lgrp communicator = MPI_COMM_WORLD output: idproc, nvp idproc = processor id in lgrp communicator nvp = number of real or virtual processors obtained local data */ static int ibig = 2147483647; static float small = 1.0e-12; int ierror, flag, ndprec, idprec, iprec; float prec; prec = 1.0 + small; iprec = ibig + 1; /* ndprec = (0,1) = (no,yes) use (normal,autodouble) precision */ if (vresult(prec) > 1.0) ndprec = 1; else ndprec = 0; /* idprec = (0,1) = (no,yes) use (normal,autodouble) integer precision */ if (iresult(iprec) > 0) idprec = 1; else idprec = 0; /* Open error file */ unit2 = fopen("C.2","w"); /* indicate whether MPI_INIT has been called */ ierror = MPI_Initialized(&flag); if (!flag) { /* initialize the MPI execution environment */ ierror = MPI_Init(&argc,&argv); if (ierror) exit(1); } lworld = MPI_COMM_WORLD; lgrp = lworld; /* determine the rank of the calling process in the communicator */ ierror = MPI_Comm_rank(lgrp,idproc); /* determine the size of the group associated with a communicator */ ierror = MPI_Comm_size(lgrp,&nproc); /* set default datatypes */ mint = MPI_INT; mdouble = MPI_DOUBLE; /* single precision real */ if (ndprec==0) { mreal = MPI_FLOAT; mcplx = MPI_COMPLEX; } /* double precision real */ else { mreal = MPI_DOUBLE; mcplx = MPI_DOUBLE_COMPLEX; } /* single precision integer */ /* if (idprec==0) */ /* mint = MPI_INT; */ /* double precision integer */ /* else */ /* mint = MPI_LONG; */ /* operators */ msum = MPI_SUM; mmax = MPI_MAX; *nvp = nproc; return; }
/*------------------------------------------------------------------------- * Function: h5_show_hostname * * Purpose: Show hostname. Show process ID if in MPI environment. * * Return: void * * Programmer: Albert Cheng * 2002/04/22 * * Modifications: * *------------------------------------------------------------------------- */ void h5_show_hostname(void) { char hostname[80]; #ifdef H5_HAVE_WIN32_API WSADATA wsaData; int err; #endif /* try show the process or thread id in multiple processes cases*/ #ifdef H5_HAVE_PARALLEL { int mpi_rank, mpi_initialized, mpi_finalized; MPI_Initialized(&mpi_initialized); MPI_Finalized(&mpi_finalized); if(mpi_initialized && !mpi_finalized) { MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank); printf("MPI-process %d.", mpi_rank); } else printf("thread 0."); } #elif defined(H5_HAVE_THREADSAFE) printf("thread %lu.", HDpthread_self_ulong()); #else printf("thread 0."); #endif #ifdef H5_HAVE_WIN32_API err = WSAStartup( MAKEWORD(2,2), &wsaData ); if ( err != 0 ) { /* could not find a usable WinSock DLL */ return; } /* Confirm that the WinSock DLL supports 2.2.*/ /* Note that if the DLL supports versions greater */ /* than 2.2 in addition to 2.2, it will still return */ /* 2.2 in wVersion since that is the version we */ /* requested. */ if ( LOBYTE( wsaData.wVersion ) != 2 || HIBYTE( wsaData.wVersion ) != 2 ) { /* could not find a usable WinSock DLL */ WSACleanup( ); return; } #endif #ifdef H5_HAVE_GETHOSTNAME if (gethostname(hostname, (size_t)80) < 0) printf(" gethostname failed\n"); else printf(" hostname=%s\n", hostname); #else printf(" gethostname not supported\n"); #endif #ifdef H5_HAVE_WIN32_API WSACleanup(); #endif }
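/* [Editor's sketch, not HDF5 code] The guard used above, extracted as a standalone
 * helper: only touch MPI_COMM_WORLD while the library is between MPI_Init and
 * MPI_Finalize.  The helper name is illustrative. */
#include <mpi.h>
#include <stdio.h>

static int mpi_is_active(void)
{
    int initialized = 0, finalized = 0;
    MPI_Initialized(&initialized);
    MPI_Finalized(&finalized);
    return initialized && !finalized;
}

int main(int argc, char **argv)
{
    printf("before MPI_Init:    active=%d\n", mpi_is_active()); /* 0 */
    MPI_Init(&argc, &argv);
    printf("after MPI_Init:     active=%d\n", mpi_is_active()); /* 1 */
    MPI_Finalize();
    printf("after MPI_Finalize: active=%d\n", mpi_is_active()); /* 0 */
    return 0;
}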
//SEXP pboot(SEXP data, SEXP statistic, SEXP ind, SEXP lt0, SEXP varg){ SEXP pboot(SEXP scenario,...){ SEXP result; double *func_results; int response, worldSize; enum commandCodes commandCode; int scene = asInteger(scenario); va_list ap; int c; // get the function arguments common to all scenarios // 1, R, lt0, vargs, strdata, strstatistic va_start(ap, scenario); int r = asInteger(va_arg(ap, SEXP)); // the number of replications to perform int ltn = asInteger(va_arg(ap, SEXP)); // the number of results the statistical function returns SEXP varg = va_arg(ap, SEXP); SEXP data = va_arg(ap, SEXP); SEXP statistic = va_arg(ap, SEXP); MPI_Initialized(&response); if (response) { DEBUG("MPI is init'ed in ptest\n"); } else { DEBUG("MPI is NOT init'ed in ptest\n"); PROTECT(result = NEW_INTEGER(1)); INTEGER(result)[0] = -1; UNPROTECT(1); return result; } MPI_Comm_size(MPI_COMM_WORLD, &worldSize); // broadcast command to other processors commandCode = PBOOT; MPI_Bcast(&commandCode, 1, MPI_INTEGER, 0, MPI_COMM_WORLD); // intialise the memory to store the results func_results = (double *)malloc(sizeof(double) * r * ltn); int * find; int * ind; int * wind; SEXP f, w, Spred, Sind; int m; int * pred; switch(scene) { case 1:; SEXP rangen, mle; rangen = va_arg(ap, SEXP); mle = va_arg(ap, SEXP); response = boot(1, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), rangen, mle); break; case 2:; f = va_arg(ap, SEXP); c = ncols(f); // number of columns in the index find = (int *)malloc(sizeof(int) * r * c); Rmatrix2Carray(f, find, r, c); response = boot(2, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), c, find); free(find); break; case 3:; f = va_arg(ap, SEXP); Spred = va_arg(ap, SEXP); c = ncols(f); // number of columns in the index m = ncols(Spred); find = (int *)malloc(sizeof(int) * r * c); pred = (int *)malloc(sizeof(int) * r * m); Rmatrix2Carray(f, find, r, c); Rmatrix2Carray(Spred, pred, r, m); response = boot(3, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), c, find, pred, m); free(find); free(pred); break; case 4: w = va_arg(ap, SEXP); c = ncols(w); // number of columns in the index wind = (int *)malloc(sizeof(int) * r * c); Rmatrix2Carray(w, wind, r, c); response = boot(4, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), c, wind); free(wind); break; case 5:; w = va_arg(ap, SEXP); Spred = va_arg(ap, SEXP); c = ncols(w); // number of columns in the index m = ncols(Spred); wind = (int *)malloc(sizeof(int) * r * c); pred = (int *)malloc(sizeof(int) * r * m); Rmatrix2Carray(w, wind, r, c); Rmatrix2Carray(Spred, pred, r, m); response = boot(5, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), c, wind, pred, m); free(wind); free(pred); break; case 8: ;// work around for gcc bug // retrieve function arguments SEXP ind = va_arg(ap, SEXP); c = ncols(ind); // replications are the number of columns in the index // convert the ind from (horrible) SEXP format to C array int * cind; cind = (int *)malloc(sizeof(int) * r * c); Rmatrix2Carray(ind, cind, r, c); // sending everything to the implementation function response = boot(8, func_results, r, ltn, varg, CHAR(STRING_ELT(data,0)), translateChar(PRINTNAME(statistic)), c, cind); free(cind); break; // end of scenario 8 default: break; }// end of switch va_end(ap); // Turn the array passed back from the implementation into // a SEXP object that can be returned to 
R. PROTECT(result = allocMatrix(REALSXP,r ,ltn)); // t.star <- matrix(NA, sum(R), lt0) Carray2Rmatrix(func_results, result, r, ltn); free(func_results); UNPROTECT(1); return result; }
/*------------------------------------------------------------------------- * Function: getenv_all * * Purpose: Used to get the environment that the root MPI task has. * name specifies which environment variable to look for * val is the string to which the value of that environment * variable will be copied. * * NOTE: The pointer returned by this function is only * valid until the next call to getenv_all and the data * stored there must be copied somewhere else before any * further calls to getenv_all take place. * * Return: pointer to a string containing the value of the environment variable * NULL if the varialbe doesn't exist in task 'root's environment. * * Programmer: Leon Arber * 4/4/05 * * Modifications: * Use original getenv if MPI is not initialized. This happens * one uses the PHDF5 library to build a serial nature code. * Albert 2006/04/07 * *------------------------------------------------------------------------- */ char * getenv_all(MPI_Comm comm, int root, const char* name) { int mpi_size, mpi_rank, mpi_initialized; int len; static char* env = NULL; assert(name); MPI_Initialized(&mpi_initialized); if(!mpi_initialized) { /* use original getenv */ if(env) HDfree(env); env = HDgetenv(name); } /* end if */ else { MPI_Comm_rank(comm, &mpi_rank); MPI_Comm_size(comm, &mpi_size); assert(root < mpi_size); /* The root task does the getenv call * and sends the result to the other tasks */ if(mpi_rank == root) { env = HDgetenv(name); if(env) { len = (int)HDstrlen(env); MPI_Bcast(&len, 1, MPI_INT, root, comm); MPI_Bcast(env, len, MPI_CHAR, root, comm); } else { /* len -1 indicates that the variable was not in the environment */ len = -1; MPI_Bcast(&len, 1, MPI_INT, root, comm); } } else { MPI_Bcast(&len, 1, MPI_INT, root, comm); if(len >= 0) { if(env == NULL) env = (char*) HDmalloc((size_t)len+1); else if(HDstrlen(env) < (size_t)len) env = (char*) HDrealloc(env, (size_t)len+1); MPI_Bcast(env, len, MPI_CHAR, root, comm); env[len] = '\0'; } else { if(env) HDfree(env); env = NULL; } } } #ifndef NDEBUG MPI_Barrier(comm); #endif return env; }
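/* [Editor's sketch, not the HDF5 implementation] The same root-getenv-then-broadcast
 * pattern without the HDgetenv/HDmalloc wrappers, kept to a fixed buffer for brevity.
 * As in getenv_all above, a length of -1 is broadcast when the variable is unset on
 * the root. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *bcast_getenv(MPI_Comm comm, int root, const char *name)
{
    static char buf[4096];   /* simplification: fixed-size shared buffer */
    int rank, len = -1;

    MPI_Comm_rank(comm, &rank);
    if (rank == root) {
        const char *val = getenv(name);
        if (val) {
            len = (int)strlen(val);
            if (len >= (int)sizeof(buf)) len = (int)sizeof(buf) - 1;
            memcpy(buf, val, (size_t)len);
            buf[len] = '\0';
        }
    }
    MPI_Bcast(&len, 1, MPI_INT, root, comm);
    if (len < 0) return NULL;                      /* unset in the root's environment */
    MPI_Bcast(buf, len + 1, MPI_CHAR, root, comm); /* value plus terminating null     */
    return buf;
}

int main(int argc, char **argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    const char *v = bcast_getenv(MPI_COMM_WORLD, 0, "HOME");
    printf("rank %d sees HOME=%s\n", rank, v ? v : "(unset)");
    MPI_Finalize();
    return 0;
}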
int fastdebug_(const char *str, int *lineno, int str_len)
{
    int flag;
    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
    char *msg, *nstr;
    char hostname[30], fname[50];
    // char prefix[] = "/tmp/gmdebug/debug-";
    char *prefix = "/scratch/gmdebug-";
    char preamble[ROUGH_PREAMBLE_LENGTH];

    // Bail immediately if there has been an error, or we're simply deactivated
    // and acting as a stub file.
    if (fast_debug_error || is_stubbed) return 0;

    // Get the hostname
    gethostname(hostname, sizeof hostname);

    MPI_Initialized(&flag);
    if (flag == 0) return 1;

    if (fp == -1) {
        if (NULL != getenv("MATT_PREFIX")) prefix = getenv("MATT_PREFIX");
        mypid = getpid();
        MPI_Initialized(&flag);
        if (flag)
            MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
        else
            fprintf(stderr, "[fastdebug] MPI NOT STARTED\n");
        // just for debug..
        if (VERBOSE) mypid = 0;
        snprintf(fname, sizeof(fname) - 1, "%s%5.5d-%3.3d", prefix, mypid, myrank);
        if (VERBOSE) printf("Opening file %s\n", fname);
        if ((fp = open(fname, O_WRONLY | O_CREAT | O_APPEND, mode)) < 0) {
            fprintf(stderr, "[fastdebug]: Cannot open file %s on %s. Deactivating fastdebug.\n", fname, hostname);
            //exit(1);
            fast_debug_error = 1;
            return 0;
        }
    }

    // Write the preamble: time, hostname, etc.
    // (bounded by sizeof(preamble) so the terminating null cannot overrun the buffer)
    snprintf(preamble, sizeof(preamble), "%lld [%s:%05d:%02d]",
             f_gettimeofday_(), hostname, mypid, myrank);
    // Magic number ROUGH_PREAMBLE_LENGTH is the estimated length of all the
    // host/pid/rank info in the debug line
    if ((str_len + strlen(preamble)) > MAX_STR_LEN) {
        // Truncate message
        str_len = MAX_STR_LEN - strlen(preamble);
    }
    str_len++; // null character

    // Trim white space
    if (VERBOSE) printf("[fastdebug] Trying to allocate %d characters for nstr \"%s\"\n", str_len, str);
    if ((nstr = (char *)alloca(str_len)) == NULL) {
        fprintf(stderr, "[fastdebug]: Cannot allocate %d characters for message string.\n", (str_len + 2));
        exit(1);
    }
    snprintf(nstr, str_len, "%s", str); // copy it to a mutable string
    nstr = trim(nstr);                  // trim white space
    str_len = strlen(nstr);

    // str_len will now be the entire output message (preamble and all)
    // The 128 is a magic number buffer, for the spaces and line breaks, etc.
    str_len = str_len + strlen(preamble) + 128;
    if (VERBOSE) printf("[fastdebug] Trying to allocate %d characters for combined string \"%s %s\"\n", str_len + 2, preamble, nstr);
    if ((msg = (char *)alloca(str_len)) == NULL) {
        fprintf(stderr, "[fastdebug]: Cannot allocate %d characters for message string.\n", (str_len + 2));
        exit(1);
    }

    // Combine the string
    snprintf(msg, str_len, "%s(%5d) %s\n", preamble, *lineno, nstr);
    //snprintf(msg, str_len+2, "%s %s\n", nstr);
    if (VERBOSE) printf("Writing \"%s\" to %s\n", msg, fname);
    write(fp, msg, strlen(msg));
    /*close(fp);*/
    return 1;
}
int cfio_init(int x_proc_num, int y_proc_num, int ratio) { int rc, i; int size; int root = 0; int error, ret; int server_proc_num; int best_server_amount; MPI_Group group, client_group, server_group; int *ranks; //set_debug_mask(DEBUG_CFIO | DEBUG_SERVER);// | DEBUG_MSG | DEBUG_SERVER); rc = MPI_Initialized(&i); if( !i ) { error("MPI should be initialized before the cfio\n"); return -1; } MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); //if(rank == 100) //{ // set_debug_mask(DEBUG_MAP); //} client_num = x_proc_num * y_proc_num; server_proc_num = size - client_num; if(server_proc_num < 0) { server_proc_num = 0; } best_server_amount = (int)((double)client_num / ratio); if(best_server_amount <= 0) { best_server_amount = 1; } MPI_Comm_group(MPI_COMM_WORLD, &group); //ranks = malloc(client_num * sizeof(int)); //for(i = 0; i < client_num; i ++) //{ // ranks[i] = i; //} //MPI_Group_incl(group, client_num, ranks, &client_group); //MPI_Comm_create(MPI_COMM_WORLD, client_group, &client_comm); //free(ranks); ranks = malloc(server_proc_num * sizeof(int)); for(i = 0; i < server_proc_num; i ++) { ranks[i] = i + client_num; } MPI_Group_incl(group, server_proc_num, ranks, &server_group); MPI_Comm_create(MPI_COMM_WORLD, server_group, &server_comm); free(ranks); //times_start(); if((ret = cfio_map_init( x_proc_num, y_proc_num, server_proc_num, best_server_amount, MPI_COMM_WORLD, server_comm)) < 0) { error("Map Init Fail."); return ret; } if(cfio_map_proc_type(rank) == CFIO_MAP_TYPE_SERVER) { if((ret = cfio_server_init()) < 0) { error(""); return ret; } if((ret = cfio_server_start()) < 0) { error(""); return ret; } }else if(cfio_map_proc_type(rank) == CFIO_MAP_TYPE_CLIENT) { if((ret = cfio_send_init(CLIENT_BUF_SIZE)) < 0) { error(""); return ret; } if((ret = cfio_id_init(CFIO_ID_INIT_CLIENT)) < 0) { error(""); return ret; } } debug(DEBUG_CFIO, "success return."); return CFIO_ERROR_NONE; }
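/* [Editor's sketch, not cfio code] The client/server split above builds the server
 * communicator with MPI_Group_incl + MPI_Comm_create.  Because the server ranks are
 * simply the ranks >= client_num, the same split can also be expressed with
 * MPI_Comm_split, using that threshold as the color.  The half-and-half client count
 * below is an assumption for the demo only. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, role_rank;
    MPI_Comm role_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int client_num = size / 2;            /* assumption: half the ranks are clients */
    int is_server = (rank >= client_num);
    MPI_Comm_split(MPI_COMM_WORLD, is_server, rank, &role_comm);

    MPI_Comm_rank(role_comm, &role_rank);
    printf("world rank %d is %s rank %d\n",
           rank, is_server ? "server" : "client", role_rank);

    MPI_Comm_free(&role_comm);
    MPI_Finalize();
    return 0;
}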
int hybrid_report_mask(){ // General int i,j,ierr; int id, rid,tid; int in_mpi, in_omp; int thrd, nthrds; int ncpus, nel_set; // Mask storage static int ** omp_proc_mask; static int * omp_mask_pac; char *dummy; // MPI specific Variables int rank, nranks; MPI_Request *request; MPI_Status *status; static int multi_node = 0; static char *all_names; static int max_name_len; int name_len; char proc_name[MPI_MAX_PROCESSOR_NAME]; char l,p; int tpc; // hwthreads/core Maskopts opts; // get print_speed fast or slow (f|c); listing cores or SMT (c|s) p = opts.get_p(); l = opts.get_l(); tpc=get_threads_per_node(); // In MPI and parallel region ? MPI_Initialized(&in_mpi); in_omp = omp_in_parallel(); if(in_mpi == 0){ printf("ERROR: ***** Must call hybrid_report_mask() in MPI program. ***** \n"); exit(1); } // Get rank number & no of ranks via MPI MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if(in_omp == 0){ if(rank == 0){ printf(" ***** When using 1 thread, Intel OpenMP MAY report " "\"not in a parallel region\" (Uh!)***** \n"); printf(" ***** Each row will only have a rank number (no \"0\" thread_id). \n"); printf("WARNING: ***** Unspecified results if hybrid_report_mask " "not called in parallel region of MPI code section. ***** \n"); } } thrd = omp_get_thread_num(); // thread id nthrds = omp_get_num_threads(); // Number of Threads // Get number of cpus (this gives no. // of cpu_ids in /proc/cpuinfo) ncpus = (int) sysconf(_SC_NPROCESSORS_ONLN); // Working only with MPI processes (masters) #pragma omp master { // Get a list of nodes from all ranks. MPI_Get_processor_name(proc_name,&name_len); MPI_Allreduce(&name_len, &max_name_len, 1,MPI_INT, MPI_MAX, MPI_COMM_WORLD); all_names = (char *) malloc(sizeof(int*)*nranks*(max_name_len+1)); MPI_Gather( proc_name, max_name_len+1 , MPI_CHAR, all_names, max_name_len+1, MPI_CHAR, 0, MPI_COMM_WORLD); // If multiple nodes, make muti_node non-zero. 
if(rank == 0){ for(id=0;id<nranks;id++){ if( strcmp(&all_names[id*(max_name_len+1)],&all_names[0]) ) multi_node++; } } // Create shared storage for masks (only master allocates) omp_proc_mask = (int **) malloc(sizeof(int*)*nthrds); for(i=0;i<nthrds;i++) omp_proc_mask[i] = (int * ) malloc(sizeof(int )*ncpus ); for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) omp_proc_mask[i][j] =0; } #pragma omp barrier #pragma omp critical // (boundto -- may not be thread safe) ierr = boundto(&nel_set,omp_proc_mask[thrd]); #pragma omp barrier #pragma omp master { omp_mask_pac = (int *) malloc(sizeof(int)*nranks*nthrds*ncpus); // need packing space for mpi send/recv if(rank == 0){ request = (MPI_Request *) malloc(sizeof(MPI_Request)*nranks); status = (MPI_Status *) malloc(sizeof(MPI_Status )*nranks); print_mask(1, dummy, multi_node, 0, 0, ncpus, nranks,nthrds, omp_proc_mask[0],tpc,l); //print header fflush(stdout); for(tid=0;tid<nthrds;tid++){ print_mask(0, &all_names[tid*(max_name_len+1)], multi_node, 0,tid, ncpus, nranks,nthrds, omp_proc_mask[tid],tpc,l); } fflush(stdout); for(rid=1;rid<nranks;rid++){ // Receive other rank's packed mask arrays MPI_Irecv(&omp_mask_pac[rid*nthrds*ncpus], nthrds*ncpus, MPI_INT, rid, 99, MPI_COMM_WORLD, &request[rid-1]); } MPI_Waitall(nranks-1,&request[0],&status[0]); for(rid=1;rid<nranks;rid++){ // Print for each rank for(tid=0;tid<nthrds;tid++){ print_mask(0, &all_names[tid*(max_name_len+1)], multi_node, rid,tid, ncpus, nranks,nthrds, &omp_mask_pac[rid*nthrds*ncpus + tid*ncpus],tpc,l); if(p == 's') ierr=usleep(300000); } } if(nranks*nthrds > 50) print_mask(2, dummy, multi_node, 0, 0, ncpus, nranks,nthrds, omp_proc_mask[0],tpc,l); //print header fflush(stdout); } // end root printing else{ //all non-root ranks // Pack up the ranks' mask arrays (Uh, should have made one array from beginning!) for( tid=0;tid<nthrds;tid++){ for( id=0; id<ncpus; id++) omp_mask_pac[(tid*ncpus)+id] = omp_proc_mask[tid][id]; if(p == 's') ierr=usleep(300000); } // Send to root MPI_Send(omp_mask_pac, nthrds*ncpus, MPI_INT, 0, 99, MPI_COMM_WORLD); } // end non-root printing // Return allocated space for(i=0;i<nthrds;i++) free(omp_proc_mask[i]); free(omp_proc_mask); free(omp_mask_pac); if(rank == 0 ){ free(request); free(status);} free(all_names); } // end of Master #pragma omp barrier // JIC, what all threads leaving at the same time. }
GlobalMPISession::GlobalMPISession( int* argc, char*** argv, std::ostream *out )
{
  std::ostringstream oss;
  // Above is used to create all output before sending to *out to avoid
  // jumbled parallel output between processors
#ifdef HAVE_MPI
  int mpierr = 0;
  // Assert that MPI is not already initialized
  int mpiHasBeenStarted = 0;
  MPI_Initialized(&mpiHasBeenStarted);
  if (mpiHasBeenStarted) {
    if (out) {
      *out << "GlobalMPISession(): Error, MPI_Initialized() returned true,"
           << " calling std::terminate()!\n"
           << std::flush;
    }
    std::terminate();
  }
  // Initialize MPI
  mpierr = ::MPI_Init(argc, (char ***) argv);
  if (mpierr != 0) {
    if (out) {
      *out << "GlobalMPISession(): Error, MPI_Init() returned error code="
           << mpierr << "!=0, calling std::terminate()!\n"
           << std::flush;
    }
    std::terminate();
  }
  initialize(out); // Get NProc_ and rank_
  int nameLen;
  char procName[MPI_MAX_PROCESSOR_NAME];
  mpierr = ::MPI_Get_processor_name(procName, &nameLen);
  if (mpierr != 0) {
    if (out) {
      *out << "GlobalMPISession(): Error, MPI_Get_processor_name() error code="
           << mpierr << "!=0, calling std::terminate()!\n"
           << std::flush;
    }
    std::terminate();
  }
  oss << "Teuchos::GlobalMPISession::GlobalMPISession(): started processor with name "
      << procName << " and rank " << rank_ << "!" << std::endl;
#else
  oss << "Teuchos::GlobalMPISession::GlobalMPISession(): started serial run"
      << std::endl;
#endif
#ifndef TEUCHOS_SUPPRESS_PROC_STARTUP_BANNER
  // See if we should suppress the startup banner
  bool printStartupBanner = true;
  const std::string suppress_option("--teuchos-suppress-startup-banner");
  for ( int opt_i = 0; opt_i < *argc; ++opt_i ) {
    if ( suppress_option == (*argv)[opt_i] ) {
      // We are suppressing the output!
      printStartupBanner = false;
      // Remove this option!
      // Note that (*argv)[*argc]==0 by convention so we copy it too!
      for( int i = opt_i; i < *argc; ++i )
        (*argv)[i] = (*argv)[i+1];
      --*argc;
    }
  }
  if (out && printStartupBanner) {
    *out << oss.str() << std::flush;
  }
#endif
}
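/* [Editor's sketch, illustrative only] The banner-suppression loop above removes a
 * recognized option from argv by shifting the remaining pointers down, relying on
 * the convention that argv[argc] == NULL so the terminator is copied along.  The
 * same idiom as a tiny standalone helper: */
#include <stdio.h>
#include <string.h>

static void strip_option(int *argc, char **argv, const char *opt)
{
    for (int i = 0; i < *argc; ++i) {
        if (strcmp(argv[i], opt) == 0) {
            for (int j = i; j < *argc; ++j)  /* copies the trailing NULL too */
                argv[j] = argv[j + 1];
            --*argc;
            --i;                             /* re-examine the shifted slot  */
        }
    }
}

int main(int argc, char **argv)
{
    strip_option(&argc, argv, "--teuchos-suppress-startup-banner");
    for (int i = 0; i < argc; ++i)
        printf("argv[%d] = %s\n", i, argv[i]);
    return 0;
}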
/* so src/nrnpython/inithoc.cpp does not have to include a c++ mpi.h */ int nrnmpi_wrap_mpi_init(int* flag) { return MPI_Initialized(flag); }
int pmrrr (char *jobz, char *range, int *np, double *D, double *E, double *vl, double *vu, int *il, int *iu, int *tryracp, MPI_Comm comm, int *nzp, int *offsetp, double *W, double *Z, int *ldz, int *Zsupp) { /* Input parameter */ int n = *np; bool onlyW = toupper(jobz[0]) == 'N'; bool wantZ = toupper(jobz[0]) == 'V'; bool cntval = toupper(jobz[0]) == 'C'; bool alleig = toupper(range[0]) == 'A'; bool valeig = toupper(range[0]) == 'V'; bool indeig = toupper(range[0]) == 'I'; /* Check input parameters */ if(!(onlyW || wantZ || cntval)) return 1; if(!(alleig || valeig || indeig)) return 1; if(n <= 0) return 1; if (valeig) { if(*vu<=*vl) return 1; } else if (indeig) { if (*il<1 || *il>n || *iu<*il || *iu>n) return 1; } /* MPI & multithreading info */ int is_init, is_final; MPI_Initialized(&is_init); MPI_Finalized(&is_final); if (is_init!=1 || is_final==1) { fprintf(stderr, "ERROR: MPI is not active! (init=%d, final=%d) \n", is_init, is_final); return 1; } MPI_Comm comm_dup; MPI_Comm_dup(comm, &comm_dup); int nproc, pid, thread_support; MPI_Comm_size(comm_dup, &nproc); MPI_Comm_rank(comm_dup, &pid); MPI_Query_thread(&thread_support); int nthreads; if ( !(thread_support == MPI_THREAD_MULTIPLE || thread_support == MPI_THREAD_FUNNELED) ) { /* Disable multithreading; note: to support multithreading with * MPI_THREAD_SERIALIZED the code must be changed slightly; this * is not supported at the moment */ nthreads = 1; } else { char *ompvar = getenv("PMR_NUM_THREADS"); if (ompvar == NULL) { nthreads = DEFAULT_NUM_THREADS; } else { nthreads = atoi(ompvar); } } #if defined(MVAPICH2_VERSION) if (nthreads>1) { int mv2_affinity=1; char *mv2_string = getenv("MV2_ENABLE_AFFINITY"); if (mv2_string != NULL) mv2_affinity = atoi(mv2_string); if (mv2_affinity!=0) { nthreads = 1; if (pid==0) { fprintf(stderr, "WARNING: PMRRR incurs a significant performance penalty when multithreaded with MVAPICH2 with affinity enabled. 
The number of threads has been reduced to one; please rerun with MV2_ENABLE_AFFINITY=0 or PMR_NUM_THREADS=1 in the future.\n"); fflush(stderr); } } } #endif /* If only maximal number of local eigenvectors are queried * return if possible here */ *nzp = 0; *offsetp = 0; if (cntval) { if ( alleig || n < DSTEMR_IF_SMALLER ) { *nzp = iceil(n,nproc); MPI_Comm_free(&comm_dup); return 0; } else if (indeig) { *nzp = iceil(*iu-*il+1,nproc); MPI_Comm_free(&comm_dup); return 0; } } /* Check if computation should be done by multiple processes */ int info; if (n < DSTEMR_IF_SMALLER) { info = handle_small_cases(jobz, range, np, D, E, vl, vu, il, iu, tryracp, comm, nzp, offsetp, W, Z, ldz, Zsupp); MPI_Comm_free(&comm_dup); return info; } /* Allocate memory */ double *Werr = (double*)malloc(n*sizeof(double)); assert(Werr!=NULL); double *Wgap = (double*)malloc(n*sizeof(double)); assert(Wgap!=NULL); double *gersch = (double*)malloc(2*n*sizeof(double)); assert(gersch!=NULL); int *iblock = (int*)calloc(n,sizeof(int)); assert(iblock!=NULL); int *iproc = (int*)malloc(n*sizeof(int)); assert(iproc!=NULL); int *Windex = (int*)malloc(n*sizeof(int)); assert(Windex!=NULL); int *isplit = (int*)malloc(n*sizeof(int)); assert(isplit!=NULL); int *Zindex = (int*)malloc(n*sizeof(int)); assert(Zindex!=NULL); proc_t *procinfo = (proc_t*)malloc(sizeof(proc_t)); assert(procinfo!=NULL); in_t *Dstruct = (in_t*)malloc(sizeof(in_t)); assert(Dstruct!=NULL); val_t *Wstruct = (val_t*)malloc(sizeof(val_t)); assert(Wstruct!=NULL); vec_t *Zstruct = (vec_t*)malloc(sizeof(vec_t)); assert(Zstruct!=NULL); tol_t *tolstruct = (tol_t*)malloc(sizeof(tol_t)); assert(tolstruct!=NULL); /* Bundle variables into a structures */ procinfo->pid = pid; procinfo->nproc = nproc; procinfo->comm = comm_dup; procinfo->nthreads = nthreads; procinfo->thread_support = thread_support; Dstruct->n = n; Dstruct->D = D; Dstruct->E = E; Dstruct->isplit = isplit; Wstruct->n = n; Wstruct->vl = vl; Wstruct->vu = vu; Wstruct->il = il; Wstruct->iu = iu; Wstruct->W = W; Wstruct->Werr = Werr; Wstruct->Wgap = Wgap; Wstruct->Windex = Windex; Wstruct->iblock = iblock; Wstruct->iproc = iproc; Wstruct->gersch = gersch; Zstruct->ldz = *ldz; Zstruct->nz = 0; Zstruct->Z = Z; Zstruct->Zsupp = Zsupp; Zstruct->Zindex = Zindex; /* Scale matrix to allowable range, returns 1.0 if not scaled */ double scale = scale_matrix(Dstruct, Wstruct, valeig); /* Test if matrix warrants more expensive computations which * guarantees high relative accuracy */ if (*tryracp) odrrr(&n, D, E, &info); /* 0 - rel acc */ else info = -1; int i; double *Dcopy, *E2copy; if (info == 0) { /* This case is extremely rare in practice */ tolstruct->split = DBL_EPSILON; /* Copy original data needed for refinement later */ Dcopy = (double*)malloc(n*sizeof(double)); assert(Dcopy!=NULL); memcpy(Dcopy, D, n*sizeof(double)); E2copy = (double*)malloc(n*sizeof(double)); assert(E2copy!=NULL); for (i=0; i<n-1; i++) E2copy[i] = E[i]*E[i]; } else { /* Neg. 
threshold forces old splitting criterion */ tolstruct->split = -DBL_EPSILON; *tryracp = 0; } if (!wantZ) { /* Compute eigenvalues to full precision */ tolstruct->rtol1 = 4.0 * DBL_EPSILON; tolstruct->rtol2 = 4.0 * DBL_EPSILON; } else { /* Do not compute to full accuracy first, but refine later */ tolstruct->rtol1 = sqrt(DBL_EPSILON); tolstruct->rtol1 = fmin(1e-2*MIN_RELGAP, tolstruct->rtol1); tolstruct->rtol2 = sqrt(DBL_EPSILON)*5.0E-3; tolstruct->rtol2 = fmin(5e-6*MIN_RELGAP, tolstruct->rtol2); tolstruct->rtol2 = fmax(4.0 * DBL_EPSILON, tolstruct->rtol2); } /* Compute all eigenvalues: sorted by block */ info = plarre(procinfo,jobz,range,Dstruct,Wstruct,tolstruct,nzp,offsetp); assert(info == 0); /* If just number of local eigenvectors are queried */ if (cntval & valeig) { clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); return 0; } /* If only eigenvalues are to be computed */ if (!wantZ) { /* Refine to high relative with respect to input T */ if (*tryracp) { info = refine_to_highrac (procinfo, jobz, Dcopy, E2copy, Dstruct, nzp, Wstruct, tolstruct); assert(info == 0); } /* Sort eigenvalues */ qsort(W, n, sizeof(double), cmp); /* Only keep subset ifirst:ilast */ int ifirst, ilast, isize; int iil = *il; int iiu = *iu; int ifirst_tmp=iil; for (i=0; i<nproc; i++) { int chunk = (iiu-iil+1)/nproc + (i < (iiu-iil+1)%nproc); int ilast_tmp; if (i == nproc-1) { ilast_tmp = iiu; } else { ilast_tmp = ifirst_tmp + chunk - 1; ilast_tmp = imin(ilast_tmp, iiu); } if (i == pid) { ifirst = ifirst_tmp; ilast = ilast_tmp; isize = ilast - ifirst + 1; *offsetp = ifirst - iil; *nzp = isize; } ifirst_tmp = ilast_tmp + 1; ifirst_tmp = imin(ifirst_tmp, iiu + 1); } if (isize > 0) { memmove(W, &W[ifirst-1], *nzp * sizeof(double)); } /* If matrix was scaled, rescale eigenvalues */ invscale_eigenvalues(Wstruct, scale, *nzp); clean_up (comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); return 0; } /* end of only eigenvalues to compute */ /* Compute eigenvectors */ info = plarrv(procinfo, Dstruct, Wstruct, Zstruct, tolstruct, nzp, offsetp); assert(info == 0); /* Refine to high relative with respect to input matrix */ if (*tryracp) { info = refine_to_highrac(procinfo, jobz, Dcopy, E2copy, Dstruct, nzp, Wstruct, tolstruct); assert(info == 0); } /* If matrix was scaled, rescale eigenvalues */ invscale_eigenvalues(Wstruct, scale, n); /* Make the first nz elements of W contains the eigenvalues * associated to the process */ int j, im=0; for (j=0; j<n; j++) { if (iproc[j] == pid) { W[im] = W[j]; Windex[im] = Windex[j]; Zindex[im] = Zindex[j]; im++; } } clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); if (*tryracp) { free(Dcopy); free(E2copy); } return 0; } /* end pmrrr */
void nrnmpi_init(int nrnmpi_under_nrncontrol, int* pargc, char*** pargv) { #if NRNMPI int i, b, flag; static int called = 0; if (called) { return; } called = 1; nrnmpi_use = 1; nrnmpi_under_nrncontrol_ = nrnmpi_under_nrncontrol; if( nrnmpi_under_nrncontrol_ ) { #if 0 {int i; printf("nrnmpi_init: argc=%d\n", *pargc); for (i=0; i < *pargc; ++i) { printf("%d |%s|\n", i, (*pargv)[i]); } } #endif #if NRN_MUSIC nrnmusic_init(pargc, pargv); /* see src/nrniv/nrnmusic.cpp */ #endif #if !ALWAYS_CALL_MPI_INIT /* this is not good. depends on mpirun adding at least one arg that starts with -p4 but that probably is dependent on mpich and the use of the ch_p4 device. We are trying to work around the problem that MPI_Init may change the working directory and so when not invoked under mpirun we would like to NOT call MPI_Init. */ b = 0; for (i=0; i < *pargc; ++i) { if (strncmp("-p4", (*pargv)[i], 3) == 0) { b = 1; break; } if (strcmp("-mpi", (*pargv)[i]) == 0) { b = 1; break; } } if (nrnmusic) { b = 1; } if (!b) { nrnmpi_use = 0; nrnmpi_under_nrncontrol_ = 0; return; } #endif MPI_Initialized(&flag); if (!flag && MPI_Init(pargc, pargv) != MPI_SUCCESS) { printf("MPI_INIT failed\n"); } #if NRN_MUSIC if (nrnmusic) { asrt(MPI_Comm_dup(nrnmusic_comm, &nrnmpi_world_comm)); }else{ #else { #endif asrt(MPI_Comm_dup(MPI_COMM_WORLD, &nrnmpi_world_comm)); } } grp_bbs = MPI_GROUP_NULL; grp_net = MPI_GROUP_NULL; asrt(MPI_Comm_dup(nrnmpi_world_comm, &nrnmpi_comm)); asrt(MPI_Comm_dup(nrnmpi_world_comm, &nrn_bbs_comm)); asrt(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_world)); asrt(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_world)); nrnmpi_numprocs = nrnmpi_numprocs_bbs = nrnmpi_numprocs_world; nrnmpi_myid = nrnmpi_myid_bbs = nrnmpi_myid_world; nrnmpi_spike_initialize(); /*begin instrumentation*/ #if USE_HPM hpmInit( nrnmpi_myid_world, "mpineuron" ); #endif #if 0 {int i; printf("nrnmpi_init: argc=%d\n", *pargc); for (i=0; i < *pargc; ++i) { printf("%d |%s|\n", i, (*pargv)[i]); } } #endif #if 1 if (nrnmpi_myid == 0) { printf("numprocs=%d\n", nrnmpi_numprocs_world); } #endif #endif /* NRNMPI */ } double nrnmpi_wtime() { #if NRNMPI if (nrnmpi_use) { return MPI_Wtime(); } #endif return nrn_timeus(); } void nrnmpi_terminate() { #if NRNMPI if (nrnmpi_use) { #if 0 printf("%d nrnmpi_terminate\n", nrnmpi_myid_world); #endif #if USE_HPM hpmTerminate( nrnmpi_myid_world ); #endif if( nrnmpi_under_nrncontrol_ ) { #if NRN_MUSIC if (nrnmusic) { nrnmusic_terminate(); }else #endif MPI_Finalize(); } nrnmpi_use = 0; #if nrnmpidebugleak nrnmpi_checkbufleak(); #endif } #endif /*NRNMPI*/ } void nrnmpi_abort(int errcode) { #if NRNMPI int flag; MPI_Initialized(&flag); if (flag) { MPI_Abort(MPI_COMM_WORLD, errcode); }else{ abort(); } #else abort(); #endif }