/* * scan * * Function: - basic scan operation * Accepts: - same arguments as MPI_Scan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If I'm rank 0, just copy into the receive buffer */ if (0 == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { return err; } } } /* Otherwise receive previous buffer and reduce. */ else { /* Allocate a temporary buffer. Rationale for this size is * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* Copy the send buffer into the receive buffer. */ if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } } /* Receive the prior answer */ err = MCA_PML_CALL(recv(pml_buffer, count, dtype, rank - 1, MCA_COLL_BASE_TAG_SCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the operation */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); /* All done */ if (NULL != free_buffer) { free(free_buffer); } } /* Send result to next process. */ if (rank < (size - 1)) { return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_SCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* All done */ return MPI_SUCCESS; }
int ompi_osc_pt2pt_replyreq_send(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_replyreq_t *replyreq) { int ret = OMPI_SUCCESS; opal_free_list_item_t *item; ompi_osc_pt2pt_buffer_t *buffer = NULL; ompi_osc_pt2pt_reply_header_t *header = NULL; size_t written_data = 0; /* Get a buffer */ OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, item, ret); if (NULL == item) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; } buffer = (ompi_osc_pt2pt_buffer_t*) item; /* verify at least enough space for header */ if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_reply_header_t)) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* setup buffer */ buffer->cbfunc = ompi_osc_pt2pt_replyreq_send_cb; buffer->cbdata = (void*) replyreq; /* pack header */ header = (ompi_osc_pt2pt_reply_header_t*) buffer->payload; written_data += sizeof(ompi_osc_pt2pt_reply_header_t); header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_REPLY; header->hdr_base.hdr_flags = 0; header->hdr_origin_sendreq = replyreq->rep_origin_sendreq; header->hdr_target_tag = 0; /* if sending data fits, pack payload */ if (mca_osc_pt2pt_component.p2p_c_eager_size >= written_data + replyreq->rep_target_bytes_packed) { struct iovec iov; uint32_t iov_count = 1; size_t max_data = replyreq->rep_target_bytes_packed; iov.iov_len = max_data; iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data); ret = ompi_convertor_pack(&replyreq->rep_target_convertor, &iov, &iov_count, &max_data ); if (ret < 0) { ret = OMPI_ERR_FATAL; goto cleanup; } assert(max_data == replyreq->rep_target_bytes_packed); written_data += max_data; header->hdr_msg_length = replyreq->rep_target_bytes_packed; } else { header->hdr_msg_length = 0; header->hdr_target_tag = create_send_tag(module); } buffer->len = written_data; #ifdef WORDS_BIGENDIAN header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; #elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT if (replyreq->rep_origin_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) { header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; OMPI_OSC_PT2PT_REPLY_HDR_HTON(*header); } #endif /* send fragment */ ret = MCA_PML_CALL(isend(buffer->payload, buffer->len, MPI_BYTE, replyreq->rep_origin_rank, -200, MCA_PML_BASE_SEND_STANDARD, module->p2p_comm, &buffer->request)); opal_list_append(&module->p2p_pending_control_sends, &buffer->super.super); goto done; cleanup: if (item != NULL) { OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item); } done: return ret; }
int ompi_mpi_init(int argc, char **argv, int requested, int *provided) { int ret; ompi_proc_t** procs; size_t nprocs; char *error = NULL; char *cmd=NULL, *av=NULL; ompi_errhandler_errtrk_t errtrk; OPAL_TIMING_DECLARE(tm); OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY); /* bitflag of the thread level support provided. To be used * for the modex in order to work in heterogeneous environments. */ uint8_t threadlevel_bf; /* Ensure that we were not already initialized or finalized. This lock is held for the duration of ompi_mpi_init() and ompi_mpi_finalize(). Hence, if we get it, then no other thread is inside the critical section (and we don't have to check the *_started bool variables). */ opal_mutex_lock(&ompi_mpi_bootstrap_mutex); if (ompi_mpi_finalized) { opal_show_help("help-mpi-runtime.txt", "mpi_init: already finalized", true); opal_mutex_unlock(&ompi_mpi_bootstrap_mutex); return MPI_ERR_OTHER; } else if (ompi_mpi_initialized) { opal_show_help("help-mpi-runtime.txt", "mpi_init: invoked multiple times", true); opal_mutex_unlock(&ompi_mpi_bootstrap_mutex); return MPI_ERR_OTHER; } /* Indicate that we have *started* MPI_INIT* */ ompi_mpi_init_started = true; /* Setup enough to check get/set MCA params */ if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) { error = "ompi_mpi_init: opal_init_util failed"; goto error; } /* Convince OPAL to use our naming scheme */ opal_process_name_print = _process_name_print_for_opal; opal_compare_proc = _process_name_compare; opal_convert_string_to_process_name = _convert_string_to_process_name; opal_convert_process_name_to_string = _convert_process_name_to_string; opal_proc_for_name = ompi_proc_for_name; /* Register MCA variables */ if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) { error = "ompi_mpi_init: ompi_register_mca_variables failed"; goto error; } if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) { error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed"; goto error; } /* _After_ opal_init_util() but _before_ orte_init(), we need to set an MCA param that tells libevent that it's ok to use any mechanism in libevent that is available on this platform (e.g., epoll and friends). Per opal/event/event.s, we default to select/poll -- but we know that MPI processes won't be using pty's with the event engine, so it's ok to relax this constraint and let any fd-monitoring mechanism be used. */ ret = mca_base_var_find("opal", "event", "*", "event_include"); if (ret >= 0) { char *allvalue = "all"; /* We have to explicitly "set" the MCA param value here because libevent initialization will re-register the MCA param and therefore override the default. Setting the value here puts the desired value ("all") in different storage that is not overwritten if/when the MCA param is re-registered. This is unless the user has specified a different value for this MCA parameter. Make sure we check to see if the default is specified before forcing "all" in case that is not what the user desires. Note that we do *NOT* set this value as an environment variable, just so that it won't be inherited by any spawned processes and potentially cause unintented side-effects with launching RTE tools... */ mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); } OPAL_TIMING_MSTART((&tm,"time from start to completion of rte_init")); /* if we were not externally started, then we need to setup * some envars so the MPI_INFO_ENV can get the cmd name * and argv (but only if the user supplied a non-NULL argv!), and * the requested thread level */ if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) { asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]); putenv(cmd); } if (NULL == getenv("OMPI_ARGV") && 1 < argc) { char *tmp; tmp = opal_argv_join(&argv[1], ' '); asprintf(&av, "OMPI_ARGV=%s", tmp); free(tmp); putenv(av); } /* open the rte framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) { error = "ompi_rte_base_open() failed"; goto error; } /* no select is required as this is a static framework */ /* Setup RTE */ if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) { error = "ompi_mpi_init: ompi_rte_init failed"; goto error; } ompi_rte_initialized = true; /* check for timing request - get stop time and report elapsed time if so */ OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex")); /* if hwloc is available but didn't get setup for some * reason, do so now */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "Topology init"; goto error; } } /* Register the default errhandler callback */ errtrk.status = OPAL_ERROR; errtrk.active = true; opal_pmix.register_errhandler(NULL, ompi_errhandler_callback, ompi_errhandler_registration_callback, (void*)&errtrk); OMPI_WAIT_FOR_COMPLETION(errtrk.active); if (OPAL_SUCCESS != errtrk.status) { error = "Error handler registration"; ret = errtrk.status; goto error; } /* Figure out the final MPI thread levels. If we were not compiled for support for MPI threads, then don't allow MPI_THREAD_MULTIPLE. Set this stuff up here early in the process so that other components can make decisions based on this value. */ ompi_mpi_thread_level(requested, provided); /* determine the bitflag belonging to the threadlevel_support provided */ memset ( &threadlevel_bf, 0, sizeof(uint8_t)); OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf ); #if OMPI_ENABLE_THREAD_MULTIPLE /* add this bitflag to the modex */ OPAL_MODEX_SEND_STRING(ret, OPAL_PMIX_GLOBAL, "MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)); if (OPAL_SUCCESS != ret) { error = "ompi_mpi_init: modex send thread level"; goto error; } #endif /* If thread support was enabled, then setup OPAL to allow for them. */ if ((OPAL_ENABLE_PROGRESS_THREADS == 1) || (*provided != MPI_THREAD_SINGLE)) { opal_set_using_threads(true); } /* initialize datatypes. This step should be done early as it will * create the local convertor and local arch used in the proc * init. */ if (OMPI_SUCCESS != (ret = ompi_datatype_init())) { error = "ompi_datatype_init() failed"; goto error; } /* Initialize OMPI procs */ if (OMPI_SUCCESS != (ret = ompi_proc_init())) { error = "mca_proc_init() failed"; goto error; } /* Initialize the op framework. This has to be done *after* ddt_init, but befor mca_coll_base_open, since some collective modules (e.g., the hierarchical coll component) may need ops in their query function. */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) { error = "ompi_op_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { error = "ompi_op_base_find_available() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_op_init())) { error = "ompi_op_init() failed"; goto error; } /* Open up MPI-related MCA components */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) { error = "mca_allocator_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_rcache_base_framework, 0))) { error = "mca_rcache_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_mpool_base_framework, 0))) { error = "mca_mpool_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) { error = "mca_bml_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) { error = "mca_pml_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) { error = "mca_coll_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) { error = "ompi_osc_base_open() failed"; goto error; } #if OPAL_ENABLE_FT_CR == 1 if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) { error = "ompi_crcp_base_open() failed"; goto error; } #endif /* In order to reduce the common case for MPI apps (where they don't use MPI-2 IO or MPI-1 topology functions), the io and topo frameworks are initialized lazily, at the first use of relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*), so they are not opened here. */ /* Select which MPI components to use */ if (OMPI_SUCCESS != (ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { error = "mca_mpool_base_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { error = "mca_pml_base_select() failed"; goto error; } /* check for timing request - get stop time and report elapsed time if so */ OPAL_TIMING_MNEXT((&tm,"time to execute modex")); /* exchange connection info - this function may also act as a barrier * if data exchange is required. The modex occurs solely across procs * in our job. If a barrier is required, the "modex" function will * perform it internally */ OPAL_MODEX(); OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier")); /* select buffered send allocator component to be used */ if( OMPI_SUCCESS != (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) { error = "mca_pml_base_bsend_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { error = "mca_coll_base_find_available() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { error = "ompi_osc_base_find_available() failed"; goto error; } #if OPAL_ENABLE_FT_CR == 1 if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) { error = "ompi_crcp_base_select() failed"; goto error; } #endif /* io and topo components are not selected here -- see comment above about the io and topo frameworks being loaded lazily */ /* Initialize each MPI handle subsystem */ /* initialize requests */ if (OMPI_SUCCESS != (ret = ompi_request_init())) { error = "ompi_request_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_message_init())) { error = "ompi_message_init() failed"; goto error; } /* initialize info */ if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) { error = "ompi_info_init() failed"; goto error; } /* initialize error handlers */ if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) { error = "ompi_errhandler_init() failed"; goto error; } /* initialize error codes */ if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) { error = "ompi_mpi_errcode_init() failed"; goto error; } /* initialize internal error codes */ if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) { error = "ompi_errcode_intern_init() failed"; goto error; } /* initialize groups */ if (OMPI_SUCCESS != (ret = ompi_group_init())) { error = "ompi_group_init() failed"; goto error; } /* initialize communicators */ if (OMPI_SUCCESS != (ret = ompi_comm_init())) { error = "ompi_comm_init() failed"; goto error; } /* initialize file handles */ if (OMPI_SUCCESS != (ret = ompi_file_init())) { error = "ompi_file_init() failed"; goto error; } /* initialize windows */ if (OMPI_SUCCESS != (ret = ompi_win_init())) { error = "ompi_win_init() failed"; goto error; } /* initialize attribute meta-data structure for comm/win/dtype */ if (OMPI_SUCCESS != (ret = ompi_attr_init())) { error = "ompi_attr_init() failed"; goto error; } /* identify the architectures of remote procs and setup * their datatype convertors, if required */ if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) { error = "ompi_proc_complete_init failed"; goto error; } /* start PML/BTL's */ ret = MCA_PML_CALL(enable(true)); if( OMPI_SUCCESS != ret ) { error = "PML control failed"; goto error; } /* some btls/mtls require we call add_procs with all procs in the job. * since the btls/mtls have no visibility here it is up to the pml to * convey this requirement */ if (mca_pml_base_requires_world ()) { if (NULL == (procs = ompi_proc_world (&nprocs))) { error = "ompi_proc_get_allocated () failed"; goto error; } } else { /* add all allocated ompi_proc_t's to PML (below the add_procs limit this * behaves identically to ompi_proc_world ()) */ if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) { error = "ompi_proc_get_allocated () failed"; goto error; } } ret = MCA_PML_CALL(add_procs(procs, nprocs)); free(procs); /* If we got "unreachable", then print a specific error message. Otherwise, if we got some other failure, fall through to print a generic message. */ if (OMPI_ERR_UNREACH == ret) { opal_show_help("help-mpi-runtime.txt", "mpi_init:startup:pml-add-procs-fail", true); error = NULL; goto error; } else if (OMPI_SUCCESS != ret) { error = "PML add procs failed"; goto error; } MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm)); /* * Dump all MCA parameters if requested */ if (ompi_mpi_show_mca_params) { ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, nprocs, ompi_process_info.nodename); } /* Do we need to wait for a debugger? */ ompi_rte_wait_for_debugger(); /* Next timing measurement */ OPAL_TIMING_MNEXT((&tm,"time to execute barrier")); /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax * it at a later point */ opal_pmix.fence(NULL, 0); /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ OPAL_TIMING_MNEXT((&tm,"time from barrier to complete mpi_init")); #if OPAL_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take forever between procs in the dynamic code. This will increase CPU utilization for the remainder of MPI_INIT when we are blocking on RTE-level events, but may greatly reduce non-TCP latency. */ opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); #endif /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. Do before the polling change as anyone with a complex wire-up is going to be using the oob. */ if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) { error = "ompi_mpi_do_preconnect_all() failed"; goto error; } /* Setup the dynamic process management (DPM) subsystem */ if (OMPI_SUCCESS != (ret = ompi_dpm_init())) { error = "ompi_dpm_init() failed"; goto error; } /* Determine the overall threadlevel support of all processes in MPI_COMM_WORLD. This has to be done before calling coll_base_comm_select, since some of the collective components e.g. hierarch, might create subcommunicators. The threadlevel requested by all processes is required in order to know which cid allocation algorithm can be used. */ if ( OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) { error = "ompi_mpi_init: ompi_comm_cid_init failed"; goto error; } /* Init coll for the comms. This has to be after dpm_base_select, (since dpm.mark_dyncomm is not set in the communicator creation function else), but before dpm.dyncom_init, since this function might require collective for the CID allocation. */ if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) { error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) { error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed"; goto error; } /* Check whether we have been spawned or not. We introduce that at the very end, since we need collectives, datatypes, ptls etc. up and running here.... */ if (OMPI_SUCCESS != (ret = ompi_dpm_dyn_init())) { error = "ompi_dpm_dyn_init() failed"; goto error; } /* * Startup the Checkpoint/Restart Mech. * Note: Always do this so tools don't hang when * in a non-checkpointable build */ if (OMPI_SUCCESS != (ret = ompi_cr_init())) { error = "ompi_cr_init"; goto error; } /* Undo OPAL calling opal_progress_event_users_increment() during opal_init, to get better latency when not using TCP. Do this *after* dyn_init, as dyn init uses lots of RTE communication and we don't want to hinder the performance of that code. */ opal_progress_event_users_decrement(); /* see if yield_when_idle was specified - if so, use it */ opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle); /* negative value means use default - just don't do anything */ if (ompi_mpi_event_tick_rate >= 0) { opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate); } /* At this point, we are fully configured and in MPI mode. Any communication calls here will work exactly like they would in the user's code. Setup the connections between procs and warm them up with simple sends, if requested */ if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) { error = "ompi_mpiext_init"; goto error; } /* Fall through */ error: if (ret != OMPI_SUCCESS) { /* Only print a message if one was not already printed */ if (NULL != error) { const char *err_msg = opal_strerror(ret); opal_show_help("help-mpi-runtime.txt", "mpi_init:startup:internal-failure", true, "MPI_INIT", "MPI_INIT", error, err_msg, ret); } opal_mutex_unlock(&ompi_mpi_bootstrap_mutex); return ret; } /* Initialize the registered datarep list to be empty */ OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t); /* Initialize the arrays used to store the F90 types returned by the * MPI_Type_create_f90_XXX functions. */ OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */); OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP); OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP); /* All done. Wasn't that simple? */ ompi_mpi_initialized = true; /* Finish last measurement, output results * and clear timing structure */ OPAL_TIMING_MSTOP(&tm); OPAL_TIMING_DELTAS(ompi_enable_timing, &tm); OPAL_TIMING_REPORT(ompi_enable_timing_ext, &tm); OPAL_TIMING_RELEASE(&tm); opal_mutex_unlock(&ompi_mpi_bootstrap_mutex); return MPI_SUCCESS; }
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS, line; MPI_Request *preq; char *tmp_buffer; size_t max_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); max_size = ext * rcount; /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = coll_base_comm_get_reqs(base_module->base_data, size * 2); if (i == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + j * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else if (j == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + i * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2); } /* All done */ return err; }
/* * scatterv_intra * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scatterv_intra(const void *sbuf, const int *scounts, const int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err; char *ptmp; ptrdiff_t lb, extent; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, receive data. */ if (rank != root) { /* Only receive if there is something to receive */ if (rcount > 0) { return MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); } return MPI_SUCCESS; } /* I am the root, loop sending data. */ err = ompi_datatype_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } for (i = 0; i < size; ++i) { ptmp = ((char *) sbuf) + (extent * disps[i]); /* simple optimization */ if (i == rank) { /* simple optimization or a local operation */ if (scounts[i] > 0 && MPI_IN_PLACE != rbuf) { err = ompi_datatype_sndrcv(ptmp, scounts[i], sdtype, rbuf, rcount, rdtype); } } else { /* Only send if there is something to send */ if (scounts[i] > 0) { err = MCA_PML_CALL(send(ptmp, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { return err; } } } } /* All done */ return MPI_SUCCESS; }
/* This routine serves two purposes: * - the allreduce acts as a kind of Barrier, * which avoids, that we have incoming fragments * on the new communicator before everybody has set * up the comm structure. * - some components (e.g. the collective MagPIe component * might want to generate new communicators and communicate * using the new comm. Thus, it can just be called after * the 'barrier'. * * The reason that this routine is in comm_cid and not in * comm.c is, that this file contains the allreduce implementations * which are required, and thus we avoid having duplicate code... */ int ompi_comm_activate ( ompi_communicator_t** newcomm, ompi_communicator_t* comm, ompi_communicator_t* bridgecomm, void* local_leader, void* remote_leader, int mode, int send_first ) { int ret = 0; int ok=0, gok=0; ompi_comm_cid_allredfct* allredfnct; /* Step 1: the barrier, after which it is allowed to * send messages over the new communicator */ switch (mode) { case OMPI_COMM_CID_INTRA: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra; break; case OMPI_COMM_CID_INTER: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_inter; break; case OMPI_COMM_CID_INTRA_BRIDGE: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_bridge; break; case OMPI_COMM_CID_INTRA_OOB: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_oob; break; default: return MPI_UNDEFINED; break; } if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) { /* Initialize the PML stuff in the newcomm */ if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) { goto bail_on_error; } OMPI_COMM_SET_PML_ADDED(*newcomm); } ret = (allredfnct)(&ok, &gok, 1, MPI_MIN, comm, bridgecomm, local_leader, remote_leader, send_first ); if( OMPI_SUCCESS != ret ) { goto bail_on_error; } /** * Check to see if this process is in the new communicator. * * Specifically, this function is invoked by all proceses in the * old communicator, regardless of whether they are in the new * communicator or not. This is because it is far simpler to use * MPI collective functions on the old communicator to determine * some data for the new communicator (e.g., remote_leader) than * to kludge up our own pseudo-collective routines over just the * processes in the new communicator. Hence, *all* processes in * the old communicator need to invoke this function. * * That being said, only processes in the new communicator need to * select a coll module for the new communicator. More * specifically, proceses who are not in the new communicator * should *not* select a coll module -- for example, * ompi_comm_rank(newcomm) returns MPI_UNDEFINED for processes who * are not in the new communicator. This can cause errors in the * selection / initialization of a coll module. Plus, it's * wasteful -- processes in the new communicator will end up * freeing the new communicator anyway, so we might as well leave * the coll selection as NULL (the coll base comm unselect code * handles that case properly). */ if (MPI_UNDEFINED == (*newcomm)->c_local_group->grp_my_rank) { return OMPI_SUCCESS; } /* Let the collectives components fight over who will do collective on this new comm. */ if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(*newcomm))) { goto bail_on_error; } /* For an inter communicator, we have to deal with the potential * problem of what is happening if the local_comm that we created * has a lower CID than the parent comm. This is not a problem * as long as the user calls MPI_Comm_free on the inter communicator. * However, if the communicators are not freed by the user but released * by Open MPI in MPI_Finalize, we walk through the list of still available * communicators and free them one by one. Thus, local_comm is freed before * the actual inter-communicator. However, the local_comm pointer in the * inter communicator will still contain the 'previous' address of the local_comm * and thus this will lead to a segmentation violation. In order to prevent * that from happening, we increase the reference counter local_comm * by one if its CID is lower than the parent. We cannot increase however * its reference counter if the CID of local_comm is larger than * the CID of the inter communicators, since a regular MPI_Comm_free would * leave in that the case the local_comm hanging around and thus we would not * recycle CID's properly, which was the reason and the cause for this trouble. */ if ( OMPI_COMM_IS_INTER(*newcomm)) { if ( OMPI_COMM_CID_IS_LOWER(*newcomm, comm)) { OMPI_COMM_SET_EXTRA_RETAIN (*newcomm); OBJ_RETAIN (*newcomm); } } return OMPI_SUCCESS; bail_on_error: OBJ_RELEASE(*newcomm); *newcomm = MPI_COMM_NULL; return ret; }
/* Arguments not used in this implementation: * - send_first */ static int ompi_comm_allreduce_intra_bridge (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, ompi_communicator_t *comm, ompi_communicator_t *bcomm, void* lleader, void* rleader, int send_first ) { int *tmpbuf=NULL; int local_rank; int i; int rc; int local_leader, remote_leader; local_leader = (*((int*)lleader)); remote_leader = (*((int*)rleader)); if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op && &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op != op ) { return MPI_ERR_OP; } local_rank = ompi_comm_rank ( comm ); tmpbuf = (int *) malloc ( count * sizeof(int)); if ( NULL == tmpbuf ) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /* Intercomm_create */ rc = comm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT, op, comm, comm->c_coll.coll_allreduce_module ); if ( OMPI_SUCCESS != rc ) { goto exit; } if (local_rank == local_leader ) { MPI_Request req; rc = MCA_PML_CALL(irecv ( outbuf, count, MPI_INT, remote_leader, OMPI_COMM_ALLREDUCE_TAG, bcomm, &req)); if ( OMPI_SUCCESS != rc ) { goto exit; } rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, remote_leader, OMPI_COMM_ALLREDUCE_TAG, MCA_PML_BASE_SEND_STANDARD, bcomm)); if ( OMPI_SUCCESS != rc ) { goto exit; } rc = ompi_request_wait_all ( 1, &req, MPI_STATUS_IGNORE); if ( OMPI_SUCCESS != rc ) { goto exit; } if ( &ompi_mpi_op_max.op == op ) { for ( i = 0 ; i < count; i++ ) { if (tmpbuf[i] > outbuf[i]) { outbuf[i] = tmpbuf[i]; } } } else if ( &ompi_mpi_op_min.op == op ) { for ( i = 0 ; i < count; i++ ) { if (tmpbuf[i] < outbuf[i]) { outbuf[i] = tmpbuf[i]; } } } else if ( &ompi_mpi_op_sum.op == op ) { for ( i = 0 ; i < count; i++ ) { outbuf[i] += tmpbuf[i]; } } else if ( &ompi_mpi_op_prod.op == op ) { for ( i = 0 ; i < count; i++ ) { outbuf[i] *= tmpbuf[i]; } } } rc = comm->c_coll.coll_bcast ( outbuf, count, MPI_INT, local_leader, comm, comm->c_coll.coll_bcast_module ); exit: if (NULL != tmpbuf ) { free (tmpbuf); } return (rc); }
/* * scatterv_inter * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, total, size_local; int *counts=NULL,*displace=NULL; char *ptmp=NULL; MPI_Aint incr; MPI_Aint extent; MPI_Aint lb; ompi_datatype_t *ndtype; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_remote_size(comm); size_local = ompi_comm_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { if(0 == rank) { /* local root recieves the counts from the root */ counts = (int *)malloc(sizeof(int) * size_local); err = MCA_PML_CALL(recv(counts, size_local, MPI_INT, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } /* calculate the whole buffer size and recieve it from root */ err = ompi_ddt_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } incr = 0; for (i = 0; i < size_local; i++) { incr = incr + extent*counts[i]; } if ( incr > 0 ) { ptmp = (char*)malloc(incr); if (NULL == ptmp) { return OMPI_ERR_OUT_OF_RESOURCE; } } total = 0; for (i = 0; i < size_local; i++) { total = total + counts[i]; } err = MCA_PML_CALL(recv(ptmp, total, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } /* set the local displacement i.e. no displacements here */ displace = (int *)malloc(sizeof(int) * size_local); displace[0] = 0; for (i = 1; i < size_local; i++) { displace[i] = displace[i-1] + counts[i-1]; } } /* perform the scatterv locally */ err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace, rdtype, rbuf, rcount, rdtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatterv_module); if (OMPI_SUCCESS != err) { return err; } if (NULL != ptmp) { free(ptmp); } if (NULL != displace) { free(displace); } if (NULL != counts) { free(counts); } } else { err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } ompi_ddt_create_indexed(size,scounts,disps,sdtype,&ndtype); ompi_ddt_commit(&ndtype); err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } ompi_ddt_destroy(&ndtype); } /* All done */ return err; }
int mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL; int i, j, l,cycles=0, local_cycles=0, *current_index=NULL; int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0; int **blocklen_per_process=NULL, *iovec_count_per_process=NULL; int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0; int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL; MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0; MPI_Aint *memory_displacements=NULL; int bytes_to_read_in_cycle=0; size_t max_data=0, bytes_per_cycle=0; uint32_t iov_count=0, iov_index=0; struct iovec *decoded_iov=NULL, *iov=NULL; mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL; mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL; char *global_buf=NULL, *receive_buf=NULL; int blocklen[3] = {1, 1, 1}; int static_num_io_procs=1; OPAL_PTRDIFF_TYPE d[3], base; ompi_datatype_t *types[3]; ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, recv_req=NULL; int my_aggregator=-1; bool recvbuf_is_contiguous=false; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; mca_common_ompio_print_entry nentry; #endif #if DEBUG_ON MPI_Aint gc_in; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { recvbuf_is_contiguous = true; } /* In case the data is not contigous in memory, decode it into an iovec */ if (!recvbuf_is_contiguous ) { fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &static_num_io_procs ); fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, static_num_io_procs, max_data); my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index]; /* printf("max_data %ld\n", max_data); */ ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh, max_data, &iov, &iov_size); if (ret != OMPI_SUCCESS){ goto exit; } if ( iov_size > 0 ) { local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (j=0; j < iov_size; j++){ local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[j].iov_base; local_iov_array[j].length = (size_t)iov[j].iov_len; local_iov_array[j].process_id = fh->f_rank; } } else { /* Allocate at least one element to correctly create the derived data type */ local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0; local_iov_array[0].length = (size_t) 0; local_iov_array[0].process_id = fh->f_rank; } d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0]; d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length; d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id; base = d[0]; for (i=0 ; i<3 ; i++) { d[i] -= base; } /* io_array datatype for using in communication*/ types[0] = &ompi_mpi_long.dt; types[1] = &ompi_mpi_long.dt; types[2] = &ompi_mpi_int.dt; ompi_datatype_create_struct (3, blocklen, d, types, &io_array_type); ompi_datatype_commit (&io_array_type); /* #########################################################*/ fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle); local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles, &cycles, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int )); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == bytes_remaining){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == current_index){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == iovec_count_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == displs){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_allgather_array (&iov_size, 1, MPI_INT, iovec_count_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { displs[0] = 0; global_iov_count = iovec_count_per_process[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { global_iov_count += iovec_count_per_process[i]; displs[i] = displs[i-1] + iovec_count_per_process[i-1]; } } if ( (my_aggregator == fh->f_rank) && (global_iov_count > 0 )) { global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count * sizeof(mca_fcoll_static_local_io_array)); if (NULL == global_iov_array){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_gatherv_array (local_iov_array, iov_size, io_array_type, global_iov_array, iovec_count_per_process, displs, io_array_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ fprintf(stderr,"global_iov_array gather error!\n"); goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if ( ( my_aggregator == fh->f_rank) && ( global_iov_count > 0 )) { sorted = (int *)malloc (global_iov_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } read_local_heap_sort (global_iov_array, global_iov_count, sorted); send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for ( i=0; i<fh->f_procs_per_group; i++ ) { sendtype[i] = MPI_DATATYPE_NULL; } if (NULL == bytes_per_process){ bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } #if DEBUG_ON if (my_aggregator == fh->f_rank) { for (gc_in=0; gc_in<global_iov_count; gc_in++){ printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n", global_iov_array[sorted[gc_in]].process_id, gc_in, global_iov_array[sorted[gc_in]].offset, gc_in, global_iov_array[sorted[gc_in]].length); } } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++){ if (my_aggregator == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if ( NULL != sendtype ) { for ( i=0; i<fh->f_procs_per_group; i++ ) { if ( MPI_DATATYPE_NULL != sendtype[i] ) { ompi_datatype_destroy (&sendtype[i] ); sendtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } if (index < local_cycles ) { if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) { bytes_to_read_in_cycle = max_data - position; } else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) { bytes_to_read_in_cycle = max_data; } else { bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group; } } else { bytes_to_read_in_cycle = 0; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif fcoll_base_coll_gather_array (&bytes_to_read_in_cycle, 1, MPI_INT, bytes_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (recvbuf_is_contiguous ) { receive_buf = &((char*)buf)[position]; } else if (bytes_to_read_in_cycle) { receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char)); if ( NULL == receive_buf){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = MCA_PML_CALL(irecv(receive_buf, bytes_to_read_in_cycle, MPI_BYTE, my_aggregator, 123, fh->f_comm, &recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { for (i=0;i<fh->f_procs_per_group; i++){ while (bytes_per_process[i] > 0){ /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n", index, i, bytes_per_process[i], i, bytes_remaining[i]);*/ if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id, fh) == i){ /* current id owns this entry!*/ if (bytes_remaining[i]){ /*Remaining bytes in the current entry of the global offset array*/ if (bytes_remaining[i] <= bytes_per_process[i]){ blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); bytes_per_process[i] -= bytes_remaining[i]; blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_remaining[i] = 0; /* This entry has been used up, we need to move to the next entry of this process and make current_index point there*/ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } continue; } else{ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); bytes_remaining[i] -= bytes_per_process[i]; bytes_per_process[i] = 0; break; } } else{ if (bytes_per_process[i] < global_iov_array[sorted[current_index[i]]].length){ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; bytes_remaining[i] = global_iov_array[sorted[current_index[i]]].length - bytes_per_process[i]; bytes_per_process[i] = 0; break; } else { blocklen_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].length; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_per_process[i] -= global_iov_array[sorted[current_index[i]]].length; current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } } } } else{ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ bytes_per_process[i] = 0; /* no more entries left to service this request*/ continue; } } } } entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group;i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ entries_per_aggregator++; #if DEBUG_ON printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_fcoll_static_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } temp_index=0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i]; j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]], disp_index[i]); } #endif fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); if (my_aggregator == fh->f_rank){ for (i=0 ; i<global_count/4 ; i++) printf (" READ %d \n",((int *)global_buf)[i]); } #endif temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group; i++){ send_req[i] = MPI_REQUEST_NULL; ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } /* if ( my_aggregator == fh->f_rank ) */ ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif position += bytes_to_read_in_cycle; if (!recvbuf_is_contiguous) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_to_read_in_cycle; while (remaining && (iov_count > iov_index)){ mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else{ memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = static_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){ mca_common_ompio_register_print_entry(fh->f_coll_read_time, nentry); } #endif exit: if (NULL != decoded_iov){ free(decoded_iov); decoded_iov = NULL; } if (NULL != displs){ free(displs); displs = NULL; } if (NULL != iovec_count_per_process){ free(iovec_count_per_process); iovec_count_per_process=NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != global_iov_array){ free(global_iov_array); global_iov_array=NULL; } if (my_aggregator == fh->f_rank) { for(l=0;l<fh->f_procs_per_group;l++){ if (blocklen_per_process) { free(blocklen_per_process[l]); } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != disp_index){ free(disp_index); disp_index =NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } if(NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining = NULL; } if(NULL != current_index){ free(current_index); current_index = NULL; } if (NULL != blocklen_per_process){ free(blocklen_per_process); blocklen_per_process =NULL; } if (NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if ( !recvbuf_is_contiguous ) { if (NULL != receive_buf){ free(receive_buf); receive_buf=NULL; } } if (NULL != global_buf) { free(global_buf); global_buf = NULL; } if (NULL != sorted) { free(sorted); sorted = NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } return ret; }
static int mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; size_t max_size, rdtype_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); ompi_datatype_type_size(rdtype, &rdtype_size); /* If only one process, we're done. */ if (1 == size || 0 == rdtype_size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { size_t size = ext * rcounts[i]; max_size = size > max_size ? size : max_size; } /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = tuned_module->tuned_data->mcct_reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && rcounts[i]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } /* Free the requests. */ mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2); } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); /* All done */ return err; }
int mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint position = 0; MPI_Aint total_bytes = 0; /* total bytes to be read */ MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total read in each cycle by each process*/ int index = 0, ret=OMPI_SUCCESS; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current value from total_bytes_per_process */ int *sorted_file_offsets=NULL, entries_per_aggregator=0; int bytes_received = 0; int blocks = 0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; char *receive_buf = NULL; MPI_Aint *memory_displacements=NULL; /* global iovec at the readers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index=0, temp_index=0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL; char *global_buf = NULL; MPI_Aint global_count = 0; local_io_array *file_offsets_for_agg=NULL; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0; int *bytes_per_process = NULL; MPI_Aint *total_bytes_per_process = NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; mca_io_ompio_print_entry nentry; #endif // if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /************************************************************************** ** In case the data is not contigous in memory, decode it into an iovec ** **************************************************************************/ if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &dynamic_num_io_procs); ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** Generate the File offsets/lengths corresponding to this write *** ********************************************************************/ ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } /* #########################################################*/ /************************************************************* *** ALLGather the File View information at all processes *** *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } displs = (int*)malloc (fh->f_procs_per_group*sizeof(int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*)malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } ret = fh->f_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } /* sort it */ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array) { free (local_iov_array); local_iov_array = NULL; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<total_fview_count ; i++) { printf("%d: OFFSET: %p LENGTH: %d\n", fh->f_rank, global_iov_array[sorted[i]].iov_base, global_iov_array[sorted[i]].iov_len); } } #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } /* * Calculate how many bytes are read in each cycle */ fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle); cycles = ceil((double)total_bytes/bytes_per_cycle); n = 0; bytes_remaining = 0; current_index = 0; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { /* Getting ready for next cycle Initializing and freeing buffers */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == sendtype){ sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (cycles-1 == index) { bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_read_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %d**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /* Calculate how much data will be contributed in this cycle by each process*/ bytes_received = 0; while (bytes_to_read_in_cycle) { blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { if (bytes_remaining <= bytes_to_read_in_cycle) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_remaining; } current_index ++; bytes_to_read_in_cycle -= bytes_remaining; bytes_remaining = 0; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } continue; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining -= bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } } else { if (bytes_to_read_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += global_iov_array[sorted[current_index]].iov_len; } bytes_to_read_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /* Calculate the displacement on where to put the data and allocate the recieve buffer (global_buf) */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif /********************************************************** ******************** DONE READING ************************ *********************************************************/ temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group;i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif } /********************************************************** ********* Scatter the Data from the readers ************** *********************************************************/ if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { receive_buf = &((char*)buf)[position]; } else if (bytes_received) { /* allocate a receive buffer and copy the data that needs to be received into it in case the data is non-contigous in memory */ receive_buf = malloc (bytes_received); if (NULL == receive_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif recv_req = (MPI_Request *) malloc (sizeof (MPI_Request)); if (NULL == recv_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = MCA_PML_CALL(irecv(receive_buf, bytes_received, MPI_BYTE, fh->f_procs_in_group[fh->f_aggregator_index], 123, fh->f_comm, recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } position += bytes_received; /* If data is not contigous in memory, copy the data from the receive buffer into the buffer passed in */ if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_received; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for (i = 0; i < fh->f_procs_per_group; i++) ompi_datatype_destroy(sendtype+i); if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){ fh->f_register_print_entry(READ_PRINT_QUEUE, nentry); } #endif exit: if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != displs) { free (displs); displs = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if ( NULL != blocklen_per_process){ for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } } free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ for (l=0; i<fh->f_procs_per_group; l++){ if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } free(displs_per_process); displs_per_process = NULL; } } return ret; }
/* * Linear functions are copied from the basic coll module. For * some small number of nodes and/or small data sizes they are just as * fast as tuned/tree based segmenting operations and as such may be * selected by the decision functions. These are copied into this module * due to the way we select modules in V1. i.e. in V2 we will handle this * differently and so will not have to duplicate code. * GEF Oct05 after asking Jeff. */ int ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, err, nreqs; char *psnd, *prcv; ptrdiff_t sext, rext; MPI_Request *preq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoallv_intra_basic_linear rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); /* Simple optimization - handle send to self first */ psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext; if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { return err; } } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Now, initiate all send/recv to/from others. */ nreqs = 0; preq = data->mcct_reqs; /* Post all receives first */ for (i = 0; i < size; ++i) { if (i == rank || 0 == rcounts[i]) { continue; } prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Now post all sends */ for (i = 0; i < size; ++i) { if (i == rank || 0 == scounts[i]) { continue; } psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, data->mcct_reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, data->mcct_reqs, MPI_STATUSES_IGNORE); /* Free the requests. */ ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; }
static int two_phase_exchage_data(mca_io_ompio_file_t *fh, void *buf, char *write_buf, struct iovec *offset_length, int *send_size,int *start_pos, int *recv_size, OMPI_MPI_OFFSET_TYPE off, OMPI_MPI_OFFSET_TYPE size, int *count, int *partial_recv, int *sent_to_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, mca_io_ompio_access_array_t *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, int *buf_idx,MPI_Aint buftype_extent, int striping_unit, int *aggregator_list, int *hole){ int *tmp_len=NULL, sum, *srt_len=NULL, nprocs_recv, nprocs_send, k,i,j; int ret=OMPI_SUCCESS; MPI_Request *requests=NULL, *send_req=NULL; MPI_Datatype *recv_types=NULL; OMPI_MPI_OFFSET_TYPE *srt_off=NULL; char **send_buf = NULL; ret = fh->f_comm->c_coll.coll_alltoall (recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fh->f_comm, fh->f_comm->c_coll.coll_alltoall_module); if ( OMPI_SUCCESS != ret ){ return ret; } nprocs_recv = 0; for (i=0;i<fh->f_size;i++){ if (recv_size[i]){ nprocs_recv++; } } recv_types = (MPI_Datatype *) malloc (( nprocs_recv + 1 ) * sizeof(MPI_Datatype *)); if ( NULL == recv_types ){ return OMPI_ERR_OUT_OF_RESOURCE; } tmp_len = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == tmp_len ) { return OMPI_ERR_OUT_OF_RESOURCE; } j = 0; for (i=0;i<fh->f_size;i++){ if (recv_size[i]) { if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; tmp_len[i] = others_req[i].lens[k]; others_req[i].lens[k] = partial_recv[i]; } MPI_Type_hindexed(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types+j); MPI_Type_commit(recv_types+j); j++; } } sum = 0; for (i=0;i<fh->f_size;i++) sum += count[i]; srt_off = (OMPI_MPI_OFFSET_TYPE *) malloc((sum+1)*sizeof(OMPI_MPI_OFFSET_TYPE)); if ( NULL == srt_off ){ return OMPI_ERR_OUT_OF_RESOURCE; } srt_len = (int *) malloc((sum+1)*sizeof(int)); if ( NULL == srt_len ) { return OMPI_ERR_OUT_OF_RESOURCE; } two_phase_heap_merge(others_req, count, srt_off, srt_len, start_pos, fh->f_size,fh->f_rank, nprocs_recv, sum); for (i=0; i<fh->f_size; i++) if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; others_req[i].lens[k] = tmp_len[i]; } if ( NULL != tmp_len ){ free(tmp_len); } *hole = 0; if (off != srt_off[0]){ *hole = 1; } else{ for (i=1;i<sum;i++){ if (srt_off[i] <= srt_off[0] + srt_len[0]){ int new_len = srt_off[i] + srt_len[i] - srt_off[0]; if(new_len > srt_len[0]) srt_len[0] = new_len; } else break; } if (i < sum || size != srt_len[0]) *hole = 1; } if ( NULL != srt_off ){ free(srt_off); } if ( NULL != srt_len ){ free(srt_len); } if (nprocs_recv){ if (*hole){ if (off > 0){ fh->f_io_array = (mca_io_ompio_io_array_t *)malloc (sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset =(IOVBASE_TYPE *)(intptr_t)off; fh->f_num_of_io_entries = 1; fh->f_io_array[0].length = size; fh->f_io_array[0].memory_address = write_buf; if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) { opal_output(1, "READ FAILED\n"); return OMPI_ERROR; } } } fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } } } nprocs_send = 0; for (i=0; i <fh->f_size; i++) if (send_size[i]) nprocs_send++; #if DEBUG_ON printf("%d : nprocs_send : %d\n", fh->f_rank,nprocs_send); #endif requests = (MPI_Request *) malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); if ( NULL == requests ){ return OMPI_ERR_OUT_OF_RESOURCE; } j = 0; for (i=0; i<fh->f_size; i++) { if (recv_size[i]) { ret = MCA_PML_CALL(irecv(MPI_BOTTOM, 1, recv_types[j], i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); if ( OMPI_SUCCESS != ret ){ return ret; } j++; } } send_req = requests + nprocs_recv; if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { j = 0; for (i=0; i <fh->f_size; i++) if (send_size[i]) { ret = MCA_PML_CALL(isend(((char *) buf) + buf_idx[i], send_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, send_req+j)); if ( OMPI_SUCCESS != ret ){ return ret; } j++; buf_idx[i] += send_size[i]; } } else if(nprocs_send && (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY))){ send_buf = (char **) malloc(fh->f_size*sizeof(char*)); if ( NULL == send_buf ){ return OMPI_ERR_OUT_OF_RESOURCE; } for (i=0; i < fh->f_size; i++){ if (send_size[i]) { send_buf[i] = (char *) malloc(send_size[i]); if ( NULL == send_buf[i] ){ return OMPI_ERR_OUT_OF_RESOURCE; } } } ret = two_phase_fill_send_buffer(fh, buf,flat_buf, send_buf, offset_length, send_size, send_req,sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent, striping_unit, aggregator_list); if ( OMPI_SUCCESS != ret ){ return ret; } } for (i=0; i<nprocs_recv; i++) MPI_Type_free(recv_types+i); free(recv_types); ret = ompi_request_wait_all (nprocs_send+nprocs_recv, requests, MPI_STATUS_IGNORE); if ( NULL != requests ){ free(requests); } return ret; }
static int two_phase_fill_send_buffer(mca_io_ompio_file_t *fh, void *buf, Flatlist_node *flat_buf, char **send_buf, struct iovec *offset_length, int *send_size, MPI_Request *requests, int *sent_to_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, MPI_Aint buftype_extent, int striping_unit, int *aggregator_list){ int i, p, flat_buf_idx; OMPI_MPI_OFFSET_TYPE flat_buf_sz, size_in_buf, buf_incr, size; int jj, n_buftypes, ret=OMPI_SUCCESS; OMPI_MPI_OFFSET_TYPE off, len, rem_len, user_buf_idx; for (i=0; i < fh->f_size; i++) { send_buf_idx[i] = curr_to_proc[i] = 0; done_to_proc[i] = sent_to_proc[i]; } jj = 0; user_buf_idx = flat_buf->indices[0]; flat_buf_idx = 0; n_buftypes = 0; flat_buf_sz = flat_buf->blocklens[0]; for (i=0; i<contig_access_count; i++) { off = (OMPI_MPI_OFFSET_TYPE)(intptr_t)offset_length[i].iov_base; rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len; while (rem_len != 0) { len = rem_len; p = mca_fcoll_two_phase_calc_aggregator(fh, off, min_st_offset, &len, fd_size, fd_start, fd_end, striping_unit, mca_fcoll_two_phase_num_io_procs, aggregator_list); if (send_buf_idx[p] < send_size[p]) { if (curr_to_proc[p]+len > done_to_proc[p]) { if (done_to_proc[p] > curr_to_proc[p]) { size = OMPIO_MIN(curr_to_proc[p] + len - done_to_proc[p], send_size[p]-send_buf_idx[p]); buf_incr = done_to_proc[p] - curr_to_proc[p]; TWO_PHASE_BUF_INCR buf_incr = curr_to_proc[p] + len - done_to_proc[p]; curr_to_proc[p] = done_to_proc[p] + size; TWO_PHASE_BUF_COPY } else { size = OMPIO_MIN(len,send_size[p]-send_buf_idx[p]); buf_incr = len; curr_to_proc[p] += size; TWO_PHASE_BUF_COPY } if (send_buf_idx[p] == send_size[p]) { ret = MCA_PML_CALL(isend(send_buf[p], send_size[p], MPI_BYTE, p, fh->f_rank+p+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, requests+jj)); if ( OMPI_SUCCESS != ret ){ return ret; } jj++; } } else { curr_to_proc[p] += len; buf_incr = len; TWO_PHASE_BUF_INCR } }
static int two_phase_exchange_data(mca_io_ompio_file_t *fh, void *buf, struct iovec *offset_len, int *send_size, int *start_pos, int *recv_size, int *count, int *partial_send, int *recd_from_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, mca_io_ompio_access_array_t *others_req, int iter, size_t *buf_idx, MPI_Aint buftype_extent, int striping_unit, int *aggregator_list) { int i=0, j=0, k=0, tmp=0, nprocs_recv=0, nprocs_send=0; int ret = OMPI_SUCCESS; char **recv_buf = NULL; MPI_Request *requests=NULL; MPI_Datatype send_type; #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_alltoall (send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fh->f_comm, fh->f_comm->c_coll.coll_alltoall_module); if ( OMPI_SUCCESS != ret ){ return ret; } #if DEBUG for (i=0; i<fh->f_size; i++){ printf("%d: RS[%d]: %d\n", fh->f_rank, i, recv_size[i]); } #endif nprocs_recv = 0; for (i=0; i < fh->f_size; i++) if (recv_size[i]) nprocs_recv++; nprocs_send = 0; for (i=0; i< fh->f_size; i++) if (send_size[i]) nprocs_send++; requests = (MPI_Request *) malloc((nprocs_send+nprocs_recv+1) * sizeof(MPI_Request)); if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { j = 0; for (i=0; i < fh->f_size; i++){ if (recv_size[i]){ ret = MCA_PML_CALL(irecv(((char *) buf)+ buf_idx[i], recv_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); if ( OMPI_SUCCESS != ret ){ return ret; } j++; buf_idx[i] += recv_size[i]; } } } else{ recv_buf = (char **)malloc(fh->f_size * sizeof(char *)); if (NULL == recv_buf){ return OMPI_ERR_OUT_OF_RESOURCE; } for (i=0; i < fh->f_size; i++) if(recv_size[i]) recv_buf[i] = (char *) malloc (recv_size[i] * sizeof(char)); j = 0; for(i=0; i<fh->f_size; i++) if (recv_size[i]) { ret = MCA_PML_CALL(irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); j++; } } j = 0; for (i = 0; i< fh->f_size; i++){ if (send_size[i]){ if (partial_send[i]){ k = start_pos[i] + count[i] - 1; tmp = others_req[i].lens[k]; others_req[i].lens[k] = partial_send[i]; } MPI_Type_hindexed(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, &send_type); MPI_Type_commit(&send_type); ret = MCA_PML_CALL(isend(MPI_BOTTOM, 1, send_type, i, fh->f_rank+i+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, requests+nprocs_recv+j)); MPI_Type_free(&send_type); if (partial_send[i]) others_req[i].lens[k] = tmp; j++; } } if (nprocs_recv) { ret = ompi_request_wait_all(nprocs_recv, requests, MPI_STATUS_IGNORE); if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { two_phase_fill_user_buffer(fh, buf, flat_buf, recv_buf, offset_len, (unsigned *)recv_size, requests, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buftype_extent, striping_unit, aggregator_list); } } ret = ompi_request_wait_all(nprocs_send, requests+nprocs_recv, MPI_STATUS_IGNORE); if (NULL != requests){ free(requests); requests = NULL; } if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)){ for (i=0; i< fh->f_size; i++){ if (recv_size[i]){ free(recv_buf[i]); } } free(recv_buf); } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += (end_rcomm_time - start_rcomm_time); #endif return ret; }
static int send_nb( dte_data_representation_t data, uint32_t count, void *buffer, rte_ec_handle_t ec_h, rte_grp_handle_t grp_h, uint32_t tag, rte_request_handle_t *req) { ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; #if RTE_DEBUG assert(ec_h.group == grp_h); #endif if (! ec_h.handle) { fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: " "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank); return 1; } if (HCOL_DTE_IS_INLINE(data)){ /*do inline nb recv*/ int rc; size_t size; ompi_request_t *ompi_req; ompi_free_list_item_t *item; if (!buffer && !HCOL_DTE_IS_ZERO(data)) { fprintf(stderr, "***Error in hcolrte_rml_send_nb: buffer pointer is NULL" " for non DTE_ZERO INLINE data representation\n"); return 1; } size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8; HCOL_VERBOSE(30,"PML_ISEND: dest = %d: buf = %p: size = %u: comm = %p", ec_h.rank, buffer, (unsigned int)size, (void *)comm); if (MCA_PML_CALL(isend(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank, tag,MCA_PML_BASE_SEND_STANDARD,comm,&ompi_req))) { return 1; } req->data = (void *)ompi_req; req->status = HCOLRTE_REQUEST_ACTIVE; }else{ int total_entries_number; int i; unsigned int j; void *buf; uint64_t len; int repeat_count; struct dte_struct_t * repeat; if (NULL != buffer) { /* We have a full data description & buffer pointer simultaneously. It is ambiguous. Throw a warning since the user might have made a mistake with data reps*/ fprintf(stderr,"Warning: buffer_pointer != NULL for NON-inline data representation: buffer_pointer is ignored.\n"); } total_entries_number = count_total_dte_repeat_entries(&data); repeat = data.rep.general_rep->data_representation.data->repeat; repeat_count = data.rep.general_rep->data_representation.data->repeat_count; for (i=0; i< repeat_count; i++){ for (j=0; j<repeat[i].n_elements; j++){ char *repeat_unit = (char *)&repeat[i]; buf = (void *)(repeat_unit+repeat[i].elements[j].base_offset); len = repeat[i].elements[j].packed_size; send_nb(DTE_BYTE,len,buf,ec_h,grp_h,tag,req); } } } return HCOLL_SUCCESS; }
int mca_fcoll_dynamic_file_write_all (ompio_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint total_bytes_written = 0; /* total bytes that have been written*/ MPI_Aint total_bytes = 0; /* total bytes to be written */ MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total written in each cycle by each process*/ int index = 0; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current value from total_bytes_per_process */ int bytes_sent = 0, ret =0; int blocks=0, entries_per_aggregator=0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; char *send_buf = NULL; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; /* global iovec at the writers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0, temp_pindex; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index = 0, temp_index=0; char *global_buf = NULL; MPI_Aint global_count = 0; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL, *sorted_file_offsets=NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0, datatype_size = 0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; ompi_datatype_t **recvtype = NULL; MPI_Aint *total_bytes_per_process = NULL; MPI_Request send_req=NULL, *recv_req=NULL; int my_aggregator=-1; bool sendbuf_is_contiguous = false; size_t ftype_size; ptrdiff_t ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; mca_common_ompio_print_entry nentry; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (ptrdiff_t) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { sendbuf_is_contiguous = true; } if (! sendbuf_is_contiguous ) { ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret ){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators")); if ( OMPI_ERR_MAX == dynamic_num_io_procs ) { ret = OMPI_ERROR; goto exit; } ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } my_aggregator = fh->f_procs_in_group[0]; /************************************************************************** ** 2. Determine the total amount of data to be written **************************************************************************/ total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** 3. Generate the local offsets/lengths array corresponding to *** this write operation ********************************************************************/ ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } #if DEBUG_ON for (i=0 ; i<local_count ; i++) { printf("%d: OFFSET: %d LENGTH: %ld\n", fh->f_rank, local_iov_array[i].iov_base, local_iov_array[i].iov_len); } #endif /************************************************************* *** 4. Allgather the offset/lengths array from all processes *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif displs = (int*) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON printf("total_fview_count : %d\n", total_fview_count); if (my_aggregator == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*) malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif /**************************************************************************************** *** 5. Sort the global offset/lengths list based on the offsets. *** The result of the sort operation is the 'sorted', an integer array, *** which contains the indexes of the global_iov_array based on the offset. *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset *** in the file, and that one is followed by global_iov_array[z].offset, than *** sorted[0] = x, sorted[1]=y and sorted[2]=z; ******************************************************************************************/ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if (NULL != displs){ free(displs); displs=NULL; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { uint32_t tv=0; for (tv=0 ; tv<total_fview_count ; tv++) { printf("%d: OFFSET: %lld LENGTH: %ld\n", fh->f_rank, global_iov_array[sorted[tv]].iov_base, global_iov_array[sorted[tv]].iov_len); } } #endif /************************************************************* *** 6. Determine the number of cycles required to execute this *** operation *************************************************************/ bytes_per_cycle = fh->f_bytes_per_agg; cycles = ceil((double)total_bytes/bytes_per_cycle); if (my_aggregator == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request)); if ( NULL == recv_req ) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } global_buf = (char *) malloc (bytes_per_cycle); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(l=0;l<fh->f_procs_per_group;l++){ recvtype[l] = MPI_DATATYPE_NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif n = 0; bytes_remaining = 0; current_index = 0; for (index = 0; index < cycles; index++) { /********************************************************************** *** 7a. Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (my_aggregator == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } fh->f_num_of_io_entries = 0; if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); recvtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; free(blocklen_per_process[l]); free(displs_per_process[l]); blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } /* (my_aggregator == fh->f_rank */ /************************************************************************** *** 7b. Determine the number of bytes to be actually written in this cycle **************************************************************************/ if (cycles-1 == index) { bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_write_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ #if DEBUG_ON printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle, index); #endif /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ bytes_sent = 0; /* The blocklen and displs calculation only done at aggregators!*/ while (bytes_to_write_in_cycle) { /* This next block identifies which process is the holder ** of the sorted[current_index] element; */ blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { /* Finish up a partially used buffer from the previous cycle */ if (bytes_remaining <= bytes_to_write_in_cycle) { /* The data fits completely into the block */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); /* In this cases the length is consumed so allocating for next displacement and blocklength*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_remaining; } current_index ++; bytes_to_write_in_cycle -= bytes_remaining; bytes_remaining = 0; continue; } else { /* the remaining data from the previous cycle is larger than the bytes_to_write_in_cycle, so we have to segment again */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining -= bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } } else { /* No partially used entry available, have to start a new one */ if (bytes_to_write_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { /* This entry has more data than we can sendin one cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } else { /* Next data entry is less than bytes_to_write_in_cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t) global_iov_array[sorted[current_index]].iov_base; /*realloc for next blocklength and assign this displacement and check for next displs as the total length of this entry has been consumed!*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += global_iov_array[sorted[current_index]].iov_len; } bytes_to_write_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /************************************************************************* *** 7d. Calculate the displacement on where to put the data and allocate *** the recieve buffer (global_buf) *************************************************************************/ if (my_aggregator == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } #if DEBUG_ON printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index, bytes_sent); printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator); #endif if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } } else{ continue; } /* Sort the displacements for each aggregator*/ local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); /*create contiguous memory displacements based on blocklens on the same displs array and map it to this aggregator's actual file-displacements (this is in the io-array created above)*/ memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Now update the displacements array with memory offsets*/ global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); } } } printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]]); } printf("%d : global_count : %ld, bytes_sent : %d\n", fh->f_rank,global_count, bytes_sent); #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif /************************************************************************* *** 7e. Perform the actual communication *************************************************************************/ for (i=0;i<fh->f_procs_per_group; i++) { recv_req[i] = MPI_REQUEST_NULL; if ( 0 < disp_index[i] ) { ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); opal_datatype_type_size(&recvtype[i]->super, &datatype_size); if (datatype_size){ ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[i])); if (OMPI_SUCCESS != ret){ goto exit; } } } } } /* end if (my_aggregator == fh->f_rank ) */ if ( sendbuf_is_contiguous ) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_sent) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ ptrdiff_t mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_sent); if (NULL == send_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } remaining = bytes_sent; while (remaining) { mem_address = (ptrdiff_t) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *) mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_sent; /* Gather the sendbuf from each process in appropritate locations in aggregators*/ if (bytes_sent){ ret = MCA_PML_CALL(isend(send_buf, bytes_sent, MPI_BYTE, my_aggregator, 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req)); if ( OMPI_SUCCESS != ret ){ goto exit; } ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } if (my_aggregator == fh->f_rank) { ret = ompi_request_wait_all (fh->f_procs_per_group, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } #if DEBUG_ON if (my_aggregator == fh->f_rank){ printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif if (! sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif /********************************************************** *** 7f. Create the io array, and pass it to fbtl *********************************************************/ if (my_aggregator == fh->f_rank) { #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ /* If the enrties are contiguous merge them, else make a new entry */ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (ptrdiff_t)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } /* end if (my_aggregator == fh->f_rank) */ } /* end for (index = 0; index < cycles; index++) */ #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){ mca_common_ompio_register_print_entry(fh->f_coll_write_time, nentry); } #endif exit : if (my_aggregator == fh->f_rank) { if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); } } free(recvtype); recvtype=NULL; } if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process){ free(blocklen_per_process[l]); } if (NULL != displs_per_process){ free(displs_per_process[l]); } } free(blocklen_per_process); free(displs_per_process); } if (NULL != displs){ free(displs); displs=NULL; } if (! sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } return OMPI_SUCCESS; }
static int mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, dim, nreqs; ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* The ordering is defined as -1 then +1 in each dimension in * order of dimension. */ for (dim = 0, nreqs = 0 ; dim < cart->ndims ; ++dim) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; if (MPI_PROC_NULL != drank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); return rc; } rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); } return rc; }
/* Arguments not used in this implementation: * - bridgecomm * - local_leader * - remote_leader * - send_first */ static int ompi_comm_allreduce_inter ( int *inbuf, int *outbuf, int count, struct ompi_op_t *op, ompi_communicator_t *intercomm, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, int send_first ) { int local_rank, rsize; int i, rc; int *sbuf; int *tmpbuf=NULL; int *rcounts=NULL, scount=0; int *rdisps=NULL; if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op && &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op != op ) { return MPI_ERR_OP; } if ( !OMPI_COMM_IS_INTER (intercomm)) { return MPI_ERR_COMM; } /* Allocate temporary arrays */ rsize = ompi_comm_remote_size (intercomm); local_rank = ompi_comm_rank ( intercomm ); tmpbuf = (int *) malloc ( count * sizeof(int)); rdisps = (int *) calloc ( rsize, sizeof(int)); rcounts = (int *) calloc ( rsize, sizeof(int) ); if ( OPAL_UNLIKELY (NULL == tmpbuf || NULL == rdisps || NULL == rcounts)) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /* Execute the inter-allreduce: the result of our group will be in the buffer of the remote group */ rc = intercomm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT, op, intercomm, intercomm->c_coll.coll_allreduce_module); if ( OMPI_SUCCESS != rc ) { goto exit; } if ( 0 == local_rank ) { MPI_Request req; /* for the allgatherv later */ scount = count; /* local leader exchange their data and determine the overall result for both groups */ rc = MCA_PML_CALL(irecv (outbuf, count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG, intercomm, &req)); if ( OMPI_SUCCESS != rc ) { goto exit; } rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG, MCA_PML_BASE_SEND_STANDARD, intercomm)); if ( OMPI_SUCCESS != rc ) { goto exit; } rc = ompi_request_wait ( &req, MPI_STATUS_IGNORE ); if ( OMPI_SUCCESS != rc ) { goto exit; } if ( &ompi_mpi_op_max.op == op ) { for ( i = 0 ; i < count; i++ ) { if (tmpbuf[i] > outbuf[i]) { outbuf[i] = tmpbuf[i]; } } } else if ( &ompi_mpi_op_min.op == op ) { for ( i = 0 ; i < count; i++ ) { if (tmpbuf[i] < outbuf[i]) { outbuf[i] = tmpbuf[i]; } } } else if ( &ompi_mpi_op_sum.op == op ) { for ( i = 0 ; i < count; i++ ) { outbuf[i] += tmpbuf[i]; } } else if ( &ompi_mpi_op_prod.op == op ) { for ( i = 0 ; i < count; i++ ) { outbuf[i] *= tmpbuf[i]; } } } /* distribute the overall result to all processes in the other group. Instead of using bcast, we are using here allgatherv, to avoid the possible deadlock. Else, we need an algorithm to determine, which group sends first in the inter-bcast and which receives the result first. */ rcounts[0] = count; sbuf = outbuf; rc = intercomm->c_coll.coll_allgatherv (sbuf, scount, MPI_INT, outbuf, rcounts, rdisps, MPI_INT, intercomm, intercomm->c_coll.coll_allgatherv_module); exit: if ( NULL != tmpbuf ) { free ( tmpbuf ); } if ( NULL != rcounts ) { free ( rcounts ); } if ( NULL != rdisps ) { free ( rdisps ); } return (rc); }
/* * barrier_intra_log * * Function: - barrier using O(log(N)) algorithm * Accepts: - same as MPI_Barrier() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i; int err; int peer; int dim; int hibit; int mask; int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); /* Send null-messages up and down the tree. Synchronization at the * root (rank 0). */ dim = comm->c_cube_dim; hibit = opal_hibit(rank, dim); --dim; /* Receive from children. */ for (i = dim, mask = 1 << i; i > hibit; --i, mask >>= 1) { peer = rank | mask; if (peer < size) { err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { return err; } } } /* Send to and receive from parent. */ if (rank > 0) { peer = rank & ~(1 << hibit); err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, peer, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { return err; } err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { return err; } } /* Send to children. */ for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) { peer = rank | mask; if (peer < size) { err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { return err; } } } /* All done */ return MPI_SUCCESS; }
/* * alltoall_intra_linear_sync * * Function: Linear implementation of alltoall with limited number * of outstanding requests. * Accepts: Same as MPI_Alltoall(), and the maximum number of * outstanding requests (actual number is 2 * max, since * we count receive and send requests separately). * Returns: MPI_SUCCESS or error code * * Description: Algorithm is the following: * 1) post K irecvs, K <= N * 2) post K isends, K <= N * 3) while not done * - wait for any request to complete * - replace that request by the new one of the same type. */ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int max_outstanding_reqs) { int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs; char *psnd, *prcv; ptrdiff_t slb, sext, rlb, rext; ompi_request_t **reqs = NULL; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank)); error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { return error; } sext *= scount; error = ompi_datatype_get_extent(rdtype, &rlb, &rext); if (OMPI_SUCCESS != error) { return error; } rext *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rext; error = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != error) { return error; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate send/recv to/from others. */ total_reqs = (((max_outstanding_reqs > (size - 1)) || (max_outstanding_reqs <= 0)) ? (size - 1) : (max_outstanding_reqs)); reqs = coll_base_comm_get_reqs(module->base_data, 2 * total_reqs); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post first batch or ireceive and isend requests */ for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; ri = (ri + 1) % size, ++nreqs, ++nrreqs) { error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; si = (si + size - 1) % size, ++nreqs, ++nsreqs) { error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } /* Wait for requests to complete */ if (nreqs == 2 * (size - 1)) { /* Optimization for the case when all requests have been posted */ error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } else { /* As requests complete, replace them with corresponding requests: - wait for any request to complete, mark the request as MPI_REQUEST_NULL - If it was a receive request, replace it with new irecv request (if any) - if it was a send request, replace it with new isend request (if any) */ int ncreqs = 0; while (ncreqs < 2 * (size - 1)) { int completed; error = ompi_request_wait_any(2 * total_reqs, reqs, &completed, MPI_STATUS_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } reqs[completed] = MPI_REQUEST_NULL; ncreqs++; if (completed < total_reqs) { if (nrreqs < (size - 1)) { error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[completed])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } ++nrreqs; ri = (ri + 1) % size; } } else { if (nsreqs < (size - 1)) { error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[completed])); ++nsreqs; si = (si + size - 1) % size; } } } } /* All done */ return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); ompi_coll_base_free_reqs(reqs, 2 * total_reqs); return error; }
int ompi_mpi_finalize(void) { int ret; static int32_t finalize_has_already_started = 0; opal_list_item_t *item; ompi_proc_t** procs; size_t nprocs; OPAL_TIMING_DECLARE(tm); OPAL_TIMING_INIT(&tm); /* Be a bit social if an erroneous program calls MPI_FINALIZE in two different threads, otherwise we may deadlock in ompi_comm_free() (or run into other nasty lions, tigers, or bears) */ if (! opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)) { /* Note that if we're already finalized, we cannot raise an MPI exception. The best that we can do is write something to stderr. */ char hostname[MAXHOSTNAMELEN]; pid_t pid = getpid(); gethostname(hostname, sizeof(hostname)); opal_show_help("help-mpi-runtime.txt", "mpi_finalize:invoked_multiple_times", true, hostname, pid); return MPI_ERR_OTHER; } ompi_mpiext_fini(); /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing anything else in MPI_FINALIZE (to include setting up such that MPI_FINALIZED will return true). */ if (NULL != ompi_mpi_comm_self.comm.c_keyhash) { ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self, ompi_mpi_comm_self.comm.c_keyhash); OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash); ompi_mpi_comm_self.comm.c_keyhash = NULL; } /* Proceed with MPI_FINALIZE */ ompi_mpi_finalized = true; /* As finalize is the last legal MPI call, we are allowed to force the release * of the user buffer used for bsend, before going anywhere further. */ (void)mca_pml_base_bsend_detach(NULL, NULL); #if OPAL_ENABLE_PROGRESS_THREADS == 0 opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK); #endif /* Redo ORTE calling opal_progress_event_users_increment() during MPI lifetime, to get better latency when not using TCP */ opal_progress_event_users_increment(); /* check to see if we want timing information */ OPAL_TIMING_EVENT((&tm,"Start barrier")); /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across *all* connected processes. This only means that all processes have to call it. It does *not* mean that all connected processes need to synchronize (either directly or indirectly). For example, it is quite easy to construct complicated scenarios where one job is "connected" to another job via transitivity, but have no direct knowledge of each other. Consider the following case: job A spawns job B, and job B later spawns job C. A "connectedness" graph looks something like this: A <--> B <--> C So what are we *supposed* to do in this case? If job A is still connected to B when it calls FINALIZE, should it block until jobs B and C also call FINALIZE? After lengthy discussions many times over the course of this project, the issue was finally decided at the Louisville Feb 2009 meeting: no. Rationale: - "Collective" does not mean synchronizing. It only means that every process call it. Hence, in this scenario, every process in A, B, and C must call FINALIZE. - KEY POINT: if A calls FINALIZE, then it is erroneous for B or C to try to communicate with A again. - Hence, OMPI is *correct* to only effect a barrier across each jobs' MPI_COMM_WORLD before exiting. Specifically, if A calls FINALIZE long before B or C, it's *correct* if A exits at any time (and doesn't notify B or C that it is exiting). - Arguably, if B or C do try to communicate with the now-gone A, OMPI should try to print a nice error ("you tried to communicate with a job that is already gone...") instead of segv or other Badness. However, that is an *extremely* difficult problem -- sure, it's easy for A to tell B that it is finalizing, but how can A tell C? A doesn't even know about C. You'd need to construct a "connected" graph in a distributed fashion, which is fraught with race conditions, etc. Hence, our conclusion is: OMPI is *correct* in its current behavior (of only doing a barrier across its own COMM_WORLD) before exiting. Any problems that occur are as a result of erroneous MPI applications. We *could* tighten up the erroneous cases and ensure that we print nice error messages / don't crash, but that is such a difficult problem that we decided we have many other, much higher priority issues to handle that deal with non-erroneous cases. */ /* Wait for everyone to reach this point. This is a grpcomm barrier instead of an MPI barrier for (at least) two reasons: 1. An MPI barrier doesn't ensure that all messages have been transmitted before exiting (e.g., a BTL can lie and buffer a message without actually injecting it to the network, and therefore require further calls to that BTL's progress), so the possibility of a stranded message exists. 2. If the MPI communication is using an unreliable transport, there's a problem of knowing that everyone has *left* the barrier. E.g., one proc can send its ACK to the barrier message to a peer and then leave the barrier, but the ACK can get lost and therefore the peer is left in the barrier. Point #1 has been known for a long time; point #2 emerged after we added the first unreliable BTL to Open MPI and fixed the del_procs behavior around May of 2014 (see https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for more details). */ opal_pmix.fence(NULL, 0); /* check for timing request - get stop time and report elapsed time if so */ OPAL_TIMING_EVENT((&tm,"Finish barrier")); OPAL_TIMING_REPORT(ompi_enable_timing, &tm, "MPI_Finish"); OPAL_TIMING_RELEASE(&tm); /* * Shutdown the Checkpoint/Restart Mech. */ if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) { OMPI_ERROR_LOG(ret); } /* Shut down any bindings-specific issues: C++, F77, F90 */ /* Remove all memory associated by MPI_REGISTER_DATAREP (per MPI-2:9.5.3, there is no way for an MPI application to *un*register datareps, but we don't want the OMPI layer causing memory leaks). */ while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&ompi_registered_datareps); /* Remove all F90 types from the hash tables. As the OBJ_DESTRUCT will * call a special destructor able to release predefined types, we can * simply call the OBJ_DESTRUCT on the hash table and all memory will * be correctly released. */ OBJ_DESTRUCT( &ompi_mpi_f90_integer_hashtable ); OBJ_DESTRUCT( &ompi_mpi_f90_real_hashtable ); OBJ_DESTRUCT( &ompi_mpi_f90_complex_hashtable ); /* Free communication objects */ /* free file resources */ if (OMPI_SUCCESS != (ret = ompi_file_finalize())) { return ret; } /* free window resources */ if (OMPI_SUCCESS != (ret = ompi_win_finalize())) { return ret; } if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) { return ret; } /* free communicator resources. this MUST come before finalizing the PML * as this will call into the pml */ if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) { return ret; } nprocs = 0; procs = ompi_proc_world(&nprocs); MCA_PML_CALL(del_procs(procs, nprocs)); free(procs); /* free pml resource */ if(OMPI_SUCCESS != (ret = mca_pml_base_finalize())) { return ret; } /* free requests */ if (OMPI_SUCCESS != (ret = ompi_request_finalize())) { return ret; } if (OMPI_SUCCESS != (ret = ompi_message_finalize())) { return ret; } /* If requested, print out a list of memory allocated by ALLOC_MEM but not freed by FREE_MEM */ if (0 != ompi_debug_show_mpi_alloc_mem_leaks) { mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks); } /* Now that all MPI objects dealing with communications are gone, shut down MCA types having to do with communications */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } /* shut down buffered send code */ mca_pml_base_bsend_fini(); #if OPAL_ENABLE_FT_CR == 1 /* * Shutdown the CRCP Framework, must happen after PML shutdown */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } #endif /* Free secondary resources */ /* free attr resources */ if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) { return ret; } /* free group resources */ if (OMPI_SUCCESS != (ret = ompi_group_finalize())) { return ret; } /* free proc resources */ if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) { return ret; } /* finalize the pubsub functions */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pubsub_base_framework) ) ) { return ret; } /* finalize the DPM framework */ if ( OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_dpm_base_framework))) { return ret; } /* free internal error resources */ if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) { return ret; } /* free error code resources */ if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) { return ret; } /* free errhandler resources */ if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) { return ret; } /* Free all other resources */ /* free op resources */ if (OMPI_SUCCESS != (ret = ompi_op_finalize())) { return ret; } /* free ddt resources */ if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) { return ret; } /* free info resources */ if (OMPI_SUCCESS != (ret = ompi_info_finalize())) { return ret; } /* Close down MCA modules */ /* io is opened lazily, so it's only necessary to close it if it was actually opened */ if (0 < ompi_io_base_framework.framework_refcnt) { /* May have been "opened" multiple times. We want it closed now */ ompi_io_base_framework.framework_refcnt = 1; if (OMPI_SUCCESS != mca_base_framework_close(&ompi_io_base_framework)) { return ret; } } (void) mca_base_framework_close(&ompi_topo_base_framework); if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) { return ret; } if (NULL != ompi_mpi_main_thread) { OBJ_RELEASE(ompi_mpi_main_thread); ompi_mpi_main_thread = NULL; } /* Leave the RTE */ if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) { return ret; } ompi_rte_initialized = false; /* now close the rte framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } if (OPAL_SUCCESS != (ret = opal_finalize_util())) { return ret; } /* All done */ return MPI_SUCCESS; }
int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, nreqs, line; char *psnd, *prcv; MPI_Aint lb, sndinc, rcvinc; ompi_request_t **req, **sreq, **rreq; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sndinc; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rcvinc; err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != err) { return err; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2); prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first -- a simple optimization */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) { err = MCA_PML_CALL(irecv_init (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Now post all sends in reverse order - We would like to minimize the search time through message queue when messages actually arrive in the order in which they were posted. */ sreq = rreq; for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size, ++sreq, ++nreqs) { err = MCA_PML_CALL(isend_init (psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, req)); /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); err_hndl: if( MPI_SUCCESS != err ) { OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); } /* Free the reqs in all cases as they are persistent requests */ ompi_coll_base_free_reqs(req, nreqs); /* All done */ return err; }
static int bootstrap_comm(ompi_communicator_t *comm, mca_coll_sm_module_t *module) { int i; char *shortpath, *fullpath; mca_coll_sm_component_t *c = &mca_coll_sm_component; mca_coll_sm_comm_t *data = module->sm_comm_data; int comm_size = ompi_comm_size(comm); int num_segments = c->sm_comm_num_segments; int num_in_use = c->sm_comm_num_in_use_flags; int frag_size = c->sm_fragment_size; int control_size = c->sm_control_size; ompi_process_name_t *lowest_name = NULL; size_t size; ompi_proc_t *proc; /* Make the rendezvous filename for this communicators shmem data segment. The CID is not guaranteed to be unique among all procs on this node, so also pair it with the PID of the proc with the lowest ORTE name to form a unique filename. */ proc = ompi_group_peer_lookup(comm->c_local_group, 0); lowest_name = OMPI_CAST_RTE_NAME(&proc->super.proc_name); for (i = 1; i < comm_size; ++i) { proc = ompi_group_peer_lookup(comm->c_local_group, i); if (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, OMPI_CAST_RTE_NAME(&proc->super.proc_name), lowest_name) < 0) { lowest_name = OMPI_CAST_RTE_NAME(&proc->super.proc_name); } } asprintf(&shortpath, "coll-sm-cid-%d-name-%s.mmap", comm->c_contextid, OMPI_NAME_PRINT(lowest_name)); if (NULL == shortpath) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:enable:bootstrap comm (%d/%s): asprintf failed", comm->c_contextid, comm->c_name); return OMPI_ERR_OUT_OF_RESOURCE; } fullpath = opal_os_path(false, ompi_process_info.job_session_dir, shortpath, NULL); free(shortpath); if (NULL == fullpath) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:enable:bootstrap comm (%d/%s): opal_os_path failed", comm->c_contextid, comm->c_name); return OMPI_ERR_OUT_OF_RESOURCE; } /* Calculate how much space we need in the per-communicator shmem data segment. There are several values to add: - size of the barrier data (2 of these): - fan-in data (num_procs * control_size) - fan-out data (num_procs * control_size) - size of the "in use" buffers: - num_in_use_buffers * control_size - size of the message fragment area (one for each segment): - control (num_procs * control_size) - fragment data (num_procs * (frag_size)) So it's: barrier: 2 * control_size + 2 * control_size in use: num_in_use * control_size control: num_segments * (num_procs * control_size * 2 + num_procs * control_size) message: num_segments * (num_procs * frag_size) */ size = 4 * control_size + (num_in_use * control_size) + (num_segments * (comm_size * control_size * 2)) + (num_segments * (comm_size * frag_size)); opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:enable:bootstrap comm (%d/%s): attaching to %" PRIsize_t " byte mmap: %s", comm->c_contextid, comm->c_name, size, fullpath); if (0 == ompi_comm_rank (comm)) { data->sm_bootstrap_meta = mca_common_sm_module_create_and_attach (size, fullpath, sizeof(mca_common_sm_seg_header_t), 8); if (NULL == data->sm_bootstrap_meta) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:enable:bootstrap comm (%d/%s): mca_common_sm_init_group failed", comm->c_contextid, comm->c_name); free(fullpath); return OMPI_ERR_OUT_OF_RESOURCE; } for (int i = 1 ; i < ompi_comm_size (comm) ; ++i) { MCA_PML_CALL(send(&data->sm_bootstrap_meta->shmem_ds, sizeof (data->sm_bootstrap_meta->shmem_ds), MPI_BYTE, i, MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); } } else { opal_shmem_ds_t shmem_ds; MCA_PML_CALL(recv(&shmem_ds, sizeof (shmem_ds), MPI_BYTE, 0, MCA_COLL_BASE_TAG_BCAST, comm, MPI_STATUS_IGNORE)); data->sm_bootstrap_meta = mca_common_sm_module_attach (&shmem_ds, sizeof(mca_common_sm_seg_header_t), 8); } /* All done */ free(fullpath); return OMPI_SUCCESS; }
/* create the initial fragment, pack header, datatype, and payload (if size fits) and send */ int ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_sendreq_t *sendreq) { int ret = OMPI_SUCCESS; opal_free_list_item_t *item; ompi_osc_pt2pt_send_header_t *header = NULL; ompi_osc_pt2pt_buffer_t *buffer = NULL; size_t written_data = 0; size_t needed_len = sizeof(ompi_osc_pt2pt_send_header_t); const void *packed_ddt; size_t packed_ddt_len = ompi_ddt_pack_description_length(sendreq->req_target_datatype); /* we always need to send the ddt */ needed_len += packed_ddt_len; if (OMPI_OSC_PT2PT_GET != sendreq->req_type) { needed_len += sendreq->req_origin_bytes_packed; } /* Get a buffer */ OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, item, ret); if (NULL == item) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; } buffer = (ompi_osc_pt2pt_buffer_t*) item; /* verify at least enough space for header */ if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_send_header_t)) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* setup buffer */ buffer->cbfunc = ompi_osc_pt2pt_sendreq_send_cb; buffer->cbdata = (void*) sendreq; /* pack header */ header = (ompi_osc_pt2pt_send_header_t*) buffer->payload; written_data += sizeof(ompi_osc_pt2pt_send_header_t); header->hdr_base.hdr_flags = 0; header->hdr_windx = sendreq->req_module->p2p_comm->c_contextid; header->hdr_origin = sendreq->req_module->p2p_comm->c_my_rank; header->hdr_origin_sendreq.pval = (void*) sendreq; header->hdr_origin_tag = 0; header->hdr_target_disp = sendreq->req_target_disp; header->hdr_target_count = sendreq->req_target_count; switch (sendreq->req_type) { case OMPI_OSC_PT2PT_PUT: header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_PUT; #if OMPI_ENABLE_MEM_DEBUG header->hdr_target_op = 0; #endif break; case OMPI_OSC_PT2PT_ACC: header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_ACC; header->hdr_target_op = sendreq->req_op_id; break; case OMPI_OSC_PT2PT_GET: header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_GET; #if OMPI_ENABLE_MEM_DEBUG header->hdr_target_op = 0; #endif break; } /* Set datatype id and / or pack datatype */ ret = ompi_ddt_get_pack_description(sendreq->req_target_datatype, &packed_ddt); if (OMPI_SUCCESS != ret) goto cleanup; memcpy((unsigned char*) buffer->payload + written_data, packed_ddt, packed_ddt_len); written_data += packed_ddt_len; if (OMPI_OSC_PT2PT_GET != sendreq->req_type) { /* if sending data and it fits, pack payload */ if (mca_osc_pt2pt_component.p2p_c_eager_size >= written_data + sendreq->req_origin_bytes_packed) { struct iovec iov; uint32_t iov_count = 1; size_t max_data = sendreq->req_origin_bytes_packed; iov.iov_len = max_data; iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data); ret = ompi_convertor_pack(&sendreq->req_origin_convertor, &iov, &iov_count, &max_data ); if (ret < 0) { ret = OMPI_ERR_FATAL; goto cleanup; } assert(max_data == sendreq->req_origin_bytes_packed); written_data += max_data; header->hdr_msg_length = sendreq->req_origin_bytes_packed; } else { header->hdr_msg_length = 0; header->hdr_origin_tag = create_send_tag(module); } } else { header->hdr_msg_length = 0; } buffer->len = written_data; #ifdef WORDS_BIGENDIAN header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; #elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT if (sendreq->req_target_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) { header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; OMPI_OSC_PT2PT_SEND_HDR_HTON(*header); } #endif /* send fragment */ opal_output_verbose(51, ompi_osc_base_output, "%d sending sendreq to %d", sendreq->req_module->p2p_comm->c_my_rank, sendreq->req_target_rank); ret = MCA_PML_CALL(isend(buffer->payload, buffer->len, MPI_BYTE, sendreq->req_target_rank, -200, MCA_PML_BASE_SEND_STANDARD, module->p2p_comm, &buffer->request)); opal_list_append(&module->p2p_pending_control_sends, &buffer->super.super); goto done; cleanup: if (item != NULL) { OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item); } done: return ret; }
/* * allgather_inter * * Function: - allgather using other MPI collections * Accepts: - same as MPI_Allgather() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_allgather_inter(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, root = 0, size, rsize, err, i; char *tmpbuf = NULL, *ptmp; ptrdiff_t rlb, slb, rextent, sextent, incr; ompi_request_t *req; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; ompi_request_t **reqs = basic_module->mccb_reqs; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); rsize = ompi_comm_remote_size(comm); /* Algorithm: * - a gather to the root in remote group (simultaniously executed, * thats why we cannot use coll_gather). * - exchange the temp-results between two roots * - inter-bcast (again simultanious). */ /* Step one: gather operations: */ if (rank != root) { /* send your data to root */ err = MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } } else { /* receive a msg. from all other procs. */ err = ompi_datatype_get_extent(rdtype, &rlb, &rextent); if (OMPI_SUCCESS != err) { return err; } err = ompi_datatype_get_extent(sdtype, &slb, &sextent); if (OMPI_SUCCESS != err) { return err; } /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[rsize])); if (OMPI_SUCCESS != err) { return err; } err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[0])); if (OMPI_SUCCESS != err) { return err; } incr = rextent * rcount; ptmp = (char *) rbuf + incr; for (i = 1; i < rsize; ++i, ptmp += incr) { err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[i])); if (MPI_SUCCESS != err) { return err; } } err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { return err; } /* Step 2: exchange the resuts between the root processes */ tmpbuf = (char *) malloc(scount * size * sextent); if (NULL == tmpbuf) { return err; } err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &req)); if (OMPI_SUCCESS != err) { goto exit; } err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto exit; } err = ompi_request_wait( &req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { goto exit; } } /* Step 3: bcast the data to the remote group. This * happens in both groups simultaniously, thus we can * not use coll_bcast (this would deadlock). */ if (rank != root) { /* post the recv */ err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto exit; } } else { /* Send the data to every other process in the remote group * except to rank zero. which has it already. */ for (i = 1; i < rsize; i++) { err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[i - 1])); if (OMPI_SUCCESS != err) { goto exit; } } err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { goto exit; } } exit: if (NULL != tmpbuf) { free(tmpbuf); } return err; }
int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module, ompi_proc_t *proc, uint8_t type, int32_t value0, int32_t value1) { int ret = OMPI_SUCCESS; opal_free_list_item_t *item; ompi_osc_pt2pt_buffer_t *buffer = NULL; ompi_osc_pt2pt_control_header_t *header = NULL; int rank = -1, i; /* find the rank */ for (i = 0 ; i < module->p2p_comm->c_remote_group->grp_proc_count ; ++i) { if (proc == module->p2p_comm->c_remote_group->grp_proc_pointers[i]) { rank = i; } } /* Get a buffer */ OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, item, ret); if (NULL == item) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; } buffer = (ompi_osc_pt2pt_buffer_t*) item; /* verify at least enough space for header */ if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_control_header_t)) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* setup buffer */ buffer->cbfunc = ompi_osc_pt2pt_control_send_cb; buffer->cbdata = NULL; buffer->len = sizeof(ompi_osc_pt2pt_control_header_t); /* pack header */ header = (ompi_osc_pt2pt_control_header_t*) buffer->payload; header->hdr_base.hdr_type = type; header->hdr_base.hdr_flags = 0; header->hdr_value[0] = value0; header->hdr_value[1] = value1; header->hdr_windx = module->p2p_comm->c_contextid; #ifdef WORDS_BIGENDIAN header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; #elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT if (proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) { header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; OMPI_OSC_PT2PT_CONTROL_HDR_HTON(*header); } #endif /* send fragment */ ret = MCA_PML_CALL(isend(buffer->payload, buffer->len, MPI_BYTE, rank, -200, MCA_PML_BASE_SEND_STANDARD, module->p2p_comm, &buffer->request)); opal_list_append(&module->p2p_pending_control_sends, &buffer->super.super); goto done; cleanup: if (item != NULL) { OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item); } done: return ret; }
static inline int NBC_Start_round(NBC_Handle *handle) { int num; /* number of operations */ int res; char* ptr; MPI_Request *tmp; NBC_Fn_type type; NBC_Args_send sendargs; NBC_Args_recv recvargs; NBC_Args_op opargs; NBC_Args_copy copyargs; NBC_Args_unpack unpackargs; void *buf1, *buf2; /* get round-schedule address */ ptr = handle->schedule->data + handle->row_offset; NBC_GET_BYTES(ptr,num); NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", handle->row_offset, num); for (int i = 0 ; i < num ; ++i) { int offset = (intptr_t)(ptr - handle->schedule->data); memcpy (&type, ptr, sizeof (type)); switch(type) { case SEND: NBC_DEBUG(5," SEND (offset %li) ", offset); NBC_GET_BYTES(ptr,sendargs); NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag); /* get an additional request */ handle->req_count++; /* get buffer */ if(sendargs.tmpbuf) { buf1=(char*)handle->tmpbuf+(long)sendargs.buf; } else { buf1=(void *)sendargs.buf; } #ifdef NBC_TIMING Isend_time -= MPI_Wtime(); #endif tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request)); if (NULL == tmp) { return OMPI_ERR_OUT_OF_RESOURCE; } handle->req_array = tmp; res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count - 1)); if (OMPI_SUCCESS != res) { NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res); return res; } #ifdef NBC_TIMING Isend_time += MPI_Wtime(); #endif break; case RECV: NBC_DEBUG(5, " RECV (offset %li) ", offset); NBC_GET_BYTES(ptr,recvargs); NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count, recvargs.datatype, recvargs.source, handle->tag); /* get an additional request - TODO: req_count NOT thread safe */ handle->req_count++; /* get buffer */ if(recvargs.tmpbuf) { buf1=(char*)handle->tmpbuf+(long)recvargs.buf; } else { buf1=recvargs.buf; } #ifdef NBC_TIMING Irecv_time -= MPI_Wtime(); #endif tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request)); if (NULL == tmp) { return OMPI_ERR_OUT_OF_RESOURCE; } handle->req_array = tmp; res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count-1)); if (OMPI_SUCCESS != res) { NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res); return res; } #ifdef NBC_TIMING Irecv_time += MPI_Wtime(); #endif break; case OP: NBC_DEBUG(5, " OP2 (offset %li) ", offset); NBC_GET_BYTES(ptr,opargs); NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, opargs.count, opargs.datatype); /* get buffers */ if(opargs.tmpbuf1) { buf1=(char*)handle->tmpbuf+(long)opargs.buf1; } else { buf1=(void *)opargs.buf1; } if(opargs.tmpbuf2) { buf2=(char*)handle->tmpbuf+(long)opargs.buf2; } else { buf2=opargs.buf2; } ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype); break; case COPY: NBC_DEBUG(5, " COPY (offset %li) ", offset); NBC_GET_BYTES(ptr,copyargs); NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n", (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype, (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype); /* get buffers */ if(copyargs.tmpsrc) { buf1=(char*)handle->tmpbuf+(long)copyargs.src; } else { buf1=copyargs.src; } if(copyargs.tmptgt) { buf2=(char*)handle->tmpbuf+(long)copyargs.tgt; } else { buf2=copyargs.tgt; } res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype, handle->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } break; case UNPACK: NBC_DEBUG(5, " UNPACK (offset %li) ", offset); NBC_GET_BYTES(ptr,unpackargs); NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf, unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf); /* get buffers */ if(unpackargs.tmpinbuf) { buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf; } else { buf1=unpackargs.inbuf; } if(unpackargs.tmpoutbuf) { buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf; } else { buf2=unpackargs.outbuf; } res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm); if (OMPI_SUCCESS != res) { NBC_Error ("NBC_Unpack() failed (code: %i)", res); return res; } break; default: NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset); return OMPI_ERROR; } } /* check if we can make progress - not in the first round, this allows us to leave the * initialization faster and to reach more overlap * * threaded case: calling progress in the first round can lead to a * deadlock if NBC_Free is called in this round :-( */ if (handle->row_offset) { res = NBC_Progress(handle); if ((NBC_OK != res) && (NBC_CONTINUE != res)) { return OMPI_ERROR; } } return OMPI_SUCCESS; }
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int segsize ) { struct ompi_communicator_t *llcomm=NULL; struct ompi_communicator_t *lcomm=NULL; mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module; int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED; int llrank=MPI_UNDEFINED, llsize=0, rank=0, ret=OMPI_SUCCESS; int lsize=0, lrank=MPI_UNDEFINED; MPI_Aint ub=0, typeext=0; size_t typesize=0; int i, realsegsize=0, remaining_count=0; int num_segments=0, segcount=0, segindex=0; char* tmpbuf = (char *) buff; ompi_request_t **sreq=NULL, **sreq1=NULL; ompi_request_t *rreq=MPI_REQUEST_NULL, *rreq1=MPI_REQUEST_NULL; rank = ompi_comm_rank ( comm ); lcomm = hierarch_module->hier_lcomm; if ( mca_coll_hierarch_verbose_param ) { printf("%s:%d: executing hierarchical seg3 bcast with cnt=%d root=%d segsize=%d\n", comm->c_name, rank, count, root, segsize ); } /* * This function returns the local leader communicator * which *always* contains the root of this operation. * This might involve creating a new communicator. This is * also the reason, that *every* process in comm has to call * this function */ llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot); ompi_datatype_type_size ( datatype, &typesize); ompi_datatype_get_extent ( datatype, &ub, &typeext); /* Determine number of segments and number of elements per segment */ if ((typesize > 0) && (segsize % typesize != 0)) { /* segment size must be a multiple of typesize */ segsize = typesize * (segsize / typesize); } if ((segsize == 0) || (count == 0) || (typesize == 0)) { segcount = count; num_segments = 1; } else { segcount = segsize/typesize; num_segments = count/segcount; if ( (count % segcount) != 0 ) num_segments++; if (num_segments == 1) segcount = count; } realsegsize = segcount*typeext; remaining_count = segcount; if ( MPI_COMM_NULL != lcomm ) { lsize = ompi_comm_size ( lcomm ); lrank = ompi_comm_rank ( lcomm ); sreq1 = (ompi_request_t **)malloc ( lsize * sizeof(ompi_request_t *)); if ( NULL == sreq1 ) { return OMPI_ERR_OUT_OF_RESOURCE; } for(i=0; i<lsize; i++) { sreq1[i] = MPI_REQUEST_NULL; } } if ( MPI_COMM_NULL != llcomm ) { llsize = ompi_comm_size (llcomm); llrank = ompi_comm_rank ( llcomm ); sreq = hierarch_module->hier_reqs; for(i=0; i<llsize; i++) { sreq[i] = MPI_REQUEST_NULL; } } /* Broadcasting the first segment in the upper level*/ if ( MPI_UNDEFINED != llroot ) { ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype, llroot, llcomm, llcomm->c_coll.coll_bcast_module); if ( OMPI_SUCCESS != ret ) { goto exit; } } for (segindex = 1; segindex < num_segments; segindex++) { /* determine how many elements are being sent in this round */ if( segindex == (num_segments - 1) ) { remaining_count = count - segindex*segcount; } tmpbuf += realsegsize; /* Broadcasting the next segment in the upper level*/ if ( MPI_COMM_NULL != llcomm ) { if(llrank == llroot) { for(i = 0; i < llsize; i++) { if( i != llroot) { ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i, MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, llcomm, (sreq+i) )); if ( OMPI_SUCCESS != ret ) { goto exit; } } } } else { ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot, MCA_COLL_BASE_TAG_BCAST, llcomm, &rreq )); if ( OMPI_SUCCESS != ret ) { goto exit; } } } /* broadcasting the before segment among the lower level processes * once the local leaders got the data from the root, they can distribute * it to the processes in their local, low-level communicator. */ if ( MPI_COMM_NULL != lcomm ) { if( lrank == lroot) { for( i = 0; i < lsize; i++) { if( i != lroot) { ret = MCA_PML_CALL(isend(tmpbuf-realsegsize, segcount, datatype, i, MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, lcomm, (sreq1+i) )); if ( OMPI_SUCCESS != ret ) { goto exit; } } } } else { ret = MCA_PML_CALL(irecv(tmpbuf-realsegsize, segcount, datatype, lroot, MCA_COLL_BASE_TAG_BCAST , lcomm, &rreq1 )); if ( OMPI_SUCCESS != ret ) { goto exit; } } } /* Wait for the upper level bcast to complete*/ if ( MPI_COMM_NULL != llcomm ) { if ( llrank == llroot ) { ret = ompi_request_wait_all(llsize, sreq, MPI_STATUSES_IGNORE); if ( OMPI_SUCCESS != ret ) { goto exit; } } else { ret = ompi_request_wait_all ( 1, &rreq, MPI_STATUS_IGNORE ); if ( OMPI_SUCCESS != ret ) { goto exit; } } } /*Wait for the lower level bcast to complete */ if ( MPI_COMM_NULL != lcomm ) { if ( lrank == lroot ) { ret = ompi_request_wait_all(lsize, sreq1, MPI_STATUSES_IGNORE); if ( OMPI_SUCCESS != ret ) { goto exit; } } else { ret = ompi_request_wait_all( 1, &rreq1, MPI_STATUS_IGNORE); if ( OMPI_SUCCESS != ret ) { goto exit; } } } } /*Bcasting the last segment among the lower level processes * once the local leaders got the data from the root, they can distribute * it to the processes in their local, low-level communicator. */ if ( MPI_COMM_NULL != lcomm ) { ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype, lroot, lcomm, lcomm->c_coll.coll_bcast_module); } exit: if ( NULL != sreq1 ) { free ( sreq1 ); } return ret; }
/** * Bcast - subgroup in communicator * This is a very simple algorithm - binary tree, transmitting the full * message at each step. */ OMPI_DECLSPEC int comm_bcast_pml(void *buffer, int root, int count, ompi_datatype_t *dtype, int my_rank_in_group, int n_peers, int *ranks_in_comm,ompi_communicator_t *comm) { /* local variables */ int rc=OMPI_SUCCESS,msg_cnt,i; ompi_request_t *requests[2]; int node_rank, peer_rank; netpatterns_tree_node_t node_data; /* * shift rank to root==0 tree */ node_rank=(my_rank_in_group-root+n_peers)%n_peers; /* * compute my communication pattern - binary tree */ rc=netpatterns_setup_narray_tree(2, node_rank, n_peers, &node_data); if( OMPI_SUCCESS != rc ) { goto Error; } /* 1 process special case */ if(1 == n_peers) { return OMPI_SUCCESS; } /* if I have parents - wait on the data to arrive */ if(node_data.n_parents) { /* I will have only 1 parent */ peer_rank=node_data.parent_rank; peer_rank=(peer_rank+root)%n_peers; /* translate back to actual rank */ rc=MCA_PML_CALL(recv(buffer, count,dtype,peer_rank, -OMPI_COMMON_TAG_BCAST, comm, MPI_STATUSES_IGNORE)); if( 0 > rc ) { goto Error; } } /* send the data to my children */ msg_cnt=0; for(i=0 ; i < node_data.n_children ; i++ ) { peer_rank=node_data.children_ranks[i]; peer_rank=(peer_rank+root)%n_peers; rc=MCA_PML_CALL(isend(buffer, count,dtype,peer_rank, -OMPI_COMMON_TAG_BCAST,MCA_PML_BASE_SEND_STANDARD, comm,&(requests[msg_cnt]))); if( 0 > rc ) { goto Error; } msg_cnt++; } /* wait for send completion */ if(msg_cnt) { /* wait on send and receive completion */ ompi_request_wait_all(msg_cnt,requests,MPI_STATUSES_IGNORE); } /* return */ return OMPI_SUCCESS; Error: return rc; }