/*
 *	scan
 *
 *	Function:	- basic scan operation
 *	Accepts:	- same arguments as MPI_Scan()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If I'm rank 0, just copy into the receive buffer */

    if (0 == rank) {
        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Otherwise receive previous buffer and reduce. */

    else {
        /* Allocate a temporary buffer.  Rationale for this size is
         * listed in coll_basic_reduce.c.  Use this temporary buffer to
         * receive into, later. */

        ompi_datatype_get_extent(dtype, &lb, &extent);
        ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;

        /* Copy the send buffer into the receive buffer. */

        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }
        }

        /* Receive the prior answer */

        err = MCA_PML_CALL(recv(pml_buffer, count, dtype,
                                rank - 1, MCA_COLL_BASE_TAG_SCAN, comm,
                                MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            if (NULL != free_buffer) {
                free(free_buffer);
            }
            return err;
        }

        /* Perform the operation */

        ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);

        /* All done */

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    }

    /* Send result to next process. */

    if (rank < (size - 1)) {
        return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_SCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* All done */

    return MPI_SUCCESS;
}
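A minimal usage sketch (not part of the component; it assumes nothing beyond a standard MPI installation and mpicc): the linear algorithm above is what ultimately services an ordinary MPI_Scan call such as the one below. Rank 0 copies its input, every other rank receives the running result from rank - 1, folds in its own contribution, and forwards the result to rank + 1.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, value, prefix_sum;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Each rank contributes rank + 1; rank r ends up with 1 + 2 + ... + (r + 1). */
    value = rank + 1;
    MPI_Scan(&value, &prefix_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("rank %d: inclusive prefix sum = %d\n", rank, prefix_sum);

    MPI_Finalize();
    return 0;
}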
Example #2
int
ompi_osc_pt2pt_replyreq_send(ompi_osc_pt2pt_module_t *module,
                             ompi_osc_pt2pt_replyreq_t *replyreq)
{
    int ret = OMPI_SUCCESS;
    opal_free_list_item_t *item;
    ompi_osc_pt2pt_buffer_t *buffer = NULL;
    ompi_osc_pt2pt_reply_header_t *header = NULL;
    size_t written_data = 0;

    /* Get a buffer */
    OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers,
                       item, ret);
    if (NULL == item) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    buffer = (ompi_osc_pt2pt_buffer_t*) item;

    /* verify at least enough space for header */
    if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_reply_header_t)) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* setup buffer */
    buffer->cbfunc = ompi_osc_pt2pt_replyreq_send_cb;
    buffer->cbdata = (void*) replyreq;

    /* pack header */
    header = (ompi_osc_pt2pt_reply_header_t*) buffer->payload;
    written_data += sizeof(ompi_osc_pt2pt_reply_header_t);
    header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_REPLY;
    header->hdr_base.hdr_flags = 0;
    header->hdr_origin_sendreq = replyreq->rep_origin_sendreq;
    header->hdr_target_tag = 0;

    /* if sending data fits, pack payload */
    if (mca_osc_pt2pt_component.p2p_c_eager_size >=
        written_data + replyreq->rep_target_bytes_packed) {
        struct iovec iov;
        uint32_t iov_count = 1;
        size_t max_data = replyreq->rep_target_bytes_packed;

        iov.iov_len = max_data;
        iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data);

        ret = ompi_convertor_pack(&replyreq->rep_target_convertor, &iov, &iov_count,
                                  &max_data );
        if (ret < 0) {
            ret = OMPI_ERR_FATAL;
            goto cleanup;
        }

        assert(max_data == replyreq->rep_target_bytes_packed);
        written_data += max_data;

        header->hdr_msg_length = replyreq->rep_target_bytes_packed;
    } else {
        header->hdr_msg_length = 0;
        header->hdr_target_tag = create_send_tag(module);
    }

    buffer->len = written_data;

#ifdef WORDS_BIGENDIAN
    header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    if (replyreq->rep_origin_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
        header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
        OMPI_OSC_PT2PT_REPLY_HDR_HTON(*header);
    }
#endif

    /* send fragment */
    ret = MCA_PML_CALL(isend(buffer->payload,
                             buffer->len,
                             MPI_BYTE,
                             replyreq->rep_origin_rank,
                             -200,
                             MCA_PML_BASE_SEND_STANDARD,
                             module->p2p_comm,
                             &buffer->request));
    opal_list_append(&module->p2p_pending_control_sends, 
                     &buffer->super.super);

    goto done;

 cleanup:
    if (item != NULL) {
        OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers,
                              item);
    }

 done:
    return ret;
}
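A self-contained sketch of the eager-packing pattern used above: a fixed-size header is written at the front of a flat buffer and the payload is appended only if it fits within the eager limit; otherwise only the header goes out (msg_length == 0) and the data travels separately. The names demo_reply_header_t and demo_pack_eager, and the header layout, are hypothetical illustrations, not OMPI definitions.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical header layout, for illustration only. */
typedef struct {
    uint8_t  type;
    uint8_t  flags;
    uint32_t msg_length;   /* 0 means "payload sent separately" */
} demo_reply_header_t;

/* Pack the header (and the payload, if it fits) into 'buffer' of size
 * 'eager_size'.  Returns the number of bytes written into 'buffer'. */
static size_t demo_pack_eager(char *buffer, size_t eager_size,
                              const char *payload, size_t payload_len)
{
    demo_reply_header_t hdr = { .type = 1, .flags = 0, .msg_length = 0 };
    size_t written = sizeof(hdr);

    if (written + payload_len <= eager_size) {
        hdr.msg_length = (uint32_t) payload_len;   /* eager: data goes inline */
        memcpy(buffer + written, payload, payload_len);
        written += payload_len;
    }
    memcpy(buffer, &hdr, sizeof(hdr));             /* header always goes first */
    return written;
}

int main(void)
{
    char buf[64];
    const char msg[] = "hello";
    size_t n = demo_pack_eager(buf, sizeof(buf), msg, sizeof(msg));
    printf("packed %zu bytes\n", n);
    return 0;
}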
Example #3
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    char *cmd=NULL, *av=NULL;
    ompi_errhandler_errtrk_t errtrk;
    OPAL_TIMING_DECLARE(tm);
    OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf;

    /* Ensure that we were not already initialized or finalized.

       This lock is held for the duration of ompi_mpi_init() and
       ompi_mpi_finalize().  Hence, if we get it, then no other thread
       is inside the critical section (and we don't have to check the
       *_started bool variables). */
    opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
    if (ompi_mpi_finalized) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init: already finalized", true);
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return MPI_ERR_OTHER;
    } else if (ompi_mpi_initialized) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init: invoked multiple times", true);
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return MPI_ERR_OTHER;
    }

    /* Indicate that we have *started* MPI_INIT* */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_compare_proc = _process_name_compare;
    opal_convert_string_to_process_name = _convert_string_to_process_name;
    opal_convert_process_name_to_string = _convert_process_name_to_string;
    opal_proc_for_name = ompi_proc_for_name;

    /* Register MCA variables */
    if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) {
        error = "ompi_mpi_init: ompi_register_mca_variables failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */

    ret = mca_base_var_find("opal", "event", "*", "event_include");
    if (ret >= 0) {
        char *allvalue = "all";
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching RTE tools... */
        mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    }

    OPAL_TIMING_MSTART((&tm,"time from start to completion of rte_init"));

    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]);
        putenv(cmd);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        asprintf(&av, "OMPI_ARGV=%s", tmp);
        free(tmp);
        putenv(av);
    }

    /* open the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) {
        error = "ompi_rte_base_open() failed";
        goto error;
    }
    /* no select is required as this is a static framework */

    /* Setup RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) {
        error = "ompi_mpi_init: ompi_rte_init failed";
        goto error;
    }
    ompi_rte_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));

    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }

    /* Register the default errhandler callback  */
    errtrk.status = OPAL_ERROR;
    errtrk.active = true;
    opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
                                  ompi_errhandler_registration_callback,
                                  (void*)&errtrk);
    OMPI_WAIT_FOR_COMPLETION(errtrk.active);
    if (OPAL_SUCCESS != errtrk.status) {
        error = "Error handler registration";
        ret = errtrk.status;
        goto error;
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_level(requested, provided);

    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

#if OMPI_ENABLE_THREAD_MULTIPLE
    /* add this bitflag to the modex */
    OPAL_MODEX_SEND_STRING(ret, OPAL_PMIX_GLOBAL,
                           "MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t));
    if (OPAL_SUCCESS != ret) {
        error = "ompi_mpi_init: modex send thread level";
        goto error;
    }
#endif

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OPAL_ENABLE_PROGRESS_THREADS == 1) ||
            (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS !=
            (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_rcache_base_framework, 0))) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_mpool_base_framework, 0))) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) {
        error = "mca_bml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS !=
            (ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS,
                                       ompi_mpi_thread_multiple))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
                                       ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    OPAL_TIMING_MNEXT((&tm,"time to execute modex"));

    /* exchange connection info - this function may also act as a barrier
     * if data exchange is required. The modex occurs solely across procs
     * in our job. If a barrier is required, the "modex" function will
     * perform it internally */
    OPAL_MODEX();

    OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));

    /* select buffered send allocator component to be used */
    if( OMPI_SUCCESS !=
            (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }

    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }

    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* some btls/mtls require we call add_procs with all procs in the job.
     * since the btls/mtls have no visibility here it is up to the pml to
     * convey this requirement */
    if (mca_pml_base_requires_world ()) {
        if (NULL == (procs = ompi_proc_world (&nprocs))) {
            error = "ompi_proc_get_allocated () failed";
            goto error;
        }
    } else {
        /* add all allocated ompi_proc_t's to PML (below the add_procs limit this
         * behaves identically to ompi_proc_world ()) */
        if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
            error = "ompi_proc_get_allocated () failed";
            goto error;
        }
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 ompi_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_rte_wait_for_debugger();

    /* Next timing measurement */
    OPAL_TIMING_MNEXT((&tm,"time to execute barrier"));

    /* wait for everyone to reach this point - this is a hard
     * barrier requirement at this time, though we hope to relax
     * it at a later point */
    opal_pmix.fence(NULL, 0);

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    OPAL_TIMING_MNEXT((&tm,"time from barrier to complete mpi_init"));

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on RTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif

    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the dynamic process management (DPM) subsystem */
    if (OMPI_SUCCESS != (ret = ompi_dpm_init())) {
        error = "ompi_dpm_init() failed";
        goto error;
    }

    /* Determine the overall threadlevel support of all processes
       in MPI_COMM_WORLD. This has to be done before calling
       coll_base_comm_select, since some of the collective components
       e.g. hierarch, might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if ( OMPI_SUCCESS !=
            ( ret = ompi_comm_cid_init ())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select,
       (since dpm.mark_dyncomm is not otherwise set in the communicator
       creation function), but before dpm.dyncom_init, since this function
       might require collectives for the CID allocation. */
    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }

    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm_dyn_init())) {
        error = "ompi_dpm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during
       opal_init, to get better latency when not using TCP.  Do
       this *after* dyn_init, as dyn init uses lots of RTE
       communication and we don't want to hinder the performance of
       that code. */
    opal_progress_event_users_decrement();

    /* see if yield_when_idle was specified - if so, use it */
    opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);

    /* negative value means use default - just don't do anything */
    if (ompi_mpi_event_tick_rate >= 0) {
        opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* Finish last measurement, output results
     * and clear timing structure */
    OPAL_TIMING_MSTOP(&tm);
    OPAL_TIMING_DELTAS(ompi_enable_timing, &tm);
    OPAL_TIMING_REPORT(ompi_enable_timing_ext, &tm);
    OPAL_TIMING_RELEASE(&tm);

    opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
    return MPI_SUCCESS;
}
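At the application level, the requested/provided negotiation handled around ompi_mpi_thread_level() above surfaces through MPI_Init_thread. A minimal sketch, using only the standard MPI interface (nothing OMPI-specific is assumed):
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int provided;

    /* Request full thread support; the library reports the level it grants. */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided < MPI_THREAD_MULTIPLE) {
        printf("MPI_THREAD_MULTIPLE not available, got level %d\n", provided);
    }

    MPI_Finalize();
    return 0;
}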
Example #4
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
int
mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
                                           struct ompi_datatype_t *rdtype,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    int i, j, size, rank, err = MPI_SUCCESS, line;
    MPI_Request *preq;
    char *tmp_buffer;
    size_t max_size;
    ptrdiff_t ext;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    max_size = ext * rcount;

    /* Allocate a temporary buffer */
    tmp_buffer = calloc (max_size, 1);
    if (NULL == tmp_buffer) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* in-place alltoall slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            /* Initiate all send/recv to/from others. */
            preq = coll_base_comm_get_reqs(base_module->base_data, size * 2);

            if (i == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + j * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(isend ((char *) tmp_buffer,  rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else if (j == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + i * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(isend ((char *) tmp_buffer,  rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (tmp_buffer);

    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                     rank));
        ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2);
    }

    /* All done */
    return err;
}
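A hedged usage sketch of what this routine implements: MPI_Alltoall with MPI_IN_PLACE, where the send arguments are ignored and the receive buffer supplies the outgoing data. Standard MPI only; the values exchanged are arbitrary.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i;
    int *buf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* One int per peer; slot i initially holds the data destined for rank i. */
    buf = (int *) malloc(size * sizeof(int));
    for (i = 0; i < size; ++i) {
        buf[i] = rank * 100 + i;
    }

    /* With MPI_IN_PLACE the send count/type are ignored and buf serves as
       both source and destination, exactly the case handled above. */
    MPI_Alltoall(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                 buf, 1, MPI_INT, MPI_COMM_WORLD);

    for (i = 0; i < size; ++i) {
        printf("rank %d got %d from rank %d\n", rank, buf[i], i);
    }

    free(buf);
    MPI_Finalize();
    return 0;
}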
Example #5
/*
 *	scatterv_intra
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scatterv_intra(const void *sbuf, const int *scounts,
                              const int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, rank, size, err;
    char *ptmp;
    ptrdiff_t lb, extent;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If not root, receive data. */

    if (rank != root) {
        /* Only receive if there is something to receive */
        if (rcount > 0) {
            return MCA_PML_CALL(recv(rbuf, rcount, rdtype,
                                     root, MCA_COLL_BASE_TAG_SCATTERV,
                                     comm, MPI_STATUS_IGNORE));
        }
        return MPI_SUCCESS;
    }

    /* I am the root, loop sending data. */

    err = ompi_datatype_get_extent(sdtype, &lb, &extent);
    if (OMPI_SUCCESS != err) {
        return OMPI_ERROR;
    }

    for (i = 0; i < size; ++i) {
        ptmp = ((char *) sbuf) + (extent * disps[i]);

        /* At the root, copy locally instead of sending to self; skip
         * entirely for MPI_IN_PLACE or a zero count. */

        if (i == rank) {
            if (scounts[i] > 0 && MPI_IN_PLACE != rbuf) {
                err = ompi_datatype_sndrcv(ptmp, scounts[i], sdtype,
                                           rbuf, rcount, rdtype);
                if (MPI_SUCCESS != err) {
                    return err;
                }
            }
        } else {
            /* Only send if there is something to send */
            if (scounts[i] > 0) {
                err = MCA_PML_CALL(send(ptmp, scounts[i], sdtype, i,
                                        MCA_COLL_BASE_TAG_SCATTERV,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err) {
                    return err;
                }
            }
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
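For reference, a minimal MPI_Scatterv call that this linear root-loop implementation would service (standard MPI only; the counts are chosen so that rank i receives i + 1 integers):
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i, recvcount;
    int *sendbuf = NULL, *sendcounts = NULL, *displs = NULL, *recvbuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Rank i receives i + 1 integers. */
    recvcount = rank + 1;
    recvbuf = (int *) malloc(recvcount * sizeof(int));

    if (0 == rank) {
        int total = size * (size + 1) / 2, offset = 0;
        sendbuf    = (int *) malloc(total * sizeof(int));
        sendcounts = (int *) malloc(size * sizeof(int));
        displs     = (int *) malloc(size * sizeof(int));
        for (i = 0; i < size; ++i) {
            sendcounts[i] = i + 1;       /* variable counts ...      */
            displs[i]     = offset;      /* ... at packed offsets    */
            offset       += i + 1;
        }
        for (i = 0; i < total; ++i) {
            sendbuf[i] = i;
        }
    }

    MPI_Scatterv(sendbuf, sendcounts, displs, MPI_INT,
                 recvbuf, recvcount, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d received %d ints, first = %d\n", rank, recvcount, recvbuf[0]);

    free(recvbuf);
    if (0 == rank) { free(sendbuf); free(sendcounts); free(displs); }
    MPI_Finalize();
    return 0;
}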
Example #6
/* This routine serves two purposes:
 * - the allreduce acts as a kind of barrier, which prevents
 *   fragments from arriving on the new communicator before
 *   everybody has set up the comm structure.
 * - some components (e.g. the collective MagPIe component)
 *   might want to generate new communicators and communicate
 *   using the new comm.  They can simply be called after
 *   this 'barrier'.
 *
 * This routine lives in comm_cid and not in comm.c because this
 * file contains the required allreduce implementations, which
 * avoids duplicating that code.
 */
int ompi_comm_activate ( ompi_communicator_t** newcomm, 
                         ompi_communicator_t* comm,
                         ompi_communicator_t* bridgecomm,
                         void* local_leader,
                         void* remote_leader,
                         int mode,
                         int send_first )
{
    int ret = 0;

    int ok=0, gok=0;
    ompi_comm_cid_allredfct* allredfnct;

    /* Step 1: the barrier, after which it is allowed to
     * send messages over the new communicator
     */
    switch (mode)
        {
        case OMPI_COMM_CID_INTRA:
            allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra;
            break;
        case OMPI_COMM_CID_INTER:
            allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_inter;
            break;
        case OMPI_COMM_CID_INTRA_BRIDGE:
            allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_bridge;
            break;
        case OMPI_COMM_CID_INTRA_OOB:
            allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_oob;
            break;
        default:
            return MPI_UNDEFINED;
            break;
        }

    if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) {

        /* Initialize the PML stuff in the newcomm  */
        if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) {
            goto bail_on_error;
        }
        OMPI_COMM_SET_PML_ADDED(*newcomm);
    }


    ret = (allredfnct)(&ok, &gok, 1, MPI_MIN, comm, bridgecomm,
                       local_leader, remote_leader, send_first );
    if( OMPI_SUCCESS != ret ) {
        goto bail_on_error;
    }



    /**
     * Check to see if this process is in the new communicator.
     *
     * Specifically, this function is invoked by all processes in the
     * old communicator, regardless of whether they are in the new
     * communicator or not.  This is because it is far simpler to use
     * MPI collective functions on the old communicator to determine
     * some data for the new communicator (e.g., remote_leader) than
     * to kludge up our own pseudo-collective routines over just the
     * processes in the new communicator.  Hence, *all* processes in
     * the old communicator need to invoke this function.
     *
     * That being said, only processes in the new communicator need to
     * select a coll module for the new communicator.  More
     * specifically, processes who are not in the new communicator
     * should *not* select a coll module -- for example,
     * ompi_comm_rank(newcomm) returns MPI_UNDEFINED for processes who
     * are not in the new communicator.  This can cause errors in the
     * selection / initialization of a coll module.  Plus, it's
     * wasteful -- processes in the new communicator will end up
     * freeing the new communicator anyway, so we might as well leave
     * the coll selection as NULL (the coll base comm unselect code
     * handles that case properly).
     */
    if (MPI_UNDEFINED == (*newcomm)->c_local_group->grp_my_rank) {
        return OMPI_SUCCESS;
    }

    /* Let the collectives components fight over who will do
       collective on this new comm.  */
    if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(*newcomm))) {
        goto bail_on_error;
    }

    /* For an inter communicator, we have to deal with the potential
     * problem of what is happening if the local_comm that we created
     * has a lower CID than the parent comm. This is not a problem
     * as long as the user calls MPI_Comm_free on the inter communicator.
     * However, if the communicators are not freed by the user but released
     * by Open MPI in MPI_Finalize, we walk through the list of still available
     * communicators and free them one by one. Thus, local_comm is freed before
     * the actual inter-communicator. However, the local_comm pointer in the
     * inter communicator will still contain the 'previous' address of the local_comm
     * and thus this will lead to a segmentation violation. In order to prevent
     * that from happening, we increase the reference counter of local_comm
     * by one if its CID is lower than that of the parent.  We cannot,
     * however, increase its reference counter if the CID of local_comm is
     * larger than the CID of the inter communicator, since a regular
     * MPI_Comm_free would then leave the local_comm hanging around; we would
     * then fail to recycle CIDs properly, which is exactly the problem this
     * code is trying to avoid.
     */
    if ( OMPI_COMM_IS_INTER(*newcomm)) {
        if ( OMPI_COMM_CID_IS_LOWER(*newcomm, comm)) {
            OMPI_COMM_SET_EXTRA_RETAIN (*newcomm);
            OBJ_RETAIN (*newcomm);
        }
    }


    return OMPI_SUCCESS;

 bail_on_error:
    OBJ_RELEASE(*newcomm);
    *newcomm = MPI_COMM_NULL;
    return ret;
}
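A standalone sketch of the "allreduce as barrier/agreement" idea used in Step 1 above: reducing a per-rank success flag with MPI_MIN both synchronizes all ranks and tells everyone whether the whole group succeeded, mirroring the ok/gok exchange through allredfnct. Standard MPI only.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, local_ok, global_ok;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* 1 = local setup succeeded, 0 = it failed.  For the sketch, pretend
       every rank succeeded. */
    local_ok = 1;

    /* MPI_MIN over the flags acts as a barrier *and* an agreement: the
       result is 1 only if every rank reported success, and no rank can
       proceed before all ranks have contributed. */
    MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

    if (!global_ok) {
        printf("rank %d: abort, someone failed setup\n", rank);
    }

    MPI_Finalize();
    return 0;
}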
Example #7
/* Arguments not used in this implementation:
 * - send_first
 */
static int ompi_comm_allreduce_intra_bridge (int *inbuf, int *outbuf, 
                                             int count, struct ompi_op_t *op, 
                                             ompi_communicator_t *comm,
                                             ompi_communicator_t *bcomm, 
                                             void* lleader, void* rleader,
                                             int send_first )
{
    int *tmpbuf=NULL;
    int local_rank;
    int i;
    int rc;
    int local_leader, remote_leader;

    local_leader  = (*((int*)lleader));
    remote_leader = (*((int*)rleader));

    if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op &&
         &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op  != op ) {
        return MPI_ERR_OP;
    }
    
    local_rank = ompi_comm_rank ( comm );
    tmpbuf     = (int *) malloc ( count * sizeof(int));
    if ( NULL == tmpbuf ) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Intercomm_create */
    rc = comm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT,
                                       op, comm, comm->c_coll.coll_allreduce_module );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if (local_rank == local_leader ) {
        MPI_Request req;
        
        rc = MCA_PML_CALL(irecv ( outbuf, count, MPI_INT, remote_leader,
                                  OMPI_COMM_ALLREDUCE_TAG, 
                                  bcomm, &req));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;       
        }
        rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, remote_leader, 
                                OMPI_COMM_ALLREDUCE_TAG,
                                MCA_PML_BASE_SEND_STANDARD,  bcomm));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = ompi_request_wait_all ( 1, &req, MPI_STATUS_IGNORE);
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        if ( &ompi_mpi_op_max.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] > outbuf[i]) {
                    outbuf[i] = tmpbuf[i];
                }
            }
        }
        else if ( &ompi_mpi_op_min.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] < outbuf[i]) {
                    outbuf[i] = tmpbuf[i];
                }
            }
        }
        else if ( &ompi_mpi_op_sum.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] += tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_prod.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] *= tmpbuf[i];
            }
        }
    }

    rc = comm->c_coll.coll_bcast ( outbuf, count, MPI_INT, local_leader, 
                                   comm, comm->c_coll.coll_bcast_module );

 exit:
    if (NULL != tmpbuf ) {
        free (tmpbuf);
    }

    return (rc);
}
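A hedged, MPI-level sketch of the bridge pattern implemented above: reduce within each local group, let the two group leaders exchange and combine their partial results over a bridge communicator (plain MPI_COMM_WORLD here), then broadcast the combined value inside each group. The split by rank parity and the choice of world ranks 0 and 1 as leaders are assumptions of this example; run it with at least two ranks for the leader exchange to be meaningful.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int wrank, wsize, color, lrank, local_sum, global_sum;
    MPI_Comm local_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);

    /* Two groups (even/odd world ranks); local rank 0 of each group is its leader. */
    color = wrank % 2;
    MPI_Comm_split(MPI_COMM_WORLD, color, wrank, &local_comm);
    MPI_Comm_rank(local_comm, &lrank);

    /* Step 1: reduce inside each group. */
    MPI_Allreduce(&wrank, &local_sum, 1, MPI_INT, MPI_SUM, local_comm);

    /* Step 2: the two leaders (world ranks 0 and 1 with this split) exchange
       their partial results over the bridge and combine them by hand, just as
       the leader code above applies op to tmpbuf and outbuf. */
    global_sum = local_sum;
    if (0 == lrank && wsize > 1) {
        int peer = (0 == color) ? 1 : 0;
        int peer_sum;
        MPI_Sendrecv(&local_sum, 1, MPI_INT, peer, 0,
                     &peer_sum, 1, MPI_INT, peer, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        global_sum = local_sum + peer_sum;
    }

    /* Step 3: each leader broadcasts the combined result within its group. */
    MPI_Bcast(&global_sum, 1, MPI_INT, 0, local_comm);

    printf("world rank %d: global sum = %d\n", wrank, global_sum);

    MPI_Comm_free(&local_comm);
    MPI_Finalize();
    return 0;
}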
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_scatterv_inter(void *sbuf, int *scounts,
                              int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, rank, size, err, total, size_local;
    int *counts=NULL,*displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_remote_size(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
	if(0 == rank) {
	    /* local root receives the counts from the root */
	    counts = (int *)malloc(sizeof(int) * size_local);
	    if (NULL == counts) {
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }
	    err = MCA_PML_CALL(recv(counts, size_local, MPI_INT,
				    root, MCA_COLL_BASE_TAG_SCATTERV,
				    comm, MPI_STATUS_IGNORE));
	    if (OMPI_SUCCESS != err) {
		return err;
	    }
	    /* calculate the whole buffer size and receive it from root */
	    err = ompi_ddt_get_extent(rdtype, &lb, &extent);
	    if (OMPI_SUCCESS != err) {
		return OMPI_ERROR;
	    }
	    incr = 0;
	    for (i = 0; i < size_local; i++) {
		incr = incr + extent*counts[i];
	    }
	    if ( incr > 0 ) {
		ptmp = (char*)malloc(incr); 
		if (NULL == ptmp) {
		    return OMPI_ERR_OUT_OF_RESOURCE;
		}
	    }
	    total = 0;
	    for (i = 0; i < size_local; i++) {
		total = total + counts[i];
	    }
	    err = MCA_PML_CALL(recv(ptmp, total, rdtype,
				    root, MCA_COLL_BASE_TAG_SCATTERV,
				    comm, MPI_STATUS_IGNORE));
	    if (OMPI_SUCCESS != err) {
		return err;
	    }
	    /* set the local displacement i.e. no displacements here */
	    displace = (int *)malloc(sizeof(int) * size_local);
	    if (NULL == displace) {
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }
	    displace[0] = 0;
	    for (i = 1; i < size_local; i++) {
		displace[i] = displace[i-1] + counts[i-1];
	    }
	}
	/* perform the scatterv locally */
	err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace, 
						       rdtype, rbuf, rcount, 
						       rdtype, 0, comm->c_local_comm,
                                                       comm->c_local_comm->c_coll.coll_scatterv_module);
	if (OMPI_SUCCESS != err) {
	    return err;
	}

	if (NULL != ptmp) {
	    free(ptmp);
	}
	if (NULL != displace) {
	    free(displace);
	}
	if (NULL != counts) {
	    free(counts);
	}

    } else {
	err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0,
				MCA_COLL_BASE_TAG_SCATTERV,
				MCA_PML_BASE_SEND_STANDARD, comm));
	if (OMPI_SUCCESS != err) {
	    return err;
	}

	ompi_ddt_create_indexed(size,scounts,disps,sdtype,&ndtype);
	ompi_ddt_commit(&ndtype);
	
	err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0,
				MCA_COLL_BASE_TAG_SCATTERV,
				MCA_PML_BASE_SEND_STANDARD, comm));
	if (OMPI_SUCCESS != err) {
	    return err;
	}
	ompi_ddt_destroy(&ndtype);

    }

    /* All done */
    return err;
}
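The root-side trick above (sending all scattered blocks in a single message by describing them with one indexed datatype) can be sketched with the public API instead of the internal ompi_ddt_* calls; MPI_Type_indexed plays the role of ompi_ddt_create_indexed. The concrete counts and displacements below are illustrative only.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (0 == rank) {
        int sbuf[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        int counts[3] = { 1, 2, 3 };   /* block lengths per receiver  */
        int displs[3] = { 0, 4, 7 };   /* block displacements in sbuf */
        int rbuf[6];
        MPI_Datatype ndtype;

        /* One datatype that selects exactly the scattered blocks ... */
        MPI_Type_indexed(3, counts, displs, MPI_INT, &ndtype);
        MPI_Type_commit(&ndtype);

        /* ... so the whole non-contiguous payload moves as one message
           (sent to ourselves here for illustration) and arrives packed. */
        MPI_Sendrecv(sbuf, 1, ndtype, 0, 0,
                     rbuf, 6, MPI_INT, 0, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Type_free(&ndtype);

        printf("packed: %d %d %d %d %d %d\n",
               rbuf[0], rbuf[1], rbuf[2], rbuf[3], rbuf[4], rbuf[5]);
    }

    MPI_Finalize();
    return 0;
}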
int
mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh,
				void *buf,
				int count,
				struct ompi_datatype_t *datatype,
				ompi_status_public_t *status)
{

    int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL;
    int i, j, l,cycles=0, local_cycles=0, *current_index=NULL;
    int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0;
    int **blocklen_per_process=NULL, *iovec_count_per_process=NULL;
    int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0;
    int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL;


    MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0;
    MPI_Aint *memory_displacements=NULL;
    int bytes_to_read_in_cycle=0;
    size_t max_data=0, bytes_per_cycle=0;
    uint32_t iov_count=0, iov_index=0;
    struct iovec *decoded_iov=NULL, *iov=NULL;
    mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL;
    mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL;

    char *global_buf=NULL, *receive_buf=NULL;

    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator=-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contiguous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (!recvbuf_is_contiguous  ) {
        fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh,
                                datatype,
                                count,
                                buf,
                                &max_data,
                                &decoded_iov,
                                &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }


    fh->f_get_num_aggregators ( &static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                static_num_io_procs,
                                max_data);
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /*  printf("max_data %ld\n", max_data);  */
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    if ( iov_size > 0 ) {
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        for (j=0; j < iov_size; j++){
            local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                iov[j].iov_base;
            local_iov_array[j].length = (size_t)iov[j].iov_len;
            local_iov_array[j].process_id = fh->f_rank;

        }
    }
    else {
        /* Allocate at least one element to correctly create the derived
           data type */
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0;
        local_iov_array[0].length = (size_t) 0;
        local_iov_array[0].process_id = fh->f_rank;
    }

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }

    /* io_array datatype  for using in communication*/
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;

    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);

    /* #########################################################*/
    fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle);
    local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (my_aggregator == fh->f_rank) {
        disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }


    iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&iov_size,
                                           1,
                                           MPI_INT,
                                           iovec_count_per_process,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);

    if( OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

    if (my_aggregator == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if ( (my_aggregator == fh->f_rank) &&
         (global_iov_count >  0 )) {
        global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count *
                                                      sizeof(mca_fcoll_static_local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_gatherv_array (local_iov_array,
                                         iov_size,
                                         io_array_type,
                                         global_iov_array,
                                         iovec_count_per_process,
                                         displs,
                                         io_array_type,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array = NULL;
    }

    if ( ( my_aggregator == fh->f_rank) &&
         ( global_iov_count > 0 )) {
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        read_local_heap_sort (global_iov_array, global_iov_count, sorted);

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for ( i=0; i<fh->f_procs_per_group; i++ ) {
            sendtype[i] = MPI_DATATYPE_NULL;
        }

        if (NULL == bytes_per_process){
            bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
            if (NULL == bytes_per_process){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

#if DEBUG_ON

    if (my_aggregator == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[sorted[gc_in]].process_id,
                   gc_in, global_iov_array[sorted[gc_in]].offset,
                   gc_in, global_iov_array[sorted[gc_in]].length);
        }
    }
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif

    for (index = 0; index < cycles; index++){

        if (my_aggregator == fh->f_rank) {
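            /* Start of a new cycle: release everything built during the
             * previous cycle and reset each member's block-length and
             * displacement lists to a single empty slot. */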

            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements= NULL;
            }

            if ( NULL != sendtype ) {
                for ( i=0; i<fh->f_procs_per_group; i++ ) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy (&sendtype[i] );
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
        }
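        /* Determine this rank's contribution for the cycle: its full
         * per-process share of bytes_per_cycle, the leftover tail on its
         * last local cycle, or zero once it has no data left to read. */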

        if (index < local_cycles ) {
            if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) {
                bytes_to_read_in_cycle = max_data - position;
            }
            else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) {
                bytes_to_read_in_cycle = max_data;
            }
            else {
                bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group;
            }
        }
        else {
            bytes_to_read_in_cycle = 0;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        fcoll_base_coll_gather_array (&bytes_to_read_in_cycle,
                                      1,
                                      MPI_INT,
                                      bytes_per_process,
                                      1,
                                      MPI_INT,
                                      fh->f_aggregator_index,
                                      fh->f_procs_in_group,
                                      fh->f_procs_per_group,
                                      fh->f_comm);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

        if (recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_to_read_in_cycle) {
            receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char));
            if ( NULL == receive_buf){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
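        /* Every rank in the group, the aggregator included, posts a
         * non-blocking receive for its slice of this cycle's data; the
         * aggregator's isend further below completes the exchange. */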

        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_to_read_in_cycle,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


        if (my_aggregator == fh->f_rank) {
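            /* Distribute the cycle's quota: walk each member's entries in
             * the sorted global iovec and carve its bytes_per_process[i]
             * bytes into (offset, length) pairs, carrying a partially
             * consumed entry over in bytes_remaining[i]. */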
            for (i=0;i<fh->f_procs_per_group; i++){
                while (bytes_per_process[i] > 0){
                    /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n",
                      index, i, bytes_per_process[i], i, bytes_remaining[i]);*/
                    if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                            fh) == i){ /* current id owns this entry!*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){

                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_remaining[i] = 0;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = read_find_next_index(i,
                                                                         current_index[i],
                                                                         fh,
                                                                         global_iov_array,
                                                                         global_iov_count,
                                                                         sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = read_find_next_index(i,
                                                                        current_index[i],
                                                                        fh,
                                                                        global_iov_array,
                                                                        global_iov_count,
                                                                        sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = read_find_next_index(i,
                                                                current_index[i],
                                                                fh,
                                                                global_iov_array,
                                                                global_iov_count,
                                                                sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }
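            /* Count the non-empty (offset, length) pairs produced above; if
             * there are none, the aggregator has nothing to read this cycle
             * and skips ahead. */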

            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }
                }
            }

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_fcoll_static_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index=0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i]; j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            read_local_heap_sort (file_offsets_for_agg,
                                  entries_per_aggregator,
                                  sorted_file_offsets);
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
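            /* global_buf is the aggregator's staging area for this cycle:
             * every entry is read from the file into global_buf at the
             * memory displacement computed above. */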

            global_buf = (char *) malloc (global_count * sizeof(char));
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[i]);
            }
#endif

            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret =  OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
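            /* Build the fbtl I/O vector, merging entries whose file ranges
             * are back-to-back so the preadv below issues fewer, larger
             * requests. */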



            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif


#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (my_aggregator == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" READ %d \n",((int *)global_buf)[i]);
            }
#endif

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
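            /* Swap the recorded file displacements for the corresponding
             * displacements inside global_buf, so the hindexed send
             * datatypes below address the staged data. */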

            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
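            /* Describe each member's share of global_buf with an hindexed
             * datatype and send it back; the member's earlier irecv on the
             * same tag completes the exchange. */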

            for (i=0;i<fh->f_procs_per_group; i++){
                send_req[i] = MPI_REQUEST_NULL;
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &sendtype[i]);
                ompi_datatype_commit(&sendtype[i]);
                ret = MCA_PML_CALL (isend(global_buf,
                                          1,
                                          sendtype[i],
                                          fh->f_procs_in_group[i],
                                          123,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          fh->f_comm,
                                          &send_req[i]));
                if(OMPI_SUCCESS != ret){
                    goto exit;
                }
            }

            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        } /* if ( my_aggregator == fh->f_rank ) */

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

        position += bytes_to_read_in_cycle;
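        /* For a non-contiguous user buffer, unpack the bytes received this
         * cycle into the decoded iovec, carrying iov_index and
         * current_position over to the next cycle. */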

        if (!recvbuf_is_contiguous) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_to_read_in_cycle;

            while (remaining && (iov_count > iov_index)){
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else{
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }

    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_read_time,
                                              nentry);
    }
#endif

exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }

    if (NULL != displs){
        free(displs);
        displs = NULL;
    }

    if (NULL != iovec_count_per_process){
        free(iovec_count_per_process);
        iovec_count_per_process=NULL;
    }

    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != global_iov_array){
        free(global_iov_array);
        global_iov_array=NULL;
    }

    if (my_aggregator == fh->f_rank) {

        for (l = 0; l < fh->f_procs_per_group; l++) {
            if (NULL != blocklen_per_process && NULL != blocklen_per_process[l]) {
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process && NULL != displs_per_process[l]) {
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }

    if (NULL != bytes_per_process){
        free(bytes_per_process);
        bytes_per_process =NULL;
    }

    if (NULL != disp_index){
        free(disp_index);
        disp_index =NULL;
    }

    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }

    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }

    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }

    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process =NULL;
    }

    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements= NULL;
    }

    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }

    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }

    if (NULL != sendtype){
        free(sendtype);
        sendtype=NULL;
    }

    if ( !recvbuf_is_contiguous ) {
        if (NULL != receive_buf){
            free(receive_buf);
            receive_buf=NULL;
        }
    }

    if (NULL != global_buf) {
        free(global_buf);
        global_buf = NULL;
    }

    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }

    if (NULL != send_req){
        free(send_req);
        send_req = NULL;
    }


    return ret;

}
Example #10
static int
mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    int i, j, size, rank, err=MPI_SUCCESS;
    MPI_Request *preq;
    char *tmp_buffer;
    size_t max_size, rdtype_size;
    ptrdiff_t ext;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(rdtype, &rdtype_size);

    /* If only one process, we're done. */
    if (1 == size || 0 == rdtype_size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    for (i = 0, max_size = 0 ; i < size ; ++i) {
        size_t size = ext * rcounts[i];

        max_size = size > max_size ? size : max_size;
    }

    /* Allocate a temporary buffer */
    tmp_buffer = calloc (max_size, 1);
    if (NULL == tmp_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* in-place alltoallv slow algorithm (but works) */
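    /* For every pair (i, j) with i < j, each member of the pair stages its
     * outgoing block in tmp_buffer and then swaps blocks with its peer,
     * overwriting rbuf in place. */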
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            /* Initiate all send/recv to/from others. */
            preq = tuned_module->tuned_data->mcct_reqs;

            if (i == rank && rcounts[j]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
                                                           tmp_buffer, (char *) rbuf + rdisps[j] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else if (j == rank && rcounts[i]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
                                                           tmp_buffer, (char *) rbuf + rdisps[i] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { goto error_hndl; }

            /* Free the requests. */
            mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (tmp_buffer);

    /* All done */

    return err;
}
 int
 mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh,
				  void *buf,
				  int count,
				  struct ompi_datatype_t *datatype,
				  ompi_status_public_t *status)
 {
     MPI_Aint position = 0;
     MPI_Aint total_bytes = 0;          /* total bytes to be read */
     MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
     MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
     int index = 0, ret=OMPI_SUCCESS;
     int cycles = 0;
     int i=0, j=0, l=0;
     int n=0; /* current position in total_bytes_per_process array */
     MPI_Aint bytes_remaining = 0; /* how many bytes are left unread in the
                                      current file-view entry */
     int *sorted_file_offsets=NULL, entries_per_aggregator=0;
     int bytes_received = 0;
     int blocks = 0;
     /* iovec structure and count of the buffer passed in */
     uint32_t iov_count = 0;
     struct iovec *decoded_iov = NULL;
     int iov_index = 0;
     size_t current_position = 0;
     struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
     char *receive_buf = NULL;
     MPI_Aint *memory_displacements=NULL;
     /* global iovec at the readers that contains the iovecs created from
        file_set_view */
     uint32_t total_fview_count = 0;
     int local_count = 0;
     int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
     int current_index=0, temp_index=0;
     int **blocklen_per_process=NULL;
     MPI_Aint **displs_per_process=NULL;
     char *global_buf = NULL;
     MPI_Aint global_count = 0;
     local_io_array *file_offsets_for_agg=NULL;

     /* array that contains the sorted indices of the global_iov */
     int *sorted = NULL;
     int *displs = NULL;
     int dynamic_num_io_procs;
     size_t max_data = 0;
     int *bytes_per_process = NULL;
     MPI_Aint *total_bytes_per_process = NULL;
     ompi_datatype_t **sendtype = NULL;
     MPI_Request *send_req=NULL, *recv_req=NULL;


 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
     double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
     double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
     mca_io_ompio_print_entry nentry;
 #endif


//     if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//	 fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//     }
     /**************************************************************************
      ** In case the data is not contiguous in memory, decode it into an iovec **
      **************************************************************************/
     if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
				    datatype,
				    count,
				    buf,
				    &max_data,
				    &decoded_iov,
				    &iov_count);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
     }
     else {
	 max_data = count * datatype->super.size;
     }

     if ( MPI_STATUS_IGNORE != status ) {
	 status->_ucount = max_data;
     }

     fh->f_get_num_aggregators ( &dynamic_num_io_procs);
     ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
				       dynamic_num_io_procs,
				       max_data);
     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     total_bytes_per_process = (MPI_Aint*)malloc
	 (fh->f_procs_per_group*sizeof(MPI_Aint));
     if (NULL == total_bytes_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&max_data,
				  1,
				  MPI_LONG,
				  total_bytes_per_process,
				  1,
				  MPI_LONG,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);
     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     for (i=0 ; i<fh->f_procs_per_group ; i++) {
	 total_bytes += total_bytes_per_process[i];
     }

     if (NULL != total_bytes_per_process) {
	 free (total_bytes_per_process);
	 total_bytes_per_process = NULL;
     }

     /*********************************************************************
      *** Generate the File offsets/lengths corresponding to this write ***
      ********************************************************************/
     ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
					     max_data,
					     &local_iov_array,
					     &local_count);

     if (ret != OMPI_SUCCESS){
	 goto exit;
     }



     /* #########################################################*/

     /*************************************************************
      *** ALLGather the File View information at all processes ***
      *************************************************************/

     fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
     if (NULL == fview_count) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&local_count,
				  1,
				  MPI_INT,
				  fview_count,
				  1,
				  MPI_INT,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);

     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
     if (NULL == displs) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     displs[0] = 0;
     total_fview_count = fview_count[0];
     for (i=1 ; i<fh->f_procs_per_group ; i++) {
	 total_fview_count += fview_count[i];
	 displs[i] = displs[i-1] + fview_count[i-1];
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<fh->f_procs_per_group ; i++) {
	     printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
		     fh->f_rank,
		     i,
		     fview_count[i],
		     displs[i]);
	 }
     }
 #endif

     /* allocate the global iovec  */
     if (0 != total_fview_count) {
       global_iov_array = (struct iovec*)malloc (total_fview_count *
						 sizeof(struct iovec));
       if (NULL == global_iov_array) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }
     }

     ret =  fh->f_allgatherv_array (local_iov_array,
				    local_count,
				    fh->f_iov_type,
				    global_iov_array,
				    fview_count,
				    displs,
				    fh->f_iov_type,
				    fh->f_aggregator_index,
				    fh->f_procs_in_group,
				    fh->f_procs_per_group,
				    fh->f_comm);

     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     /* sort it */
     if (0 != total_fview_count) {
	 sorted = (int *)malloc (total_fview_count * sizeof(int));
	 if (NULL == sorted) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	 }
	 fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
     }

     if (NULL != local_iov_array) {
	 free (local_iov_array);
	 local_iov_array = NULL;
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<total_fview_count ; i++) {
	     printf("%d: OFFSET: %p   LENGTH: %d\n",
		    fh->f_rank,
		    global_iov_array[sorted[i]].iov_base,
		    global_iov_array[sorted[i]].iov_len);
	 }
     }
 #endif

     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
       if (NULL == disp_index) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
       if (NULL == blocklen_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
       if (NULL == displs_per_process){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       for (i=0;i<fh->f_procs_per_group;i++){
	 blocklen_per_process[i] = NULL;
	 displs_per_process[i] = NULL;
       }
     }


     /*
      * Calculate how many bytes are read in each cycle
      */
     fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
     cycles = ceil((double)total_bytes/bytes_per_cycle);
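     /* For example, with total_bytes of 100 MB and a 32 MB bytes_per_cycle,
        the data is read and exchanged in 4 cycles, the last one partial. */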

     n = 0;
     bytes_remaining = 0;
     current_index = 0;


 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     start_rexch = MPI_Wtime();
 #endif
     for (index = 0; index < cycles; index++) {
       /* Getting ready for next cycle
	  Initializing and freeing buffers */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 if (NULL == sendtype){
	   sendtype = (ompi_datatype_t **)
	     malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	   if (NULL == sendtype) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 for(l=0;l<fh->f_procs_per_group;l++){

	   disp_index[l] =  1;

	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	   blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	   if (NULL == blocklen_per_process[l]) {
	     opal_output (1, "OUT OF MEMORY for blocklen\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	   if (NULL == displs_per_process[l]){
	     opal_output (1, "OUT OF MEMORY for displs\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }

	 if(NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements = NULL;
	 }
       }


       if (cycles-1 == index) {
	 bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
       }
       else {
	 bytes_to_read_in_cycle = bytes_per_cycle;
       }

 #if DEBUG_ON
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 printf ("****%d: CYCLE %d   Bytes %d**********\n",
		 fh->f_rank,
		 index,
                 bytes_to_read_in_cycle);
       }
 #endif

       /* Calculate how much data will be contributed in this cycle
	    by each process*/
       bytes_received = 0;

       while (bytes_to_read_in_cycle) {
	 blocks = fview_count[0];
	 for (j=0 ; j<fh->f_procs_per_group ; j++) {
	   if (sorted[current_index] < blocks) {
	     n = j;
	     break;
	   }
	   else {
	     blocks += fview_count[j+1];
	   }
	 }
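         /* n is now the group-local index of the process that owns
            file-view entry sorted[current_index]. */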
	 if (bytes_remaining) {
	   if (bytes_remaining <= bytes_to_read_in_cycle) {

	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_remaining;
	     }
	     current_index ++;
	     bytes_to_read_in_cycle -= bytes_remaining;
	     bytes_remaining = 0;
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n] = (int *) realloc
		 ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *) realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     continue;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len
		  - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining -= bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	 }
	 else {
	   if (bytes_to_read_in_cycle <
		   (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {

	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
	     }

	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
	       bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] =
		 global_iov_array[sorted[current_index]].iov_len;
	       displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		 global_iov_array[sorted[current_index]].iov_base;
	       blocklen_per_process[n] =
		 (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *)realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received +=
		 global_iov_array[sorted[current_index]].iov_len;
	     }
	     bytes_to_read_in_cycle -=
	       global_iov_array[sorted[current_index]].iov_len;
	     current_index ++;
	     continue;
	   }
	 }
       }
       /* Calculate the displacements for where to put the data and allocate
          the receive buffer (global_buf) */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 entries_per_aggregator=0;
	 for (i=0;i<fh->f_procs_per_group; i++){
	   for (j=0;j<disp_index[i];j++){
	     if (blocklen_per_process[i][j] > 0)
	       entries_per_aggregator++ ;
	   }
	 }
	 if (entries_per_aggregator > 0){
	   file_offsets_for_agg = (local_io_array *)
	     malloc(entries_per_aggregator*sizeof(local_io_array));
	   if (NULL == file_offsets_for_agg) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   sorted_file_offsets = (int *)
	     malloc (entries_per_aggregator*sizeof(int));
	   if (NULL == sorted_file_offsets){
	     opal_output (1, "OUT OF MEMORY\n");
	     ret =  OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   /*Moving file offsets to an IO array!*/
	   temp_index = 0;
	   global_count = 0;
	   for (i=0;i<fh->f_procs_per_group; i++){
	     for(j=0;j<disp_index[i];j++){
	       if (blocklen_per_process[i][j] > 0){
		   file_offsets_for_agg[temp_index].length =
		     blocklen_per_process[i][j];
		   global_count += blocklen_per_process[i][j];
		   file_offsets_for_agg[temp_index].process_id = i;
		   file_offsets_for_agg[temp_index].offset =
		     displs_per_process[i][j];
		   temp_index++;
	       }
	     }
	   }
	 }
	 else{
	   continue;
	 }

	 read_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets);

	 memory_displacements = (MPI_Aint *) malloc
	   (entries_per_aggregator * sizeof(MPI_Aint));
	 memory_displacements[sorted_file_offsets[0]] = 0;
	 for (i=1; i<entries_per_aggregator; i++){
	   memory_displacements[sorted_file_offsets[i]] =
	     memory_displacements[sorted_file_offsets[i-1]] +
	     file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	 }

	 global_buf = (char *) malloc (global_count * sizeof(char));
	 if (NULL == global_buf){
	   opal_output(1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }

	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }

	 fh->f_num_of_io_entries = 0;
	 fh->f_io_array[fh->f_num_of_io_entries].offset =
	     (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	 fh->f_io_array[fh->f_num_of_io_entries].length =
	   file_offsets_for_agg[sorted_file_offsets[0]].length;
	 fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	   global_buf+memory_displacements[sorted_file_offsets[0]];
	 fh->f_num_of_io_entries++;
	 for (i=1;i<entries_per_aggregator;i++){
	   if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
	       file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
	       file_offsets_for_agg[sorted_file_offsets[i]].offset){
	     fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	   }
	   else{
	     fh->f_io_array[fh->f_num_of_io_entries].offset =
		 (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	     fh->f_io_array[fh->f_num_of_io_entries].length =
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	     fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	       global_buf+memory_displacements[sorted_file_offsets[i]];
	     fh->f_num_of_io_entries++;
	   }
	 }


 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	 start_read_time = MPI_Wtime();
 #endif

	 if (fh->f_num_of_io_entries) {
	   if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
	     opal_output (1, "READ FAILED\n");
	     ret = OMPI_ERROR;
	     goto exit;
	   }
	 }

 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	 end_read_time = MPI_Wtime();
	 read_time += end_read_time - start_read_time;
 #endif
	 /**********************************************************
	  ******************** DONE READING ************************
	  *********************************************************/

	 temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
	 if (NULL == temp_disp_index) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
	 for (i=0; i<entries_per_aggregator; i++){
	   temp_index =
	     file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	   displs_per_process[temp_index][temp_disp_index[temp_index]] =
	     memory_displacements[sorted_file_offsets[i]];
	   if (temp_disp_index[temp_index] < disp_index[temp_index]){
	     temp_disp_index[temp_index] += 1;
	   }
	   else{
	     printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
		    temp_index, temp_disp_index[temp_index],
		    temp_index, disp_index[temp_index]);
	   }
	 }
	 if (NULL != temp_disp_index){
	   free(temp_disp_index);
	   temp_disp_index = NULL;
	 }

	 send_req = (MPI_Request *)
	   malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	 if (NULL == send_req){
	   opal_output ( 1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	 start_rcomm_time = MPI_Wtime();
 #endif
	 for (i=0;i<fh->f_procs_per_group;i++){
	   ompi_datatype_create_hindexed(disp_index[i],
					 blocklen_per_process[i],
					 displs_per_process[i],
					 MPI_BYTE,
					 &sendtype[i]);
	   ompi_datatype_commit(&sendtype[i]);
	   ret = MCA_PML_CALL (isend(global_buf,
				     1,
				     sendtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     MCA_PML_BASE_SEND_STANDARD,
				     fh->f_comm,
				     &send_req[i]));
	   if(OMPI_SUCCESS != ret){
	       goto exit;
	   }
	 }
 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	 end_rcomm_time = MPI_Wtime();
	 rcomm_time += end_rcomm_time - start_rcomm_time;
 #endif
       }

       /**********************************************************
	********* Scatter the Data from the readers **************
	*********************************************************/
       if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	 receive_buf = &((char*)buf)[position];
       }
       else if (bytes_received) {
         /* allocate a temporary receive buffer, since the data is
            non-contiguous in memory and cannot be received directly
            into the user buffer */
	 receive_buf = malloc (bytes_received);
	 if (NULL == receive_buf) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
       }

 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
       start_rcomm_time = MPI_Wtime();
 #endif
       recv_req = (MPI_Request *) malloc (sizeof (MPI_Request));
       if (NULL == recv_req){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       ret = MCA_PML_CALL(irecv(receive_buf,
				bytes_received,
				MPI_BYTE,
				fh->f_procs_in_group[fh->f_aggregator_index],
				123,
				fh->f_comm,
				recv_req));
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }


       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
         ret = ompi_request_wait_all (fh->f_procs_per_group,
                                      send_req,
                                      MPI_STATUSES_IGNORE);
	 if (OMPI_SUCCESS != ret){
	   goto exit;
	 }
       }

       ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
       position += bytes_received;

       /* If the data is not contiguous in memory, copy it from the
          receive buffer into the buffer passed in */
       if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 OPAL_PTRDIFF_TYPE mem_address;
	 size_t remaining = 0;
	 size_t temp_position = 0;

	 remaining = bytes_received;

	 while (remaining) {
	   mem_address = (OPAL_PTRDIFF_TYPE)
	     (decoded_iov[iov_index].iov_base) + current_position;

	   if (remaining >=
	       (decoded_iov[iov_index].iov_len - current_position)) {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     decoded_iov[iov_index].iov_len - current_position);
	     remaining = remaining -
	       (decoded_iov[iov_index].iov_len - current_position);
	     temp_position = temp_position +
	       (decoded_iov[iov_index].iov_len - current_position);
	     iov_index = iov_index + 1;
	     current_position = 0;
	   }
	   else {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     remaining);
	     current_position = current_position + remaining;
	     remaining = 0;
	   }
	 }

         if (NULL != receive_buf) {
           free (receive_buf);
           receive_buf = NULL;
         }
       }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
       end_rcomm_time = MPI_Wtime();
       rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

       if (NULL != recv_req){
	 free(recv_req);
	 recv_req = NULL;
       }
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	 fh->f_num_of_io_entries = 0;
	 if (NULL != fh->f_io_array) {
	   free (fh->f_io_array);
	   fh->f_io_array = NULL;
	 }
	 if (NULL != global_buf) {
	   free (global_buf);
	   global_buf = NULL;
	 }
	 for (i = 0; i < fh->f_procs_per_group; i++)
	   ompi_datatype_destroy(sendtype+i);
	 if (NULL != sendtype){
	   free(sendtype);
	   sendtype=NULL;
	 }
	 if (NULL != send_req){
	   free(send_req);
	   send_req = NULL;
	 }
	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }
	 if (NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != bytes_per_process){
	   free(bytes_per_process);
	   bytes_per_process =NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements= NULL;
	 }
       }
     }

 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     end_rexch = MPI_Wtime();
     read_exch += end_rexch - start_rexch;
     nentry.time[0] = read_time;
     nentry.time[1] = rcomm_time;
     nentry.time[2] = read_exch;
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
       nentry.aggregator = 1;
     else
       nentry.aggregator = 0;
     nentry.nprocs_for_coll = dynamic_num_io_procs;
     if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
       fh->f_register_print_entry(READ_PRINT_QUEUE,
				  nentry);
     }
 #endif

 exit:
     if (NULL != sorted) {
       free (sorted);
       sorted = NULL;
     }
     if (NULL != global_iov_array) {
       free (global_iov_array);
       global_iov_array = NULL;
     }
     if (NULL != fview_count) {
       free (fview_count);
       fview_count = NULL;
     }
     if (NULL != decoded_iov) {
       free (decoded_iov);
       decoded_iov = NULL;
     }
     if (NULL != local_iov_array){
       free(local_iov_array);
       local_iov_array=NULL;
     }

     if (NULL != displs) {
       free (displs);
       displs = NULL;
     }
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       if (NULL != disp_index){
	 free(disp_index);
	 disp_index = NULL;
       }

       if ( NULL != blocklen_per_process){
	 for(l=0;l<fh->f_procs_per_group;l++){
	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	 }

	 free(blocklen_per_process);
	 blocklen_per_process = NULL;
       }

       if (NULL != displs_per_process){
         for (l=0; l<fh->f_procs_per_group; l++){
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	 }
	 free(displs_per_process);
	 displs_per_process = NULL;
       }

     }
     return ret;
 }
Example #12
/*  
 * Linear functions are copied from the basic coll module.  For
 * some small number of nodes and/or small data sizes they are just as
 * fast as tuned/tree based segmenting operations and as such may be
 * selected by the decision functions.  These are copied into this module
 * due to the way we select modules in V1. i.e. in V2 we will handle this
 * differently and so will not have to duplicate code.  
 * GEF Oct05 after asking Jeff.  
 */
int
ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
                                            struct ompi_datatype_t *sdtype,
                                            void *rbuf, int *rcounts, int *rdisps,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int i, size, rank, err, nreqs;
    char *psnd, *prcv;
    ptrdiff_t sext, rext;
    MPI_Request *preq;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    if (MPI_IN_PLACE == sbuf) {
        return  mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
                                                              rdtype, comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoallv_intra_basic_linear rank %d", rank));

    ompi_datatype_type_extent(sdtype, &sext);
    ompi_datatype_type_extent(rdtype, &rext);

    /* Simple optimization - handle send to self first */
    psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext;
    prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext;
    if (0 != scounts[rank]) {
        err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
                              prcv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Now, initiate all send/recv to/from others. */
    nreqs = 0;
    preq = data->mcct_reqs;

    /* Post all receives first */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == rcounts[i]) {
            continue;
        }

        prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext;
        err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype,
                                      i, MCA_COLL_BASE_TAG_ALLTOALLV, comm,
                                      preq++));
        ++nreqs;
        if (MPI_SUCCESS != err) {
            ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
            return err;
        }
    }

    /* Now post all sends */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == scounts[i]) {
            continue;
        }

        psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext;
        err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype,
                                      i, MCA_COLL_BASE_TAG_ALLTOALLV,
                                      MCA_PML_BASE_SEND_STANDARD, comm,
                                      preq++));
        ++nreqs;
        if (MPI_SUCCESS != err) {
            ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
            return err;
        }
    }

    /* Start your engines.  This will never return an error. */
    MCA_PML_CALL(start(nreqs, data->mcct_reqs));

    /* Wait for them all.  If there's an error, note that we don't care
     * what the error was -- just that there *was* an error.  The PML
     * will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return the
     * error after we free everything. */
    err = ompi_request_wait_all(nreqs, data->mcct_reqs,
                                MPI_STATUSES_IGNORE);

    /* Free the requests. */
    ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);

    return err;
}
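/*
 * Illustrative sketch (not part of the original source): the linear
 * alltoallv above drives persistent PML requests through irecv_init /
 * isend_init, starts them together, waits for all of them, and frees
 * them even on error.  The same pattern at the public MPI level looks
 * roughly like the helper below; the function name and the one-int
 * payload per peer are assumptions made only for this example.
 */
#include <mpi.h>
#include <stdlib.h>

static int alltoall_persistent_sketch(int *sendvals, int *recvvals,
                                      MPI_Comm comm)
{
    int size, rank, i, nreqs = 0, err;
    MPI_Request *reqs;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    reqs = (MPI_Request *) malloc(2 * (size_t) size * sizeof(MPI_Request));
    if (NULL == reqs) {
        return MPI_ERR_NO_MEM;
    }

    /* Handle self first, then set up persistent requests for everyone else */
    recvvals[rank] = sendvals[rank];
    for (i = 0; i < size; ++i) {
        if (i == rank) {
            continue;
        }
        MPI_Recv_init(&recvvals[i], 1, MPI_INT, i, 0, comm, &reqs[nreqs++]);
        MPI_Send_init(&sendvals[i], 1, MPI_INT, i, 0, comm, &reqs[nreqs++]);
    }

    MPI_Startall(nreqs, reqs);                       /* start them all at once */
    err = MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);

    /* Persistent requests must be freed explicitly -- even after an error */
    for (i = 0; i < nreqs; ++i) {
        MPI_Request_free(&reqs[i]);
    }
    free(reqs);

    return err;
}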
static int two_phase_exchage_data(mca_io_ompio_file_t *fh,
				  void *buf,
				  char *write_buf,
				  struct iovec *offset_length,
				  int *send_size,int *start_pos,
				  int *recv_size,
				  OMPI_MPI_OFFSET_TYPE off,
				  OMPI_MPI_OFFSET_TYPE size, int *count,
				  int *partial_recv, int *sent_to_proc,
				  int contig_access_count,
				  OMPI_MPI_OFFSET_TYPE min_st_offset,
				  OMPI_MPI_OFFSET_TYPE fd_size,
				  OMPI_MPI_OFFSET_TYPE *fd_start,
				  OMPI_MPI_OFFSET_TYPE *fd_end,
				  Flatlist_node *flat_buf,
				  mca_io_ompio_access_array_t *others_req,
				  int *send_buf_idx, int *curr_to_proc,
				  int *done_to_proc, int iter,
				  int *buf_idx,MPI_Aint buftype_extent,
				  int striping_unit, int *aggregator_list,
				  int *hole){
  
    int *tmp_len=NULL, sum, *srt_len=NULL, nprocs_recv, nprocs_send,  k,i,j;
    int ret=OMPI_SUCCESS;
    MPI_Request *requests=NULL, *send_req=NULL;
    MPI_Datatype *recv_types=NULL;
    OMPI_MPI_OFFSET_TYPE *srt_off=NULL;
    char **send_buf = NULL; 
    
    
    ret = fh->f_comm->c_coll.coll_alltoall (recv_size,
					    1,
					    MPI_INT,
					    send_size,
					    1,
					    MPI_INT,
					    fh->f_comm,
					    fh->f_comm->c_coll.coll_alltoall_module);
    
    if ( OMPI_SUCCESS != ret ){
      return ret;
    }

    nprocs_recv = 0;
    for (i=0;i<fh->f_size;i++){
      if (recv_size[i]){
	nprocs_recv++;
      }
    }
    
    
    recv_types = (MPI_Datatype *)
	malloc (( nprocs_recv + 1 ) * sizeof(MPI_Datatype));
    
    if ( NULL == recv_types ){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    tmp_len = (int *) malloc(fh->f_size*sizeof(int));
    
    if ( NULL == tmp_len ) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    j = 0;
    for (i=0;i<fh->f_size;i++){
	if (recv_size[i]) {
	    if (partial_recv[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp_len[i] = others_req[i].lens[k];
		others_req[i].lens[k] = partial_recv[i];
	    }
	    MPI_Type_hindexed(count[i], 
			      &(others_req[i].lens[start_pos[i]]),
			      &(others_req[i].mem_ptrs[start_pos[i]]), 
			      MPI_BYTE, recv_types+j);
	    MPI_Type_commit(recv_types+j);
	    j++;
	}
    }

    sum = 0;
    for (i=0;i<fh->f_size;i++) sum += count[i];
    srt_off = (OMPI_MPI_OFFSET_TYPE *) 
      malloc((sum+1)*sizeof(OMPI_MPI_OFFSET_TYPE));
    
    if ( NULL == srt_off ){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    srt_len = (int *) malloc((sum+1)*sizeof(int));
    
    if ( NULL == srt_len ) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }


    two_phase_heap_merge(others_req, count, srt_off, srt_len, start_pos, fh->f_size,fh->f_rank,  nprocs_recv, sum);


    for (i=0; i<fh->f_size; i++) 
        if (partial_recv[i]) {
            k = start_pos[i] + count[i] - 1;
            others_req[i].lens[k] = tmp_len[i];
        }
    
    if ( NULL != tmp_len ){
      free(tmp_len); 
    }

    *hole = 0;
    if (off != srt_off[0]){
	*hole = 1;
    }
    else{
	for (i=1;i<sum;i++){
	    if (srt_off[i] <= srt_off[0] + srt_len[0]){
		int new_len = srt_off[i] + srt_len[i] - srt_off[0];
		if(new_len > srt_len[0]) 
		    srt_len[0] = new_len;
	    }
	    else
		break;
	}
	if (i < sum || size != srt_len[0])
	    *hole = 1;
    }


    if ( NULL != srt_off ){
      free(srt_off);
    }
    if ( NULL != srt_len ){
      free(srt_len);
    }

    if (nprocs_recv){
	if (*hole){
	    if (off > 0){
		fh->f_io_array = (mca_io_ompio_io_array_t *)malloc 
		    (sizeof(mca_io_ompio_io_array_t));
		if (NULL == fh->f_io_array) {
		    opal_output(1, "OUT OF MEMORY\n");
		    return OMPI_ERR_OUT_OF_RESOURCE;
		}
		fh->f_io_array[0].offset  =(IOVBASE_TYPE *)(intptr_t)off;
		fh->f_num_of_io_entries = 1;
		fh->f_io_array[0].length = size;
		fh->f_io_array[0].memory_address = write_buf;
		if (fh->f_num_of_io_entries){
		    if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
			opal_output(1, "READ FAILED\n");
			return OMPI_ERROR;
		    }
		}
		
	    }
	    fh->f_num_of_io_entries = 0;
	    if (NULL != fh->f_io_array) {
		free (fh->f_io_array);
		fh->f_io_array = NULL;
	    }
	}
    }
    
    nprocs_send = 0;
    for (i=0; i <fh->f_size; i++) if (send_size[i]) nprocs_send++;

    #if DEBUG_ON
    printf("%d : nprocs_send : %d\n", fh->f_rank,nprocs_send);
    #endif

    requests = (MPI_Request *) 	
	malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); 

    if ( NULL == requests ){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    j = 0;
    for (i=0; i<fh->f_size; i++) {
	if (recv_size[i]) {
	  ret = MCA_PML_CALL(irecv(MPI_BOTTOM,
				   1,
				   recv_types[j],
				   i,
				   fh->f_rank+i+100*iter,
				   fh->f_comm,
				   requests+j));

	  if ( OMPI_SUCCESS != ret ){
	    return ret;
	  }
	  j++;
	}
    }
    send_req = requests + nprocs_recv;
    
    
    if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	j = 0;
	for (i=0; i <fh->f_size; i++) 
	  if (send_size[i]) {
	    ret = MCA_PML_CALL(isend(((char *) buf) + buf_idx[i],
				     send_size[i],
				     MPI_BYTE,
				     i,
				     fh->f_rank+i+100*iter,
				     MCA_PML_BASE_SEND_STANDARD, 
				     fh->f_comm,
				     send_req+j));	

	    if ( OMPI_SUCCESS != ret ){
	      return ret;
	    }
	    
	    j++;
	    buf_idx[i] += send_size[i];
	  }
    }
    else if(nprocs_send && (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY))){
      send_buf = (char **) malloc(fh->f_size*sizeof(char*));
      if ( NULL == send_buf ){
	return OMPI_ERR_OUT_OF_RESOURCE;
      }
      for (i=0; i < fh->f_size; i++){
	if (send_size[i]) {
	  send_buf[i] = (char *) malloc(send_size[i]);
	  
	  if ( NULL == send_buf[i] ){
	    return OMPI_ERR_OUT_OF_RESOURCE;
	  }
	}
      }
      
      ret = two_phase_fill_send_buffer(fh, buf,flat_buf, send_buf,
				       offset_length, send_size,
				       send_req,sent_to_proc,
				       contig_access_count, 
				       min_st_offset, fd_size,
				       fd_start, fd_end, send_buf_idx,
				       curr_to_proc, done_to_proc,
				       iter, buftype_extent, striping_unit,
				       aggregator_list);
      
      if ( OMPI_SUCCESS != ret ){
	return ret;
      }
    }

    for (i=0; i<nprocs_recv; i++) MPI_Type_free(recv_types+i);
    free(recv_types);
    ret = ompi_request_wait_all (nprocs_send+nprocs_recv,
				 requests,
				 MPI_STATUSES_IGNORE);
    

    if ( NULL != requests ){
      free(requests);
    }
    
    return ret;
}
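/*
 * Illustrative sketch (not part of the original source): the hole check in
 * two_phase_exchage_data() coalesces the sorted (offset, length) pairs and
 * flags a hole whenever the coalesced run starting at srt_off[0] does not
 * cover exactly [off, off + size).  A minimal standalone version of that
 * logic is shown below; the function name and the use of long long for
 * offsets are assumptions made only for this sketch.
 */
static int detect_hole(const long long *srt_off, const int *srt_len,
                       int sum, long long off, long long size)
{
    long long run_len;
    int i;

    if (off != srt_off[0]) {
        return 1;                           /* gap before the first block */
    }
    run_len = srt_len[0];
    for (i = 1; i < sum; i++) {
        if (srt_off[i] > srt_off[0] + run_len) {
            return 1;                       /* gap between two blocks */
        }
        if (srt_off[i] + srt_len[i] - srt_off[0] > run_len) {
            run_len = srt_off[i] + srt_len[i] - srt_off[0];
        }
    }
    return (size != run_len) ? 1 : 0;       /* run must match the request exactly */
}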
static int two_phase_fill_send_buffer(mca_io_ompio_file_t *fh,
				      void *buf,
				      Flatlist_node *flat_buf,
				      char **send_buf,
				      struct iovec *offset_length,
				      int *send_size,
				      MPI_Request *requests,
				      int *sent_to_proc,
				      int contig_access_count, 
				      OMPI_MPI_OFFSET_TYPE min_st_offset,
				      OMPI_MPI_OFFSET_TYPE fd_size,
				      OMPI_MPI_OFFSET_TYPE *fd_start,
				      OMPI_MPI_OFFSET_TYPE *fd_end,
				      int *send_buf_idx,
				      int *curr_to_proc, 
				      int *done_to_proc,
				      int iter, MPI_Aint buftype_extent,
				      int striping_unit, int *aggregator_list){

    int i, p, flat_buf_idx;
    OMPI_MPI_OFFSET_TYPE flat_buf_sz, size_in_buf, buf_incr, size;
    int jj, n_buftypes, ret=OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE off, len, rem_len, user_buf_idx;

    for (i=0; i < fh->f_size; i++) {
	send_buf_idx[i] = curr_to_proc[i] = 0;
	done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;
    
    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];
    
    for (i=0; i<contig_access_count; i++) { 
	
	off     = (OMPI_MPI_OFFSET_TYPE)(intptr_t)offset_length[i].iov_base;
	rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len;
	

	while (rem_len != 0) {
	    len = rem_len;
	    p = mca_fcoll_two_phase_calc_aggregator(fh,
						    off,
						    min_st_offset,
						    &len,
						    fd_size,
						    fd_start,
						    fd_end,
						    striping_unit,
						    mca_fcoll_two_phase_num_io_procs,
						    aggregator_list);

	    if (send_buf_idx[p] < send_size[p]) {
		if (curr_to_proc[p]+len > done_to_proc[p]) {
		    if (done_to_proc[p] > curr_to_proc[p]) {
			size = OMPIO_MIN(curr_to_proc[p] + len - 
					 done_to_proc[p], send_size[p]-send_buf_idx[p]);
			buf_incr = done_to_proc[p] - curr_to_proc[p];
			TWO_PHASE_BUF_INCR
		        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
			curr_to_proc[p] = done_to_proc[p] + size;
		        TWO_PHASE_BUF_COPY
		    }
		    else {
			size = OMPIO_MIN(len,send_size[p]-send_buf_idx[p]);
			buf_incr = len;
			curr_to_proc[p] += size;
			TWO_PHASE_BUF_COPY
		    }
		    if (send_buf_idx[p] == send_size[p]) {

		      ret = MCA_PML_CALL(isend(send_buf[p],
					       send_size[p],
					       MPI_BYTE,
					       p,
					       fh->f_rank+p+100*iter,
					       MCA_PML_BASE_SEND_STANDARD, 
					       fh->f_comm,
					       requests+jj));	
		      
		      if ( OMPI_SUCCESS != ret ){
			return ret;
		      }
		      jj++;
		    }
		}
		else {
		    curr_to_proc[p] += len;
		    buf_incr = len;
		    TWO_PHASE_BUF_INCR
		}
	    }
Example #15
0
static int two_phase_exchange_data(mca_io_ompio_file_t *fh,
				   void *buf, struct iovec *offset_len,
				   int *send_size, int *start_pos, int *recv_size,
				   int *count, int *partial_send, 
				   int *recd_from_proc, int contig_access_count,
				   OMPI_MPI_OFFSET_TYPE min_st_offset,
				   OMPI_MPI_OFFSET_TYPE fd_size,
				   OMPI_MPI_OFFSET_TYPE *fd_start,
				   OMPI_MPI_OFFSET_TYPE *fd_end, 
				   Flatlist_node *flat_buf,
				   mca_io_ompio_access_array_t *others_req, 
				   int iter, size_t *buf_idx, 
				   MPI_Aint buftype_extent, int striping_unit,
				   int *aggregator_list)
{
  
  int i=0, j=0, k=0, tmp=0, nprocs_recv=0, nprocs_send=0;
  int ret = OMPI_SUCCESS;
  char **recv_buf = NULL;
  MPI_Request *requests=NULL;
  MPI_Datatype send_type;


#if TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif

  ret = fh->f_comm->c_coll.coll_alltoall (send_size,
					  1,
					  MPI_INT,
					  recv_size,
					  1,
					  MPI_INT,
					  fh->f_comm,
					  fh->f_comm->c_coll.coll_alltoall_module);

  if ( OMPI_SUCCESS != ret ){
    return ret;
  }
  
  
#if DEBUG
  for (i=0; i<fh->f_size; i++){
    printf("%d: RS[%d]: %d\n", fh->f_rank,
	   i,
	   recv_size[i]);
  }
#endif

  
  nprocs_recv = 0;
  for (i=0; i < fh->f_size; i++) 
    if (recv_size[i]) nprocs_recv++;

  nprocs_send = 0;
  for (i=0; i< fh->f_size; i++) 
    if (send_size[i]) nprocs_send++;

  requests = (MPI_Request *)
    malloc((nprocs_send+nprocs_recv+1) *  sizeof(MPI_Request));
  
  if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
    j = 0;
    for (i=0; i < fh->f_size; i++){
      if (recv_size[i]){
	ret = MCA_PML_CALL(irecv(((char *) buf)+ buf_idx[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	
	if ( OMPI_SUCCESS != ret ){
	  return ret;
	}
	j++;
	buf_idx[i] += recv_size[i];
      }
    }
  }
  else{

    recv_buf = (char **)malloc(fh->f_size * sizeof(char *));
    if (NULL == recv_buf){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i=0; i < fh->f_size; i++)
      if(recv_size[i]) recv_buf[i] = 
			 (char *) malloc (recv_size[i] *  sizeof(char));
    j = 0;
    for(i=0; i<fh->f_size; i++)
      if (recv_size[i]) {
	ret = MCA_PML_CALL(irecv(recv_buf[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	j++;
	
      }
  }
    
  

  j = 0;
  for (i = 0; i< fh->f_size; i++){
    if (send_size[i]){
      if (partial_send[i]){
	k = start_pos[i] + count[i] - 1;
	tmp = others_req[i].lens[k];
	others_req[i].lens[k] = partial_send[i];
      }

      MPI_Type_hindexed(count[i],
			&(others_req[i].lens[start_pos[i]]),
			&(others_req[i].mem_ptrs[start_pos[i]]),
			MPI_BYTE,
			&send_type);
      MPI_Type_commit(&send_type);

      ret = MCA_PML_CALL(isend(MPI_BOTTOM,
			       1,
			       send_type,
			       i,
			       fh->f_rank+i+100*iter,
			       MCA_PML_BASE_SEND_STANDARD,
			       fh->f_comm,
			       requests+nprocs_recv+j));
      MPI_Type_free(&send_type);
      if (partial_send[i]) others_req[i].lens[k] = tmp;
      j++;
    }
  }


  if (nprocs_recv) {

    ret = ompi_request_wait_all(nprocs_recv,
				requests, 
				MPI_STATUSES_IGNORE);
    
    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

      two_phase_fill_user_buffer(fh, buf, flat_buf,
				 recv_buf, offset_len,
				 (unsigned *)recv_size, requests,
				 recd_from_proc, contig_access_count,
				 min_st_offset, fd_size, fd_start, fd_end,
				 buftype_extent, striping_unit, aggregator_list);
    }
  }

  ret = ompi_request_wait_all(nprocs_send,
			      requests+nprocs_recv, 
			      MPI_STATUSES_IGNORE);

  if (NULL != requests){
    free(requests);
    requests = NULL;
  }
  
  if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)){
    for (i=0; i< fh->f_size; i++){
      if (recv_size[i]){
	free(recv_buf[i]);
      }
    }
    free(recv_buf);
  }

#if TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += (end_rcomm_time - start_rcomm_time);
#endif

  return ret;

}
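/*
 * Illustrative sketch (not part of the original source): the exchange
 * routines above send scattered memory regions without packing by building
 * an hindexed datatype of absolute addresses and posting the send from
 * MPI_BOTTOM.  A minimal standalone version of that pattern using the
 * public MPI API; the helper name and the two-region layout are
 * assumptions made only for this example.
 */
#include <mpi.h>

static int send_two_regions(char *a, int alen, char *b, int blen,
                            int dest, int tag, MPI_Comm comm)
{
    MPI_Aint addrs[2];
    int lens[2];
    MPI_Datatype dtype;
    int err;

    lens[0] = alen;
    lens[1] = blen;
    MPI_Get_address(a, &addrs[0]);   /* absolute addresses as displacements */
    MPI_Get_address(b, &addrs[1]);

    MPI_Type_create_hindexed(2, lens, addrs, MPI_BYTE, &dtype);
    MPI_Type_commit(&dtype);

    /* With absolute displacements the buffer argument is MPI_BOTTOM */
    err = MPI_Send(MPI_BOTTOM, 1, dtype, dest, tag, comm);

    MPI_Type_free(&dtype);
    return err;
}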
static int send_nb( dte_data_representation_t data,
                    uint32_t count,
                    void *buffer,
                    rte_ec_handle_t ec_h,
                    rte_grp_handle_t grp_h,
                    uint32_t tag,
                    rte_request_handle_t *req)
{
    ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;

#if RTE_DEBUG
    assert(ec_h.group == grp_h);
#endif

    if (! ec_h.handle) {
        fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: "
                "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank);
        return 1;
    }
    if (HCOL_DTE_IS_INLINE(data)){
        /*do inline nb recv*/
        int rc;
        size_t size;
        ompi_request_t *ompi_req;
        ompi_free_list_item_t *item;
        if (!buffer && !HCOL_DTE_IS_ZERO(data)) {
            fprintf(stderr, "***Error in hcolrte_rml_send_nb: buffer pointer is NULL"
                    " for non DTE_ZERO INLINE data representation\n");
            return 1;
        }
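        /* packed_size is per element and appears to be given in bits,
         * hence the division by 8 below to obtain the byte count. */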
        size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8;
        HCOL_VERBOSE(30,"PML_ISEND: dest = %d: buf = %p: size = %u: comm = %p",
                        ec_h.rank, buffer, (unsigned int)size, (void *)comm);
        if (MCA_PML_CALL(isend(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank,
                               tag,MCA_PML_BASE_SEND_STANDARD,comm,&ompi_req)))
        {
            return 1;
        }
        req->data = (void *)ompi_req;
        req->status = HCOLRTE_REQUEST_ACTIVE;
    }else{
        int total_entries_number;
        int i;
        unsigned int j;
        void *buf;
        uint64_t len;
        int repeat_count;
        struct dte_struct_t * repeat;
        if (NULL != buffer) {
            /* We have a full data description & buffer pointer simultaneously.
               It is ambiguous. Throw a warning since the user might have made a
               mistake with data reps*/
            fprintf(stderr,"Warning: buffer_pointer != NULL for NON-inline data representation: buffer_pointer is ignored.\n");
        }
        total_entries_number = count_total_dte_repeat_entries(&data);
        repeat = data.rep.general_rep->data_representation.data->repeat;
        repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
        for (i=0; i< repeat_count; i++){
            for (j=0; j<repeat[i].n_elements; j++){
                char *repeat_unit = (char *)&repeat[i];
                buf = (void *)(repeat_unit+repeat[i].elements[j].base_offset);
                len = repeat[i].elements[j].packed_size;
                send_nb(DTE_BYTE,len,buf,ec_h,grp_h,tag,req);
            }
        }
    }
    return HCOLL_SUCCESS;
}
int
mca_fcoll_dynamic_file_write_all (ompio_file_t *fh,
                                  const void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written*/
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total written in each cycle by each process*/
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current
                                     value from total_bytes_per_process */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;


    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request send_req=NULL, *recv_req=NULL;
    int my_aggregator=-1;
    bool sendbuf_is_contiguous = false;
    size_t ftype_size;
    ptrdiff_t ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contiguous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (ptrdiff_t) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        sendbuf_is_contiguous = true;
    }



    if (! sendbuf_is_contiguous ) {
        ret =   mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh,
                                                  datatype,
                                                  count,
                                                  buf,
                                                  &max_data,
                                                  &decoded_iov,
                                                  &iov_count);
        if (OMPI_SUCCESS != ret ){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }

    dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators"));
    if ( OMPI_ERR_MAX == dynamic_num_io_procs ) {
        ret = OMPI_ERROR;
        goto exit;
    }
    ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh,
				                 dynamic_num_io_procs,
				                 max_data);

    if (OMPI_SUCCESS != ret){
	goto exit;
    }
    my_aggregator = fh->f_procs_in_group[0];
    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the local offsets/lengths array corresponding to
     ***    this write operation
     ********************************************************************/
    ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh,
					    max_data,
					    &local_iov_array,
					    &local_count);
    if (ret != OMPI_SUCCESS){
	goto exit;
    }

#if DEBUG_ON
    for (i=0 ; i<local_count ; i++) {

        printf("%d: OFFSET: %d   LENGTH: %ld\n",
               fh->f_rank,
               local_iov_array[i].iov_base,
               local_iov_array[i].iov_len);

    }
#endif

    /*************************************************************
     *** 4. Allgather the offset/lengths array from all processes
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */

    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*) malloc (total_fview_count *
                                                   sizeof(struct iovec));
        if (NULL == global_iov_array){
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array,
                                            local_count,
                                            fh->f_iov_type,
                                            global_iov_array,
                                            fview_count,
                                            displs,
                                            fh->f_iov_type,
                                            0,
                                            fh->f_procs_in_group,
                                            fh->f_procs_per_group,
                                            fh->f_comm);
    if (OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    /****************************************************************************************
    *** 5. Sort the global offset/lengths list based on the offsets.
    *** The result of the sort operation is 'sorted', an integer array,
    *** which contains the indexes of the global_iov_array based on the offset.
    *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
    *** in the file, and that one is followed by global_iov_array[z].offset, then
    *** sorted[0] = x, sorted[1] = y and sorted[2] = z;
    ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
	free(local_iov_array);
	local_iov_array = NULL;
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }


#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        uint32_t tv=0;
        for (tv=0 ; tv<total_fview_count ; tv++) {
            printf("%d: OFFSET: %lld   LENGTH: %ld\n",
                   fh->f_rank,
                   global_iov_array[sorted[tv]].iov_base,
                   global_iov_array[sorted[tv]].iov_len);
        }
    }
#endif
    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    bytes_per_cycle = fh->f_bytes_per_agg;
    cycles = ceil((double)total_bytes/bytes_per_cycle);
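    /* Worked example (assumed values, for illustration only): with
       f_bytes_per_agg = 32 MB and total_bytes = 100 MB this yields
       cycles = ceil(100/32) = 4, and the last cycle handles only the
       remaining 4 MB. */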

    if (my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request));
	if ( NULL == recv_req ) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf  = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	if (NULL == recvtype) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}
	for(l=0;l<fh->f_procs_per_group;l++){
            recvtype[l] = MPI_DATATYPE_NULL;
	}
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
	    fh->f_num_of_io_entries = 0;

            if (NULL != recvtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
                    if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                        ompi_datatype_destroy(&recvtype[i]);
			recvtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                free(blocklen_per_process[l]);
                free(displs_per_process[l]);

                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }

            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }

        } /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually written in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_write_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %lld**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_write_in_cycle);
        }
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/

#if DEBUG_ON
        printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_sent = 0;

        /* The blocklen and displs calculation is only done at the aggregators! */
        while (bytes_to_write_in_cycle) {

	    /* This next block identifies which process is the holder
	    ** of the sorted[current_index] element;
	    */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */

                if (bytes_remaining <= bytes_to_write_in_cycle) {
                    /* The data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);

                        /* In this case the length is consumed, so allocate for the
                           next displacement and blocklength */
                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_write_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than the
		       bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;
                    }
                    bytes_remaining -= bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
            }
            else {
                 /* No partially used entry available, have to start a new one */
                if (bytes_to_write_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                     /* This entry has more data than we can send in one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;

                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)
                            global_iov_array[sorted[current_index]].iov_base;

                        /* The total length of this entry has been consumed, so
                           reallocate for the next blocklength/displacement pair
                           and assign this displacement. */
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_write_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        }


        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the receive buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }

#if DEBUG_ON
            printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
                   bytes_sent);
            printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                /*Moving file offsets to an IO array!*/
                temp_index = 0;

                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;

#if DEBUG_ON
                            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                                   index+1,fh->f_rank);

                            printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                                   fh->f_procs_in_group[i],j,
                                   blocklen_per_process[i][j],j,
                                   displs_per_process[i][j],
                                   fh->f_rank);
#endif
                        }
                    }
                }
            }
            else{
                continue;
            }
            /* Sort the displacements for each aggregator*/
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

            /* Create contiguous memory displacements based on the block
               lengths (stored in the same displs array) and map them to this
               aggregator's actual file displacements (kept in the io-array
               created above). */
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));

            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
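            /* Worked example for the loop above (assumed lengths, for
               illustration only): if the sorted entries have lengths 4, 8
               and 2, their memory displacements inside global_buf become
               0, 4 and 12. */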

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            /*Now update the displacements array  with memory offsets*/
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }

            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON

            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0;i<fh->f_procs_per_group; i++){
                for(j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

                    }
                }
            }
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]]);
            }
            printf("%d : global_count : %ld, bytes_sent : %d\n",
                   fh->f_rank,global_count, bytes_sent);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
        /*************************************************************************
	 *** 7e. Perform the actual communication
	 *************************************************************************/
            for (i=0;i<fh->f_procs_per_group; i++) {
                recv_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &recvtype[i]);
                    ompi_datatype_commit(&recvtype[i]);
                    opal_datatype_type_size(&recvtype[i]->super, &datatype_size);

                    if (datatype_size){
                        ret = MCA_PML_CALL(irecv(global_buf,
                                                 1,
                                                 recvtype[i],
                                                 fh->f_procs_in_group[i],
                                                 123,
                                                 fh->f_comm,
                                                 &recv_req[i]));
                        if (OMPI_SUCCESS != ret){
                            goto exit;
                        }
                    }
                }
            }
        } /* end if (my_aggregator == fh->f_rank ) */


        if ( sendbuf_is_contiguous ) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_sent) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it, in case the data is non-contiguous
               in memory */
            ptrdiff_t mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_sent);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            remaining = bytes_sent;

            while (remaining) {
                mem_address = (ptrdiff_t)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *) mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
	}
	total_bytes_written += bytes_sent;

	/* Gather the sendbuf from each process into the appropriate locations
           at the aggregators */

	if (bytes_sent){
            ret = MCA_PML_CALL(isend(send_buf,
                                     bytes_sent,
                                     MPI_BYTE,
                                     my_aggregator,
                                     123,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     fh->f_comm,
                                     &send_req));


	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	}

	if (my_aggregator == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUSES_IGNORE);

            if (OMPI_SUCCESS != ret){
                goto exit;
            }
	}

#if DEBUG_ON
	if (my_aggregator == fh->f_rank){
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<global_count/4 ; i++)
                printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif

        if (! sendbuf_is_contiguous) {
            if (NULL != send_buf) {
                free (send_buf);
                send_buf = NULL;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += (end_comm_time - start_comm_time);
#endif
        /**********************************************************
         *** 7f. Create the io array, and pass it to fbtl
         *********************************************************/

	if (my_aggregator == fh->f_rank) {

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif

            fh->f_io_array = (mca_common_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;

            for (i=1;i<entries_per_aggregator;i++){
                /* If the entries are contiguous merge them,
                   else make a new entry */
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }

            }
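            /* Example of the merge above (illustrative values): entries
               (offset 0, length 4) and (offset 4, length 8) collapse into a
               single io entry (offset 0, length 12), while a following entry
               at offset 100 starts a new io entry. */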

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (ptrdiff_t)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }

#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif


	} /* end if (my_aggregator == fh->f_rank) */
    } /* end  for (index = 0; index < cycles; index++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (my_aggregator == fh->f_rank)
	nentry.aggregator = 1;
    else
	nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_write_time,
                                              nentry);
    }
#endif


exit :
    if (my_aggregator == fh->f_rank) {
        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if(NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements = NULL;
        }
        if (NULL != recvtype){
            for (i =0; i< fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                    ompi_datatype_destroy(&recvtype[i]);
                }
            }
            free(recvtype);
            recvtype=NULL;
        }

        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }
        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }
        if (NULL != recvtype){
            free(recvtype);
            recvtype=NULL;
        }
        if (NULL != recv_req){
            free(recv_req);
            recv_req = NULL;
        }
        if (NULL != global_buf) {
            free (global_buf);
            global_buf = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process){
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process){
                free(displs_per_process[l]);
            }
        }

        free(blocklen_per_process);
        free(displs_per_process);
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }

    if (! sendbuf_is_contiguous) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }


    return OMPI_SUCCESS;
}
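A minimal, self-contained sketch (not from the fcoll source) of the entry-merging step used above: sorted (offset, length) pairs that are contiguous in the file are coalesced into a single I/O entry before pwritev is invoked. The struct, helper name, and values below are illustrative assumptions.

/* Hedged sketch of the contiguous-entry coalescing done above. */
#include <stdio.h>
#include <stddef.h>

struct io_entry {
    long   offset;
    size_t length;
};

/* Merge already-sorted entries in place; assumes n >= 1, returns merged count. */
static int merge_contiguous(struct io_entry *e, int n)
{
    int out = 1;
    for (int i = 1; i < n; ++i) {
        if (e[out - 1].offset + (long) e[out - 1].length == e[i].offset) {
            e[out - 1].length += e[i].length;   /* contiguous: extend previous entry */
        } else {
            e[out++] = e[i];                    /* gap: start a new entry */
        }
    }
    return out;
}

int main(void)
{
    struct io_entry entries[] = {
        { 0, 100 }, { 100, 50 }, { 150, 50 }, { 300, 10 }
    };
    int n = merge_contiguous(entries, 4);

    for (int i = 0; i < n; ++i) {
        printf("entry %d: offset=%ld length=%zu\n",
               i, entries[i].offset, entries[i].length);
    }
    /* prints two entries: [0, 200) and [300, 310) */
    return 0;
}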
static int
mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount,
                                       struct ompi_datatype_t *sdtype, void *rbuf,
                                       int rcount, struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    ompi_request_t **reqs, **preqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, dim, nreqs;

    ompi_datatype_get_extent(rdtype, &lb, &extent);

    reqs = preqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims );
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    /* The ordering is defined as -1 then +1 in each dimension in
     * order of dimension. */
    for (dim = 0, nreqs = 0 ; dim < cart->ndims ; ++dim) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;

            nreqs++;
            /* remove cast from const when the pml layer is updated to take
             * a const for the send buffer. */
            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        rbuf = (char *) rbuf + extent * rcount;

        if (MPI_PROC_NULL != drank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;

            nreqs++;
            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        rbuf = (char *) rbuf + extent * rcount;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, nreqs);
        return rc;
    }

    rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, nreqs);
    }
    return rc;
}
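A hedged usage sketch of the public call this routine can back: MPI_Neighbor_allgather on a 1-D periodic Cartesian communicator, where each rank receives first from its -1 neighbour and then from its +1 neighbour, matching the ordering comment above. The dimensionality and payload are illustrative assumptions.

/* Hedged usage sketch: MPI_Neighbor_allgather over a 1-D periodic ring. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Comm cart;
    int dims[1] = {0}, periods[1] = {1};
    int rank, size, sendval, recvvals[2];   /* two neighbours in 1-D */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    dims[0] = size;
    MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periods, 0, &cart);
    MPI_Comm_rank(cart, &rank);

    sendval = rank;
    /* Each rank receives one int from the -1 neighbour and one from the
     * +1 neighbour, in that order. */
    MPI_Neighbor_allgather(&sendval, 1, MPI_INT, recvvals, 1, MPI_INT, cart);

    printf("rank %d: left=%d right=%d\n", rank, recvvals[0], recvvals[1]);

    MPI_Comm_free(&cart);
    MPI_Finalize();
    return 0;
}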
Example #19
/* Arguments not used in this implementation:
 *  - bridgecomm
 *  - local_leader
 *  - remote_leader
 *  - send_first
 */
static int ompi_comm_allreduce_inter ( int *inbuf, int *outbuf, 
                                       int count, struct ompi_op_t *op, 
                                       ompi_communicator_t *intercomm,
                                       ompi_communicator_t *bridgecomm, 
                                       void* local_leader, 
                                       void* remote_leader, 
                                       int send_first )
{
    int local_rank, rsize;
    int i, rc;
    int *sbuf;
    int *tmpbuf=NULL;
    int *rcounts=NULL, scount=0;
    int *rdisps=NULL;

    if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op &&
         &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op  != op ) {
        return MPI_ERR_OP;
    }

    if ( !OMPI_COMM_IS_INTER (intercomm)) {
        return MPI_ERR_COMM;
    }

    /* Allocate temporary arrays */
    rsize      = ompi_comm_remote_size (intercomm);
    local_rank = ompi_comm_rank ( intercomm );

    tmpbuf  = (int *) malloc ( count * sizeof(int));
    rdisps  = (int *) calloc ( rsize, sizeof(int));
    rcounts = (int *) calloc ( rsize, sizeof(int) );
    if ( OPAL_UNLIKELY (NULL == tmpbuf || NULL == rdisps || NULL == rcounts)) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Execute the inter-allreduce: the result of our group will
       be in the buffer of the remote group */
    rc = intercomm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT,
                                            op, intercomm,
                                            intercomm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if ( 0 == local_rank ) {
        MPI_Request req;

        /* for the allgatherv later */
        scount = count;

        /* local leaders exchange their data and determine the overall result
           for both groups */
        rc = MCA_PML_CALL(irecv (outbuf, count, MPI_INT, 0, 
                                 OMPI_COMM_ALLREDUCE_TAG,
                                 intercomm, &req));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, 0,
                                OMPI_COMM_ALLREDUCE_TAG,
                                MCA_PML_BASE_SEND_STANDARD,
                                intercomm));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = ompi_request_wait ( &req, MPI_STATUS_IGNORE );
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        if ( &ompi_mpi_op_max.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] > outbuf[i]) {
                    outbuf[i] = tmpbuf[i];
                }
            }
        }
        else if ( &ompi_mpi_op_min.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] < outbuf[i]) {
                    outbuf[i] = tmpbuf[i];
                }
            }
        }
        else if ( &ompi_mpi_op_sum.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] += tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_prod.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] *= tmpbuf[i];
            }
        }
    }

    /* Distribute the overall result to all processes in the other group.
       Instead of a bcast, we use an allgatherv here to avoid a possible
       deadlock.  Otherwise we would need an algorithm to determine which
       group sends first in the inter-bcast and which receives the result
       first.
    */
    rcounts[0] = count;
    sbuf       = outbuf;
    rc = intercomm->c_coll.coll_allgatherv (sbuf, scount, MPI_INT, outbuf,
                                            rcounts, rdisps, MPI_INT, 
                                            intercomm,
                                            intercomm->c_coll.coll_allgatherv_module);

 exit:
    if ( NULL != tmpbuf ) {
        free ( tmpbuf );
    }
    if ( NULL != rcounts ) {
        free ( rcounts );
    }
    if ( NULL != rdisps ) {
        free ( rdisps );
    }
    
    return (rc);
}
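The element-wise combine performed by the local leaders above can also be expressed with MPI_Reduce_local, which applies an MPI_Op to two local buffers without any communication. A minimal sketch with illustrative data (not part of the original routine):

/* Hedged sketch: MPI_Reduce_local as a stand-in for the hand-rolled
 * max/min/sum/prod loops above. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int tmpbuf[4] = {1, 7, 3, 9};   /* remote group's contribution (example data) */
    int outbuf[4] = {4, 2, 8, 5};   /* our group's contribution (example data)    */

    MPI_Init(&argc, &argv);

    /* outbuf[i] = max(outbuf[i], tmpbuf[i]) -- same effect as the MAX branch. */
    MPI_Reduce_local(tmpbuf, outbuf, 4, MPI_INT, MPI_MAX);

    for (int i = 0; i < 4; ++i) {
        printf("%d ", outbuf[i]);   /* prints: 4 7 8 9 */
    }
    printf("\n");

    MPI_Finalize();
    return 0;
}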
Example #20
/*
 *	barrier_intra_log
 *
 *	Function:	- barrier using O(log(N)) algorithm
 *	Accepts:	- same as MPI_Barrier()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    int i;
    int err;
    int peer;
    int dim;
    int hibit;
    int mask;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    /* Send null-messages up and down the tree.  Synchronization at the
     * root (rank 0). */

    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* Receive from children. */

    for (i = dim, mask = 1 << i; i > hibit; --i, mask >>= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    comm, MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Send to and receive from parent. */

    if (rank > 0) {
        peer = rank & ~(1 << hibit);
        err =
            MCA_PML_CALL(send
                         (NULL, 0, MPI_BYTE, peer,
                          MCA_COLL_BASE_TAG_BARRIER,
                          MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                MCA_COLL_BASE_TAG_BARRIER,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* Send to children. */

    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
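A small, MPI-free sketch of the tree arithmetic used above: for a given rank it prints the parent (rank & ~(1 << hibit)) and the children (rank | mask for the bits above hibit). hibit() here is a local stand-in for opal_hibit(), and the size/rank/dim values are illustrative assumptions.

/* Hedged sketch of the hypercube parent/child computation. */
#include <stdio.h>

/* Stand-in for opal_hibit(): highest set bit of `value` below bit `start`. */
static int hibit(int value, int start)
{
    unsigned int mask;
    --start;
    for (mask = 1u << start; start >= 0; --start, mask >>= 1) {
        if (value & mask) {
            break;
        }
    }
    return start;   /* -1 when value == 0, i.e. the root */
}

int main(void)
{
    int size = 13, rank = 5;   /* illustrative */
    int dim = 4;               /* smallest d with 2^d >= size */
    int hb = hibit(rank, dim);
    int i, mask;

    if (rank > 0) {
        printf("rank %d: parent = %d\n", rank, rank & ~(1 << hb));
    }
    for (i = dim - 1, mask = 1 << i; i > hb; --i, mask >>= 1) {
        int peer = rank | mask;
        if (peer < size) {
            printf("rank %d: child  = %d\n", rank, peer);
        }
    }
    return 0;
}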
Example #21
/*
 * alltoall_intra_linear_sync
 *
 * Function:       Linear implementation of alltoall with limited number
 *                 of outstanding requests.
 * Accepts:        Same as MPI_Alltoall(), and the maximum number of
 *                 outstanding requests (actual number is 2 * max, since
 *                 we count receive and send requests separately).
 * Returns:        MPI_SUCCESS or error code
 *
 * Description:    Algorithm is the following:
 *                 1) post K irecvs, K <= N
 *                 2) post K isends, K <= N
 *                 3) while not done
 *                    - wait for any request to complete
 *                    - replace that request by the new one of the same type.
 */
int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void* rbuf, int rcount,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               int max_outstanding_reqs)
{
    int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs;
    char *psnd, *prcv;
    ptrdiff_t slb, sext, rlb, rext;

    ompi_request_t **reqs = NULL;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank));

    error = ompi_datatype_get_extent(sdtype, &slb, &sext);
    if (OMPI_SUCCESS != error) {
        return error;
    }
    sext *= scount;

    error = ompi_datatype_get_extent(rdtype, &rlb, &rext);
    if (OMPI_SUCCESS != error) {
        return error;
    }
    rext *= rcount;

    /* simple optimization */

    psnd = ((char *) sbuf) + (ptrdiff_t)rank * sext;
    prcv = ((char *) rbuf) + (ptrdiff_t)rank * rext;

    error = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype);
    if (MPI_SUCCESS != error) {
        return error;
    }

    /* If only one process, we're done. */

    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Initiate send/recv to/from others. */
    total_reqs =  (((max_outstanding_reqs > (size - 1)) ||
                    (max_outstanding_reqs <= 0)) ?
                   (size - 1) : (max_outstanding_reqs));
    reqs = coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
    if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }

    prcv = (char *) rbuf;
    psnd = (char *) sbuf;

    /* Post first batch of irecv and isend requests */
    for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
         ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
        error =
            MCA_PML_CALL(irecv
                         (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
                          MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
    }
    for ( nsreqs = 0, si =  (rank + size - 1) % size; nreqs < 2 * total_reqs;
          si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
        error =
            MCA_PML_CALL(isend
                         (psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
                          MCA_COLL_BASE_TAG_ALLTOALL,
                          MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
    }

    /* Wait for requests to complete */
    if (nreqs == 2 * (size - 1)) {
        /* Optimization for the case when all requests have been posted  */
        error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }

    } else {
        /* As requests complete, replace them with corresponding requests:
           - wait for any request to complete, mark the request as
           MPI_REQUEST_NULL
           - If it was a receive request, replace it with new irecv request
           (if any)
           - if it was a send request, replace it with new isend request (if any)
        */
        int ncreqs = 0;
        while (ncreqs < 2 * (size - 1)) {
            int completed;
            error = ompi_request_wait_any(2 * total_reqs, reqs, &completed,
                                          MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
            reqs[completed] = MPI_REQUEST_NULL;
            ncreqs++;
            if (completed < total_reqs) {
                if (nrreqs < (size - 1)) {
                    error =
                        MCA_PML_CALL(irecv
                                     (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
                                      MCA_COLL_BASE_TAG_ALLTOALL, comm,
                                      &reqs[completed]));
                    if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
                    ++nrreqs;
                    ri = (ri + 1) % size;
                }
            } else {
                if (nsreqs < (size - 1)) {
                    error = MCA_PML_CALL(isend
                                         (psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
                                          MCA_COLL_BASE_TAG_ALLTOALL,
                                          MCA_PML_BASE_SEND_STANDARD, comm,
                                          &reqs[completed]));
                    if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
                    ++nsreqs;
                    si = (si + size - 1) % size;
                }
            }
        }
    }

    /* All done */
    return MPI_SUCCESS;

 error_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
                 rank));
    ompi_coll_base_free_reqs(reqs, 2 * total_reqs);
    return error;
}
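A hedged usage sketch at the public API level: a plain MPI_Alltoall call, which implementations such as the linear-sync routine above may service. Buffer contents are illustrative.

/* Hedged usage sketch: each rank i sends i*100 + j to rank j. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i;
    int *sendbuf, *recvbuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    sendbuf = malloc(size * sizeof(int));
    recvbuf = malloc(size * sizeof(int));
    for (i = 0; i < size; ++i) {
        sendbuf[i] = rank * 100 + i;   /* one int destined for each rank */
    }

    MPI_Alltoall(sendbuf, 1, MPI_INT, recvbuf, 1, MPI_INT, MPI_COMM_WORLD);

    for (i = 0; i < size; ++i) {
        printf("rank %d got %d from rank %d\n", rank, recvbuf[i], i);
    }

    free(sendbuf);
    free(recvbuf);
    MPI_Finalize();
    return 0;
}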
Example #22
int ompi_mpi_finalize(void)
{
    int ret;
    static int32_t finalize_has_already_started = 0;
    opal_list_item_t *item;
    ompi_proc_t** procs;
    size_t nprocs;
    OPAL_TIMING_DECLARE(tm);
    OPAL_TIMING_INIT(&tm);


    /* Be a bit social if an erroneous program calls MPI_FINALIZE in
       two different threads, otherwise we may deadlock in
       ompi_comm_free() (or run into other nasty lions, tigers, or
       bears) */

    if (! opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)) {
        /* Note that if we're already finalized, we cannot raise an
           MPI exception.  The best that we can do is write something
           to stderr. */
        char hostname[MAXHOSTNAMELEN];
        pid_t pid = getpid();
        gethostname(hostname, sizeof(hostname));

        opal_show_help("help-mpi-runtime.txt",
                       "mpi_finalize:invoked_multiple_times",
                       true, hostname, pid);
        return MPI_ERR_OTHER;
    }

    ompi_mpiext_fini();

    /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing
       anything else in MPI_FINALIZE (to include setting up such that
       MPI_FINALIZED will return true). */

    if (NULL != ompi_mpi_comm_self.comm.c_keyhash) {
        ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self,
                             ompi_mpi_comm_self.comm.c_keyhash);
        OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash);
        ompi_mpi_comm_self.comm.c_keyhash = NULL;
    }

    /* Proceed with MPI_FINALIZE */

    ompi_mpi_finalized = true;

    /* As finalize is the last legal MPI call, we are allowed to force the release
     * of the user buffer used for bsend, before going anywhere further.
     */
    (void)mca_pml_base_bsend_detach(NULL, NULL);

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
#endif

    /* Redo ORTE calling opal_progress_event_users_increment() during
       MPI lifetime, to get better latency when not using TCP */
    opal_progress_event_users_increment();

    /* check to see if we want timing information */
    OPAL_TIMING_EVENT((&tm,"Start barrier"));

    /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
       *all* connected processes.  This only means that all processes
       have to call it.  It does *not* mean that all connected
       processes need to synchronize (either directly or indirectly).  

       For example, it is quite easy to construct complicated
       scenarios where one job is "connected" to another job via
       transitivity, but have no direct knowledge of each other.
       Consider the following case: job A spawns job B, and job B
       later spawns job C.  A "connectedness" graph looks something
       like this:

           A <--> B <--> C

       So what are we *supposed* to do in this case?  If job A is
       still connected to B when it calls FINALIZE, should it block
       until jobs B and C also call FINALIZE?

       After lengthy discussions many times over the course of this
       project, the issue was finally decided at the Louisville Feb
       2009 meeting: no.

       Rationale:

       - "Collective" does not mean synchronizing.  It only means that
         every process calls it.  Hence, in this scenario, every
         process in A, B, and C must call FINALIZE.

       - KEY POINT: if A calls FINALIZE, then it is erroneous for B or
         C to try to communicate with A again.

       - Hence, OMPI is *correct* to only effect a barrier across each
         job's MPI_COMM_WORLD before exiting.  Specifically, if A
         calls FINALIZE long before B or C, it's *correct* if A exits
         at any time (and doesn't notify B or C that it is exiting).

       - Arguably, if B or C do try to communicate with the now-gone
         A, OMPI should try to print a nice error ("you tried to
         communicate with a job that is already gone...") instead of
         segv or other Badness.  However, that is an *extremely*
         difficult problem -- sure, it's easy for A to tell B that it
         is finalizing, but how can A tell C?  A doesn't even know
         about C.  You'd need to construct a "connected" graph in a
         distributed fashion, which is fraught with race conditions,
         etc.

      Hence, our conclusion is: OMPI is *correct* in its current
      behavior (of only doing a barrier across its own COMM_WORLD)
      before exiting.  Any problems that occur are as a result of
      erroneous MPI applications.  We *could* tighten up the erroneous
      cases and ensure that we print nice error messages / don't
      crash, but that is such a difficult problem that we decided we
      have many other, much higher priority issues to handle that deal
      with non-erroneous cases. */

    /* Wait for everyone to reach this point.  This is a grpcomm
       barrier instead of an MPI barrier for (at least) two reasons:

       1. An MPI barrier doesn't ensure that all messages have been
          transmitted before exiting (e.g., a BTL can lie and buffer a
          message without actually injecting it to the network, and
          therefore require further calls to that BTL's progress), so
          the possibility of a stranded message exists.

       2. If the MPI communication is using an unreliable transport,
          there's a problem of knowing that everyone has *left* the
          barrier.  E.g., one proc can send its ACK to the barrier
          message to a peer and then leave the barrier, but the ACK
          can get lost and therefore the peer is left in the barrier.

       Point #1 has been known for a long time; point #2 emerged after
       we added the first unreliable BTL to Open MPI and fixed the
       del_procs behavior around May of 2014 (see
       https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
       more details). */
    opal_pmix.fence(NULL, 0);

    /* check for timing request - get stop time and report elapsed
     time if so */
    OPAL_TIMING_EVENT((&tm,"Finish barrier"));
    OPAL_TIMING_REPORT(ompi_enable_timing, &tm, "MPI_Finish");
    OPAL_TIMING_RELEASE(&tm);

    /*
     * Shutdown the Checkpoint/Restart Mech.
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) {
        OMPI_ERROR_LOG(ret);
    }

    /* Shut down any bindings-specific issues: C++, F77, F90 */

    /* Remove all memory associated by MPI_REGISTER_DATAREP (per
       MPI-2:9.5.3, there is no way for an MPI application to
       *un*register datareps, but we don't want the OMPI layer causing
       memory leaks). */
    while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&ompi_registered_datareps);

    /* Remove all F90 types from the hash tables. As the OBJ_DESTRUCT will
     * call a special destructor able to release predefined types, we can
     * simply call the OBJ_DESTRUCT on the hash table and all memory will
     * be correctly released.
     */
    OBJ_DESTRUCT( &ompi_mpi_f90_integer_hashtable );
    OBJ_DESTRUCT( &ompi_mpi_f90_real_hashtable );
    OBJ_DESTRUCT( &ompi_mpi_f90_complex_hashtable );

    /* Free communication objects */

    /* free file resources */
    if (OMPI_SUCCESS != (ret = ompi_file_finalize())) {
        return ret;
    }

    /* free window resources */
    if (OMPI_SUCCESS != (ret = ompi_win_finalize())) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) {
        return ret;
    }

    /* free communicator resources. this MUST come before finalizing the PML
     * as this will call into the pml */
    if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) {
        return ret;
    }

    nprocs = 0;
    procs = ompi_proc_world(&nprocs);
    MCA_PML_CALL(del_procs(procs, nprocs));
    free(procs);

    /* free pml resource */ 
    if(OMPI_SUCCESS != (ret = mca_pml_base_finalize())) { 
      return ret;
    }

    /* free requests */
    if (OMPI_SUCCESS != (ret = ompi_request_finalize())) {
        return ret;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_finalize())) {
        return ret;
    }

    /* If requested, print out a list of memory allocated by ALLOC_MEM
       but not freed by FREE_MEM */
    if (0 != ompi_debug_show_mpi_alloc_mem_leaks) {
        mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks);
    }

    /* Now that all MPI objects dealing with communications are gone,
       shut down MCA types having to do with communications */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }

    /* shut down buffered send code */
    mca_pml_base_bsend_fini();

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Shutdown the CRCP Framework, must happen after PML shutdown
     */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }
#endif

    /* Free secondary resources */

    /* free attr resources */
    if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) {
        return ret;
    }

    /* free group resources */
    if (OMPI_SUCCESS != (ret = ompi_group_finalize())) {
        return ret;
    }

    /* free proc resources */
    if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) {
        return ret;
    }
    
    /* finalize the pubsub functions */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pubsub_base_framework) ) ) {
        return ret;
    }
    
    /* finalize the DPM framework */
    if ( OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_dpm_base_framework))) {
        return ret;
    }
    
    /* free internal error resources */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) {
        return ret;
    }
     
    /* free error code resources */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) {
        return ret;
    }

    /* free errhandler resources */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) {
        return ret;
    }

    /* Free all other resources */

    /* free op resources */
    if (OMPI_SUCCESS != (ret = ompi_op_finalize())) {
        return ret;
    }

    /* free ddt resources */
    if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) {
        return ret;
    }

    /* free info resources */
    if (OMPI_SUCCESS != (ret = ompi_info_finalize())) {
        return ret;
    }

    /* Close down MCA modules */

    /* io is opened lazily, so it's only necessary to close it if it
       was actually opened */
    if (0 < ompi_io_base_framework.framework_refcnt) {
        /* May have been "opened" multiple times. We want it closed now */
        ompi_io_base_framework.framework_refcnt = 1;

        if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_io_base_framework))) {
            return ret;
        }
    }
    (void) mca_base_framework_close(&ompi_topo_base_framework);
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) {
        return ret;
    }

    if (NULL != ompi_mpi_main_thread) {
        OBJ_RELEASE(ompi_mpi_main_thread);
        ompi_mpi_main_thread = NULL;
    }

    /* Leave the RTE */

    if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
        return ret;
    }
    ompi_rte_initialized = false;

    /* now close the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }

    if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
        return ret;
    }

    /* All done */

    return MPI_SUCCESS;
}
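A minimal user-level sketch of the finalize contract discussed in the comments above: MPI_Finalize is called exactly once and MPI_Finalized may be queried afterwards. This is illustrative usage, not part of the Open MPI internals.

/* Hedged usage sketch: the user-facing side of MPI_Finalize. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int flag;

    MPI_Init(&argc, &argv);

    /* ... application communication happens here ... */

    MPI_Finalize();          /* must be called by every connected process */

    MPI_Finalized(&flag);    /* legal after finalize; further communication is not */
    if (flag) {
        printf("MPI has been finalized; no further MPI communication is allowed\n");
    }
    return 0;
}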
Example #23
int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void* rbuf, int rcount,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    int i, rank, size, err, nreqs, line;
    char *psnd, *prcv;
    MPI_Aint lb, sndinc, rcvinc;
    ompi_request_t **req, **sreq, **rreq;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank));

    err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
    if (OMPI_SUCCESS != err) {
        return err;
    }
    sndinc *= scount;

    err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc);
    if (OMPI_SUCCESS != err) {
        return err;
    }
    rcvinc *= rcount;

    /* simple optimization */

    psnd = ((char *) sbuf) + (ptrdiff_t)rank * sndinc;
    prcv = ((char *) rbuf) + (ptrdiff_t)rank * rcvinc;

    err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype);
    if (MPI_SUCCESS != err) {
        return err;
    }

    /* If only one process, we're done. */

    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Initiate all send/recv to/from others. */

    req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2);
    if (NULL == req) { return OMPI_ERR_OUT_OF_RESOURCE; }

    prcv = (char *) rbuf;
    psnd = (char *) sbuf;

    /* Post all receives first -- a simple optimization */

    for (nreqs = 0, i = (rank + 1) % size; i != rank;
         i = (i + 1) % size, ++rreq, ++nreqs) {
        err = MCA_PML_CALL(irecv_init
                           (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
                           MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Now post all sends in reverse order
       - We would like to minimize the search time through the message queue
         when messages actually arrive in the order in which they were posted.
     */
    sreq = rreq;
    for (i = (rank + size - 1) % size; i != rank;
         i = (i + size - 1) % size, ++sreq, ++nreqs) {
        err = MCA_PML_CALL(isend_init
                           (psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
                           MCA_COLL_BASE_TAG_ALLTOALL,
                           MCA_PML_BASE_SEND_STANDARD, comm, sreq));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Start your engines.  This will never return an error. */

    MCA_PML_CALL(start(nreqs, req));

    /* Wait for them all.  If there's an error, note that we don't
     * care what the error was -- just that there *was* an error.  The
     * PML will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return
     * the error after we free everything. */

    err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);

 err_hndl:
    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                      __FILE__, line, err, rank) );
    }
    /* Free the reqs in all cases as they are persistent requests */
    ompi_coll_base_free_reqs(req, nreqs);

    /* All done */
    return err;
}
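A hedged sketch of the persistent-request pattern this routine relies on (irecv_init/isend_init, start, wait, free), written with the public equivalents MPI_Recv_init, MPI_Send_init, MPI_Startall and MPI_Waitall for a simple ring exchange; the exchange itself is illustrative.

/* Hedged sketch: persistent requests for a ring exchange. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, left, right, sendval, recvval;
    MPI_Request reqs[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    left  = (rank + size - 1) % size;
    right = (rank + 1) % size;
    sendval = rank;

    /* Set up the requests once ... */
    MPI_Recv_init(&recvval, 1, MPI_INT, left, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Send_init(&sendval, 1, MPI_INT, right, 0, MPI_COMM_WORLD, &reqs[1]);

    /* ... then start and complete them (this could sit inside a loop). */
    MPI_Startall(2, reqs);
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

    printf("rank %d received %d from rank %d\n", rank, recvval, left);

    /* Persistent requests must be freed explicitly. */
    MPI_Request_free(&reqs[0]);
    MPI_Request_free(&reqs[1]);

    MPI_Finalize();
    return 0;
}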
Example #24
static int bootstrap_comm(ompi_communicator_t *comm,
                          mca_coll_sm_module_t *module)
{
    int i;
    char *shortpath, *fullpath;
    mca_coll_sm_component_t *c = &mca_coll_sm_component;
    mca_coll_sm_comm_t *data = module->sm_comm_data;
    int comm_size = ompi_comm_size(comm);
    int num_segments = c->sm_comm_num_segments;
    int num_in_use = c->sm_comm_num_in_use_flags;
    int frag_size = c->sm_fragment_size;
    int control_size = c->sm_control_size;
    ompi_process_name_t *lowest_name = NULL;
    size_t size;
    ompi_proc_t *proc;

    /* Make the rendezvous filename for this communicator's shmem data
       segment.  The CID is not guaranteed to be unique among all
       procs on this node, so also pair it with the PID of the proc
       with the lowest ORTE name to form a unique filename. */
    proc = ompi_group_peer_lookup(comm->c_local_group, 0);
    lowest_name = OMPI_CAST_RTE_NAME(&proc->super.proc_name);
    for (i = 1; i < comm_size; ++i) {
        proc = ompi_group_peer_lookup(comm->c_local_group, i);
        if (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                          OMPI_CAST_RTE_NAME(&proc->super.proc_name),
                                          lowest_name) < 0) {
            lowest_name = OMPI_CAST_RTE_NAME(&proc->super.proc_name);
        }
    }
    asprintf(&shortpath, "coll-sm-cid-%d-name-%s.mmap", comm->c_contextid,
             OMPI_NAME_PRINT(lowest_name));
    if (NULL == shortpath) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): asprintf failed",
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fullpath = opal_os_path(false, ompi_process_info.job_session_dir,
                            shortpath, NULL);
    free(shortpath);
    if (NULL == fullpath) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): opal_os_path failed",
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Calculate how much space we need in the per-communicator shmem
       data segment.  There are several values to add:

       - size of the barrier data (2 of these):
           - fan-in data (num_procs * control_size)
           - fan-out data (num_procs * control_size)
       - size of the "in use" buffers:
           - num_in_use_buffers * control_size
       - size of the message fragment area (one for each segment):
           - control (num_procs * control_size)
           - fragment data (num_procs * (frag_size))

       So it's:

           barrier: 2 * control_size + 2 * control_size
           in use:  num_in_use * control_size
           control: num_segments * (num_procs * control_size * 2 +
                                    num_procs * control_size)
           message: num_segments * (num_procs * frag_size)
     */

    size = 4 * control_size +
        (num_in_use * control_size) +
        (num_segments * (comm_size * control_size * 2)) +
        (num_segments * (comm_size * frag_size));
    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:sm:enable:bootstrap comm (%d/%s): attaching to %" PRIsize_t " byte mmap: %s",
                        comm->c_contextid, comm->c_name, size, fullpath);
    if (0 == ompi_comm_rank (comm)) {
        data->sm_bootstrap_meta = mca_common_sm_module_create_and_attach (size, fullpath, sizeof(mca_common_sm_seg_header_t), 8);
        if (NULL == data->sm_bootstrap_meta) {
            opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                "coll:sm:enable:bootstrap comm (%d/%s): mca_common_sm_init_group failed",
                comm->c_contextid, comm->c_name);
            free(fullpath);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        for (int i = 1 ; i < ompi_comm_size (comm) ; ++i) {
            MCA_PML_CALL(send(&data->sm_bootstrap_meta->shmem_ds, sizeof (data->sm_bootstrap_meta->shmem_ds), MPI_BYTE,
                         i, MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm));
        }
    } else {
        opal_shmem_ds_t shmem_ds;
        MCA_PML_CALL(recv(&shmem_ds, sizeof (shmem_ds), MPI_BYTE, 0, MCA_COLL_BASE_TAG_BCAST, comm, MPI_STATUS_IGNORE));
        data->sm_bootstrap_meta = mca_common_sm_module_attach (&shmem_ds, sizeof(mca_common_sm_seg_header_t), 8);
    }

    /* All done */
    free(fullpath);
    return OMPI_SUCCESS;
}
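A worked version of the segment-size arithmetic above, with purely illustrative numbers (assumptions, not Open MPI defaults):

/* Hedged sketch: the mmap size formula with made-up inputs. */
#include <stdio.h>

int main(void)
{
    size_t control_size = 4096;   /* stand-in for c->sm_control_size          */
    size_t frag_size    = 8192;   /* stand-in for c->sm_fragment_size         */
    int    num_in_use   = 2;      /* stand-in for c->sm_comm_num_in_use_flags */
    int    num_segments = 8;      /* stand-in for c->sm_comm_num_segments     */
    int    comm_size    = 4;      /* ranks on this node (assumed)             */

    size_t size = 4 * control_size +
        (num_in_use * control_size) +
        (num_segments * (comm_size * control_size * 2)) +
        (num_segments * (comm_size * frag_size));

    /* 4*4096 + 2*4096 + 8*(4*4096*2) + 8*(4*8192) = 548864 bytes */
    printf("mmap size = %zu bytes\n", size);
    return 0;
}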
Example #25
/* create the initial fragment, pack header, datatype, and payload (if
   size fits) and send */
int
ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module,
                            ompi_osc_pt2pt_sendreq_t *sendreq)
{
    int ret = OMPI_SUCCESS;
    opal_free_list_item_t *item;
    ompi_osc_pt2pt_send_header_t *header = NULL;
    ompi_osc_pt2pt_buffer_t *buffer = NULL;
    size_t written_data = 0;
    size_t needed_len = sizeof(ompi_osc_pt2pt_send_header_t);
    const void *packed_ddt;
    size_t packed_ddt_len = ompi_ddt_pack_description_length(sendreq->req_target_datatype);

    /* we always need to send the ddt */
    needed_len += packed_ddt_len;
    if (OMPI_OSC_PT2PT_GET != sendreq->req_type) {
        needed_len += sendreq->req_origin_bytes_packed;
    }

    /* Get a buffer */
    OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers,
                       item, ret);
    if (NULL == item) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    buffer = (ompi_osc_pt2pt_buffer_t*) item;

    /* verify at least enough space for header */
    if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_send_header_t)) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* setup buffer */
    buffer->cbfunc = ompi_osc_pt2pt_sendreq_send_cb;
    buffer->cbdata = (void*) sendreq;

    /* pack header */
    header = (ompi_osc_pt2pt_send_header_t*) buffer->payload;
    written_data += sizeof(ompi_osc_pt2pt_send_header_t);
    header->hdr_base.hdr_flags = 0;
    header->hdr_windx = sendreq->req_module->p2p_comm->c_contextid;
    header->hdr_origin = sendreq->req_module->p2p_comm->c_my_rank;
    header->hdr_origin_sendreq.pval = (void*) sendreq;
    header->hdr_origin_tag = 0;
    header->hdr_target_disp = sendreq->req_target_disp;
    header->hdr_target_count = sendreq->req_target_count;

    switch (sendreq->req_type) {
    case OMPI_OSC_PT2PT_PUT:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_PUT;
#if OMPI_ENABLE_MEM_DEBUG
        header->hdr_target_op = 0;
#endif
        break;

    case OMPI_OSC_PT2PT_ACC:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_ACC;
        header->hdr_target_op = sendreq->req_op_id;
        break;

    case OMPI_OSC_PT2PT_GET:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_GET;
#if OMPI_ENABLE_MEM_DEBUG
        header->hdr_target_op = 0;
#endif
        break;
    }

    /* Set datatype id and / or pack datatype */
    ret = ompi_ddt_get_pack_description(sendreq->req_target_datatype, &packed_ddt);
    if (OMPI_SUCCESS != ret) goto cleanup;
    memcpy((unsigned char*) buffer->payload + written_data,
           packed_ddt, packed_ddt_len);
    written_data += packed_ddt_len;

    if (OMPI_OSC_PT2PT_GET != sendreq->req_type) {
        /* if sending data and it fits, pack payload */
        if (mca_osc_pt2pt_component.p2p_c_eager_size >=
            written_data + sendreq->req_origin_bytes_packed) {
            struct iovec iov;
            uint32_t iov_count = 1;
            size_t max_data = sendreq->req_origin_bytes_packed;

            iov.iov_len = max_data;
            iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data);

            ret = ompi_convertor_pack(&sendreq->req_origin_convertor, &iov, &iov_count,
                                      &max_data );
            if (ret < 0) {
                ret = OMPI_ERR_FATAL;
                goto cleanup;
            }

            assert(max_data == sendreq->req_origin_bytes_packed);
            written_data += max_data;

            header->hdr_msg_length = sendreq->req_origin_bytes_packed;
        } else {
            header->hdr_msg_length = 0;
            header->hdr_origin_tag = create_send_tag(module);
        }
    } else {
        header->hdr_msg_length = 0;
    }

    buffer->len = written_data;

#ifdef WORDS_BIGENDIAN
    header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    if (sendreq->req_target_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
        header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
        OMPI_OSC_PT2PT_SEND_HDR_HTON(*header);
    }
#endif

    /* send fragment */
    opal_output_verbose(51, ompi_osc_base_output,
                        "%d sending sendreq to %d",
                        sendreq->req_module->p2p_comm->c_my_rank,
                        sendreq->req_target_rank);

    ret = MCA_PML_CALL(isend(buffer->payload,
                             buffer->len,
                             MPI_BYTE,
                             sendreq->req_target_rank,
                             -200,
                             MCA_PML_BASE_SEND_STANDARD,
                             module->p2p_comm,
                             &buffer->request));
    opal_list_append(&module->p2p_pending_control_sends, 
                     &buffer->super.super);
    goto done;

 cleanup:
    if (item != NULL) {
        OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers,
                              item);
    }

 done:
    return ret;
}
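A self-contained schematic of the eager-size decision above: pack the payload inline when header plus payload fit in the eager buffer, otherwise send only the header and mark the payload for a separate tagged send. Everything here (toy_header, EAGER_SIZE, pack_fragment) is invented for illustration and is not the component's real layout.

/* Hedged schematic of the "fits in eager buffer?" decision. */
#include <stdio.h>
#include <string.h>

#define EAGER_SIZE 64   /* stand-in for p2p_c_eager_size */

struct toy_header {
    size_t msg_length;   /* payload bytes carried inline, 0 if deferred */
    int    origin_tag;   /* non-zero when a follow-up send is required  */
};

static size_t pack_fragment(unsigned char *buffer, const void *payload,
                            size_t payload_len, int next_tag)
{
    struct toy_header hdr = { 0, 0 };
    size_t written = sizeof(hdr);

    if (written + payload_len <= EAGER_SIZE) {
        memcpy(buffer + written, payload, payload_len);  /* inline payload */
        written += payload_len;
        hdr.msg_length = payload_len;
    } else {
        hdr.origin_tag = next_tag;   /* payload will follow in its own send */
    }
    memcpy(buffer, &hdr, sizeof(hdr));
    return written;                  /* bytes to hand to the isend */
}

int main(void)
{
    unsigned char buf[EAGER_SIZE];
    char small[16] = "small payload";
    char big[128]  = {0};

    printf("small: %zu bytes on the wire\n", pack_fragment(buf, small, sizeof(small), 42));
    printf("big:   %zu bytes on the wire\n", pack_fragment(buf, big, sizeof(big), 43));
    return 0;
}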
Example #26
/*
 *	allgather_inter
 *
 *	Function:	- allgather using other MPI collections
 *	Accepts:	- same as MPI_Allgather()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgather_inter(void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i;
    char *tmpbuf = NULL, *ptmp;
    ptrdiff_t rlb, slb, rextent, sextent, incr;
    ompi_request_t *req;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in the remote group (simultaneously executed,
     *   that's why we cannot use coll_gather).
     * - exchange the temp-results between the two roots
     * - inter-bcast (again simultaneous).
     */

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }
        err = ompi_datatype_get_extent(sdtype, &slb, &sextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Step 2: exchange the results between the root processes */
        tmpbuf = (char *) malloc(scount * size * sextent);
        if (NULL == tmpbuf) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }


    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we cannot
     * use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

    } else {
        /* Send the data to every other process in the remote group
         * except rank zero, which has it already. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) {
                goto exit;
            }

        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    return err;
}
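A hedged usage sketch of an inter-communicator MPI_Allgather, the public operation routines like the one above implement. The world is split into even and odd ranks and each process gathers one int from every member of the remote group; it requires at least two processes, and the split scheme and tag value are illustrative assumptions.

/* Hedged usage sketch: MPI_Allgather across an inter-communicator. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, color, remote_leader, rsize, i, val;
    int *remote_vals;
    MPI_Comm local, inter;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    color = rank % 2;                       /* two local groups: evens and odds */
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &local);

    /* World rank of the other group's leader: 1 for evens, 0 for odds
     * (needs at least 2 processes). */
    remote_leader = (0 == color) ? 1 : 0;
    MPI_Intercomm_create(local, 0, MPI_COMM_WORLD, remote_leader, 99, &inter);

    MPI_Comm_remote_size(inter, &rsize);
    remote_vals = malloc(rsize * sizeof(int));

    val = rank;
    /* In the inter-communicator case every process receives the data
     * contributed by the remote group, not by its own group. */
    MPI_Allgather(&val, 1, MPI_INT, remote_vals, 1, MPI_INT, inter);

    for (i = 0; i < rsize; ++i) {
        printf("world rank %d: remote[%d] = %d\n", rank, i, remote_vals[i]);
    }

    free(remote_vals);
    MPI_Comm_free(&inter);
    MPI_Comm_free(&local);
    MPI_Finalize();
    return 0;
}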
Example #27
int
ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
                            ompi_proc_t *proc,
                            uint8_t type, int32_t value0, int32_t value1)
{
    int ret = OMPI_SUCCESS;
    opal_free_list_item_t *item;
    ompi_osc_pt2pt_buffer_t *buffer = NULL;
    ompi_osc_pt2pt_control_header_t *header = NULL;
    int rank = -1, i;

    /* find the rank */
    for (i = 0 ; i < module->p2p_comm->c_remote_group->grp_proc_count ; ++i) {
        if (proc == module->p2p_comm->c_remote_group->grp_proc_pointers[i]) {
            rank = i;
        }
    }

    /* Get a buffer */
    OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers,
                       item, ret);
    if (NULL == item) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    buffer = (ompi_osc_pt2pt_buffer_t*) item;

    /* verify at least enough space for header */
    if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_control_header_t)) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* setup buffer */
    buffer->cbfunc = ompi_osc_pt2pt_control_send_cb;
    buffer->cbdata = NULL;
    buffer->len = sizeof(ompi_osc_pt2pt_control_header_t);

    /* pack header */
    header = (ompi_osc_pt2pt_control_header_t*) buffer->payload;
    header->hdr_base.hdr_type = type;
    header->hdr_base.hdr_flags = 0;
    header->hdr_value[0] = value0;
    header->hdr_value[1] = value1;
    header->hdr_windx = module->p2p_comm->c_contextid;

#ifdef WORDS_BIGENDIAN
    header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    if (proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
        header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
        OMPI_OSC_PT2PT_CONTROL_HDR_HTON(*header);
    }
#endif

    /* send fragment */
    ret = MCA_PML_CALL(isend(buffer->payload,
                             buffer->len,
                             MPI_BYTE,
                             rank,
                             -200,
                             MCA_PML_BASE_SEND_STANDARD,
                             module->p2p_comm,
                             &buffer->request));
    opal_list_append(&module->p2p_pending_control_sends, 
                     &buffer->super.super);
    goto done;

 cleanup:
    if (item != NULL) {
        OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers,
                              item);
    }

 done:
    return ret;
}
Example #28
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int res;
  char* ptr;
  MPI_Request *tmp;
  NBC_Fn_type type;
  NBC_Args_send     sendargs;
  NBC_Args_recv     recvargs;
  NBC_Args_op         opargs;
  NBC_Args_copy     copyargs;
  NBC_Args_unpack unpackargs;
  void *buf1,  *buf2;

  /* get round-schedule address */
  ptr = handle->schedule->data + handle->row_offset;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", handle->row_offset, num);

  for (int i = 0 ; i < num ; ++i) {
    int offset = (intptr_t)(ptr - handle->schedule->data);

    memcpy (&type, ptr, sizeof (type));
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", offset);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf,
                  sendargs.count, sendargs.datatype, sendargs.dest, handle->tag);
        /* get an additional request */
        handle->req_count++;
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=(void *)sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag,
                                 MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count - 1));
        if (OMPI_SUCCESS != res) {
          NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count,
                     sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", offset);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count,
                  recvargs.datatype, recvargs.source, handle->tag);
        /* get an additional request - TODO: req_count NOT thread safe */
        handle->req_count++;
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count-1));
        if (OMPI_SUCCESS != res) {
          NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count,
                    recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, "  OP2  (offset %li) ", offset);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
                  opargs.count, opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=(void *)opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n",
                  (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype,
                  (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype,
                        handle->comm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
        break;
      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf,
                  unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if (OMPI_SUCCESS != res) {
          NBC_Error ("NBC_Unpack() failed (code: %i)", res);
          return res;
        }

        break;
      default:
        NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset);
        return OMPI_ERROR;
    }
  }

  /* check whether we can make progress - but not in the first round; skipping it
   * there lets us leave the initialization faster and achieve more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if (handle->row_offset) {
    res = NBC_Progress(handle);
    if ((NBC_OK != res) && (NBC_CONTINUE != res)) {
      return OMPI_ERROR;
    }
  }

  return OMPI_SUCCESS;
}
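
The RECV, OP, COPY, and UNPACK handlers above all share one idiom: a schedule entry carries either an absolute user pointer or, when its tmpbuf flag is set, a byte offset that has to be added to the handle's temporary buffer. The standalone sketch below (all names hypothetical, not part of the libNBC code above) shows that resolution step in isolation.

#include <stdio.h>

/* hypothetical stand-in for one schedule argument */
struct sched_arg {
    void *buf;    /* absolute pointer, or a byte offset cast to a pointer */
    int   tmpbuf; /* nonzero: interpret buf as an offset into the temporary buffer */
};

/* resolve the buffer the same way the handlers above do:
 * tmpbuf set   -> base of the temporary buffer plus the stored offset
 * tmpbuf unset -> the stored pointer itself */
static void *resolve_buffer(char *tmp, const struct sched_arg *arg)
{
    if (arg->tmpbuf) {
        return tmp + (long)arg->buf;
    }
    return arg->buf;
}

int main(void)
{
    char tmp[64];
    struct sched_arg off_arg  = { (void *)(long)16, 1 }; /* offset 16 into tmp */
    struct sched_arg user_arg = { tmp, 0 };              /* plain user pointer */

    printf("offset arg resolves to tmp+16: %s\n",
           resolve_buffer(tmp, &off_arg) == (void *)(tmp + 16) ? "yes" : "no");
    printf("user arg passes through unchanged: %s\n",
           resolve_buffer(tmp, &user_arg) == (void *)tmp ? "yes" : "no");
    return 0;
}
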
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff, 
					       int count,
					       struct ompi_datatype_t *datatype, 
					       int root,
					       struct ompi_communicator_t *comm, 
					       mca_coll_base_module_t *module, 
                                               int segsize )
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
    int llrank=MPI_UNDEFINED, llsize=0, rank=0, ret=OMPI_SUCCESS;
    int lsize=0, lrank=MPI_UNDEFINED;
    MPI_Aint ub=0, typeext=0;
    size_t typesize=0;
    int i, realsegsize=0, remaining_count=0;
    int num_segments=0, segcount=0, segindex=0;
    char* tmpbuf = (char *) buff;
    ompi_request_t **sreq=NULL, **sreq1=NULL;
    ompi_request_t *rreq=MPI_REQUEST_NULL, *rreq1=MPI_REQUEST_NULL;

    rank   = ompi_comm_rank ( comm );
    lcomm  = hierarch_module->hier_lcomm;
    
    if ( mca_coll_hierarch_verbose_param ) { 
	printf("%s:%d: executing hierarchical seg3 bcast with cnt=%d root=%d segsize=%d\n",
	       comm->c_name, rank, count, root, segsize );
    } 

    /*
     * This function returns the local leader communicator,
     * which *always* contains the root of this operation.
     * This might involve creating a new communicator, which is
     * also the reason that *every* process in comm has to call
     * this function.
     */
    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);

    ompi_datatype_type_size ( datatype, &typesize);
    ompi_datatype_get_extent ( datatype, &ub, &typeext);

    /* Determine number of segments and number of elements per segment */
    if ((typesize > 0) && (segsize % typesize != 0)) {
	/* segment size must be a multiple of typesize */
	segsize = typesize * (segsize / typesize);
    }
    if ((segsize == 0) || (count == 0) || (typesize == 0)) {
	segcount = count;
	num_segments = 1;
    } else {
	segcount = segsize/typesize;
	num_segments = count/segcount;
	if ( (count % segcount) != 0 ) num_segments++;
	if (num_segments == 1) segcount = count;
    }
    realsegsize = segcount*typeext;
    remaining_count = segcount;
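    /* Worked example (hypothetical numbers): with typesize = 4, segsize = 8192 and
     * count = 5000 elements, segcount = 8192/4 = 2048, num_segments = ceil(5000/2048) = 3
     * and realsegsize = 2048*typeext bytes; the final round then handles the
     * remaining 5000 - 2*2048 = 904 elements. */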

    if ( MPI_COMM_NULL != lcomm ) {
        lsize = ompi_comm_size ( lcomm );
        lrank = ompi_comm_rank ( lcomm );	
        sreq1 = (ompi_request_t **)malloc ( lsize * sizeof(ompi_request_t *));
        if ( NULL == sreq1 ) {
	    return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for(i=0; i<lsize; i++) {
	    sreq1[i] = MPI_REQUEST_NULL;
        }
    }

    if ( MPI_COMM_NULL != llcomm ) {
        llsize = ompi_comm_size (llcomm);
        llrank = ompi_comm_rank ( llcomm );
    
        sreq  = hierarch_module->hier_reqs;
	for(i=0; i<llsize; i++) {
	    sreq[i] = MPI_REQUEST_NULL;
	}
    }

    
    /* Broadcasting the first segment in the upper level*/
    if ( MPI_UNDEFINED != llroot ) {
	ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype, 
					llroot, llcomm, 
					llcomm->c_coll.coll_bcast_module);
	if ( OMPI_SUCCESS != ret ) {
	    goto exit;
	}
    }
    
    for (segindex = 1; segindex < num_segments; segindex++) {
	/* determine how many elements are being sent in this round */
	if( segindex == (num_segments - 1) ) {
	    remaining_count = count - segindex*segcount;
	}
	tmpbuf += realsegsize;
	
	/* Broadcasting the next segment in the upper level*/
	if ( MPI_COMM_NULL != llcomm ) {
	    if(llrank == llroot) {
		for(i = 0; i < llsize; i++) {
		    if( i != llroot) {
			ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
						 MCA_COLL_BASE_TAG_BCAST, 
						 MCA_PML_BASE_SEND_STANDARD,
						 llcomm, (sreq+i) ));
			if ( OMPI_SUCCESS != ret ) {
			    goto exit;
			}
		    }
		}
	    }
            else {
                ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot,
                                         MCA_COLL_BASE_TAG_BCAST, 
                                         llcomm, &rreq ));
                if ( OMPI_SUCCESS != ret ) {
                    goto exit;
                }
            }
        }
	
        /* broadcast the previous segment among the lower-level processes:
         * once the local leaders have received the data from the root, they
         * distribute it to the processes in their local, low-level communicator.
         */
	if ( MPI_COMM_NULL != lcomm ) {
	    if( lrank == lroot) {
		for( i = 0; i < lsize; i++) {
		    if( i != lroot) {
			ret = MCA_PML_CALL(isend(tmpbuf-realsegsize, segcount, datatype, i,
						 MCA_COLL_BASE_TAG_BCAST,
						 MCA_PML_BASE_SEND_STANDARD,
						 lcomm, (sreq1+i) ));
			if ( OMPI_SUCCESS != ret ) {
			    goto exit;
			}
		    }
		}
	    }
	    else {
		ret = MCA_PML_CALL(irecv(tmpbuf-realsegsize, segcount, datatype, lroot, 
					 MCA_COLL_BASE_TAG_BCAST , lcomm, &rreq1 ));
		if ( OMPI_SUCCESS != ret ) {
		    goto exit;
		}
	    }
	}
	
	/* Wait for the upper level bcast to complete*/
	if ( MPI_COMM_NULL != llcomm ) {
	    if ( llrank == llroot ) {
		ret = ompi_request_wait_all(llsize, sreq, MPI_STATUSES_IGNORE);
		if ( OMPI_SUCCESS != ret ) {
		    goto exit;
		}
	    }
	    else {
		ret = ompi_request_wait_all ( 1, &rreq, MPI_STATUS_IGNORE );
		if ( OMPI_SUCCESS != ret ) {
		    goto exit;
		}
	    }
	}
	
	/*Wait for the lower level bcast to complete */
	if ( MPI_COMM_NULL != lcomm ) {
	    if ( lrank == lroot ) {
		ret = ompi_request_wait_all(lsize, sreq1, MPI_STATUSES_IGNORE);
		if ( OMPI_SUCCESS != ret ) {
		    goto exit;
		}
	    }
	    else {
		ret = ompi_request_wait_all( 1, &rreq1, MPI_STATUS_IGNORE);
		if ( OMPI_SUCCESS != ret ) {
		    goto exit;
		}
	    }
	}	
    }
    
    /* Broadcast the last segment among the lower-level processes:
     * once the local leaders have received the data from the root, they
     * distribute it to the processes in their local, low-level communicator.
     */
    if ( MPI_COMM_NULL != lcomm ) {
        ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,  
                                       lroot, lcomm, 
                                       lcomm->c_coll.coll_bcast_module);
    }
    
exit:
    if ( NULL != sreq1 ) {
	free ( sreq1 );
    }

    return ret;
}
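
For reference, here is a much-simplified sketch of the same pipeline written against the public MPI-3 API (MPI_Ibcast) instead of OMPI-internal isend/irecv loops: while segment s moves across the local leaders, segment s-1 fans out inside each node. The function name, the leader_comm/local_comm split and the parameter set are assumptions for illustration; error handling is omitted, and for num_segments == 1 the caller is expected to pass last_count == segcount == count.

#include <mpi.h>

static void pipelined_two_level_bcast(char *buf, int num_segments, int segcount,
                                      int last_count, MPI_Aint segbytes,
                                      MPI_Datatype dtype,
                                      int leader_root, MPI_Comm leader_comm,
                                      int local_root, MPI_Comm local_comm)
{
    /* segment 0 first travels across the local leaders */
    if (MPI_COMM_NULL != leader_comm) {
        MPI_Bcast(buf, segcount, dtype, leader_root, leader_comm);
    }

    for (int s = 1; s < num_segments; s++) {
        int count = (s == num_segments - 1) ? last_count : segcount;
        MPI_Request up = MPI_REQUEST_NULL, down = MPI_REQUEST_NULL;

        /* move segment s across the leader level ... */
        if (MPI_COMM_NULL != leader_comm) {
            MPI_Ibcast(buf + s * segbytes, count, dtype, leader_root,
                       leader_comm, &up);
        }
        /* ... while segment s-1 fans out inside each node */
        if (MPI_COMM_NULL != local_comm) {
            MPI_Ibcast(buf + (s - 1) * segbytes, segcount, dtype, local_root,
                       local_comm, &down);
        }
        /* waiting on MPI_REQUEST_NULL is a no-op, so no extra guards needed */
        MPI_Wait(&up, MPI_STATUS_IGNORE);
        MPI_Wait(&down, MPI_STATUS_IGNORE);
    }

    /* the last segment still has to fan out inside each node */
    if (MPI_COMM_NULL != local_comm) {
        MPI_Bcast(buf + (num_segments - 1) * segbytes, last_count, dtype,
                  local_root, local_comm);
    }
}
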
Example #30
File: bcast.c Project: IanYXXL/A1
/**
 * Bcast - subgroup in communicator
 *  This is a very simple algorithm - binary tree, transmitting the full
 *  message at each step.
 */
OMPI_DECLSPEC int comm_bcast_pml(void *buffer, int root, int count,
        ompi_datatype_t *dtype, int my_rank_in_group,
        int n_peers, int *ranks_in_comm,ompi_communicator_t *comm)
{
    /* local variables */
    int rc=OMPI_SUCCESS,msg_cnt,i;
    ompi_request_t *requests[2];
    int node_rank, peer_rank;
    netpatterns_tree_node_t node_data;

    /*
     * shift rank to root==0 tree
     */
    node_rank=(my_rank_in_group-root+n_peers)%n_peers;

    /*
     * compute my communication pattern - binary tree
     */
    rc=netpatterns_setup_narray_tree(2, node_rank, n_peers,
            &node_data);
    if( OMPI_SUCCESS != rc ) {
        goto Error;
    }

    /* 1 process special case */
    if(1 == n_peers) {
        return OMPI_SUCCESS;
    }

    /* if I have parents - wait on the data to arrive */
    if(node_data.n_parents) {
        /* I will have only 1 parent */
        peer_rank=node_data.parent_rank;
        /* translate back from the shifted (root==0) space to the actual rank */
        peer_rank=(peer_rank+root)%n_peers;
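        /* the negated tag keeps this internal traffic out of the application's
         * tag space, since user point-to-point tags are always non-negative */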
        rc=MCA_PML_CALL(recv(buffer, count,dtype,peer_rank,
                    -OMPI_COMMON_TAG_BCAST, comm, MPI_STATUS_IGNORE));
        if( 0 > rc ) {
            goto Error;
        }
    }

    /* send the data to my children */
    msg_cnt=0;
    for(i=0 ; i < node_data.n_children ; i++ ) {
        peer_rank=node_data.children_ranks[i];
        peer_rank=(peer_rank+root)%n_peers;
        rc=MCA_PML_CALL(isend(buffer,
                    count,dtype,peer_rank,
                    -OMPI_COMMON_TAG_BCAST,MCA_PML_BASE_SEND_STANDARD,
                    comm,&(requests[msg_cnt])));
        if( 0 > rc ) {
            goto Error;
        }
        msg_cnt++;
    }
    /* wait for the isends posted above to complete */
    if(msg_cnt) {
        ompi_request_wait_all(msg_cnt,requests,MPI_STATUSES_IGNORE);
    }

    /* return */
    return OMPI_SUCCESS;

Error:
    return rc;
}
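
The only non-obvious part of the routine above is the rank shifting: the binary tree is computed as if the root were rank 0, and every peer rank coming out of the tree is shifted back before the actual send or receive is posted. A tiny standalone sketch of those two mappings (no MPI needed, names hypothetical):

#include <stdio.h>

/* rotate so that "root" becomes virtual rank 0 */
static int to_virtual(int rank, int root, int n) { return (rank - root + n) % n; }
/* inverse mapping: virtual rank back to the actual communicator rank */
static int to_actual(int vrank, int root, int n) { return (vrank + root) % n; }

int main(void)
{
    const int n = 8, root = 5;
    for (int rank = 0; rank < n; rank++) {
        int v = to_virtual(rank, root, n);
        printf("actual %d -> virtual %d -> actual %d\n",
               rank, v, to_actual(v, root, n));
    }
    return 0;
}
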