Exemplo n.º 1
0
/* Node-aware (SMP) barrier on an intracommunicator.
 *
 * Runs three phases: a barrier on the node-local communicator, a barrier on
 * the communicator of node roots, and finally a one-byte broadcast from
 * rank 0 of the node-local communicator.  A phase is skipped when the
 * corresponding sub-communicator pointer is NULL.
 *
 * A failing phase does not abort the routine: its error class is stored in
 * *errflag, the code is folded into the accumulated return value, and the
 * remaining phases are still executed.
 *
 * Returns the accumulated error code if any phase failed; otherwise a
 * "**coll_fail" code built from *errflag when that is not MPIR_ERR_NONE;
 * otherwise the status of the last phase that ran.
 */
static int barrier_smp_intra(MPID_Comm *comm_ptr, mpir_errflag_t *errflag)
{
    int mpi_errno_ret = MPI_SUCCESS;
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *local_comm = NULL;

    MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER &&
                MPIR_Comm_is_node_aware(comm_ptr));

    local_comm = comm_ptr->node_comm;

    /* phase 1: barrier among the processes of this node */
    if (NULL != local_comm) {
        mpi_errno = MPIR_Barrier_impl(local_comm, errflag);
        if (mpi_errno != MPI_SUCCESS) {
            /* remember the failure and keep going */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* phase 2: barrier among the node roots */
    if (NULL != comm_ptr->node_roots_comm) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag);
        if (mpi_errno != MPI_SUCCESS) {
            /* remember the failure and keep going */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* phase 3: let the node-local processes go.  One byte is sent because
     * a zero-byte broadcast returns without doing anything. */
    if (NULL != local_comm) {
        int release_token = 0;

        mpi_errno = MPIR_Bcast_impl(&release_token, 1, MPI_BYTE, 0, local_comm, errflag);
        if (mpi_errno != MPI_SUCCESS) {
            /* remember the failure and keep going */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

 fn_exit:
    if (mpi_errno_ret != MPI_SUCCESS) {
        mpi_errno = mpi_errno_ret;
    } else if (*errflag != MPIR_ERR_NONE) {
        MPIU_ERR_SET(mpi_errno, *errflag, "**coll_fail");
    }
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Exemplo n.º 2
0
/**
 * \brief MPI-PAMI glue for MPI_Win_create function
 *
 * Creates a window object over caller-supplied memory. The window is set up
 * by MPIDI_Win_init (flavor MPI_WIN_FLAVOR_CREATE, model MPI_WIN_UNIFIED);
 * this routine then fills in the local entry of the window's per-rank info
 * array (base address, owning window, displacement unit), calls
 * MPIDI_Win_allgather, and finishes with a barrier on \a comm_ptr.
 *
 * \param[in] base	Local window buffer
 * \param[in] size	Local window size
 * \param[in] disp_unit	Displacement unit size
 * \param[in] info	Window hints (forwarded to MPIDI_Win_init)
 * \param[in] comm_ptr	Communicator
 * \param[out] win_ptr	Window
 * \return MPI_SUCCESS, or the error code returned by MPIDI_Win_init,
 *	MPIDI_Win_allgather, or the closing barrier.
 */
int
MPID_Win_create(void       * base,
                MPI_Aint     size,
                int          disp_unit,
                MPID_Info  * info,
                MPID_Comm  * comm_ptr,
                MPID_Win  ** win_ptr)
{
  int mpi_errno  = MPI_SUCCESS;
  int rc  = MPI_SUCCESS;
  MPID_Win *win;
  size_t  rank;
  MPIDI_Win_info *winfo;

  rc=MPIDI_Win_init(size,disp_unit,win_ptr, info, comm_ptr, MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
  /* *win_ptr is only meaningful when initialization succeeded; bail out
   * instead of dereferencing it after a failure. */
  if (rc != MPI_SUCCESS)
      return rc;

  win = *win_ptr;
  win->base = base;

  /* Publish this rank's own entry before the exchange below. */
  rank = comm_ptr->rank;
  winfo = &win->mpid.info[rank];
  winfo->base_addr = base;
  winfo->win = win;
  winfo->disp_unit = disp_unit;

  rc= MPIDI_Win_allgather(size,win_ptr);
  if (rc != MPI_SUCCESS)
      return rc;

  /* NOTE(review): mpi_errno is also handed in as the barrier's error-flag
   * argument; whatever the barrier stores there is overwritten by the
   * barrier's return value. */
  mpi_errno = MPIR_Barrier_impl(comm_ptr, &mpi_errno);

  return mpi_errno;
}
Exemplo n.º 3
0
/* Hand the info object to MPIDI_Win_set_info, assert that this succeeded,
 * then run a barrier over the window's communicator and return the
 * barrier's status. */
int
MPID_Win_set_info(MPID_Win     *win, MPID_Info    *info)
{
    int mpi_errno = MPIDI_Win_set_info(win, info);

    MPID_assert(MPI_SUCCESS == mpi_errno);

    /* mpi_errno doubles as the barrier's error-flag argument; the value
     * returned to the caller is the barrier's return code. */
    return MPIR_Barrier_impl(win->comm_ptr, &mpi_errno);
}
Exemplo n.º 4
0
/**
 * \brief MPI-PAMI glue for MPI_Win_allocate function
 *
 * Creates a window object whose memory is allocated by the library. The
 * window is set up by MPIDI_Win_init (flavor MPI_WIN_FLAVOR_ALLOCATE, model
 * MPI_WIN_UNIFIED). When \a size is positive, \a size bytes are obtained
 * with MPIU_Malloc; when \a size is zero the window base is NULL; a negative
 * \a size is rejected with MPI_ERR_SIZE. The local entry of the window's
 * per-rank info array is then filled in, MPIDI_Win_allgather is called, the
 * base address is stored through \a base_ptr, and a barrier on \a comm_ptr
 * ends the call.
 *
 * MPI semantics: the returned memory is associated with the window as if
 * the user had called 'MPI_Win_create' on existing memory. The size argument
 * may differ between processes and size = 0 is valid.
 *
 * \param[in] size      size of window in bytes (nonnegative integer)
 * \param[in] disp_unit local unit size for displacements, in bytes (positive integer)
 * \param[in] info      info argument (handle)
 * \param[in] comm_ptr  Communicator (handle)
 * \param[out] base_ptr address of a pointer that receives the base address
 *                      of the window in local memory
 * \param[out] win_ptr  window object returned by the call (handle)
 * \return MPI_SUCCESS, MPI_ERR_SIZE for a negative size, MPI_ERR_BUFFER when
 *         the allocation fails in builds with MPIDI_NO_ASSERT, or the error
 *         code returned by MPIDI_Win_init, MPIDI_Win_allgather, or the
 *         closing barrier.
 */
int
MPID_Win_allocate(MPI_Aint     size,
                  int          disp_unit,
                  MPID_Info  * info,
                  MPID_Comm  * comm_ptr,
                  void *base_ptr,
                  MPID_Win  ** win_ptr)
{
  int mpi_errno  = MPI_SUCCESS;
  int rc = MPI_SUCCESS;
  mpir_errflag_t errflag = MPIR_ERR_NONE;
  void *baseP = NULL;
  static char FCNAME[] = "MPID_Win_allocate";
  MPIDI_Win_info  *winfo;
  MPID_Win   *win;
  int        rank;

  rc=MPIDI_Win_init(size,disp_unit,win_ptr, info, comm_ptr, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
  /* *win_ptr is only meaningful when initialization succeeded. */
  if (rc != MPI_SUCCESS)
      return rc;
  win = *win_ptr;

  if (size > 0) {
      baseP = MPIU_Malloc(size);
  #ifndef MPIDI_NO_ASSERT
      MPID_assert(baseP != NULL);
  #else
      MPIU_ERR_CHKANDJUMP((baseP == NULL), mpi_errno, MPI_ERR_BUFFER, "**bufnull");
  #endif

  } else if (size == 0) {
      baseP = NULL;
  } else {
      /* The check macro raises the error when its condition is TRUE, so the
       * condition must describe the invalid case (negative size). */
      MPIU_ERR_CHKANDSTMT(size < 0, mpi_errno, MPI_ERR_SIZE,
                          return mpi_errno, "**rmasize");
  }

  win->base = baseP;

  /* Publish this rank's own entry before the exchange below. */
  rank = comm_ptr->rank;
  winfo = &win->mpid.info[rank];
  winfo->base_addr = baseP;
  winfo->win = win;
  winfo->disp_unit = disp_unit;

  rc= MPIDI_Win_allgather(size,win_ptr);
  if (rc != MPI_SUCCESS)
      return rc;

  /* base_ptr is really a 'void **' passed as 'void *'. */
  *(void**) base_ptr = (void *) win->base;
  mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);

  fn_fail:
  return mpi_errno;
}
Exemplo n.º 5
0
/* Fence on a window: drive progress until the window's 'complete' counter
 * has caught up with 'total', zero the three sync counters, then run a
 * barrier over the window's communicator and return its status.  The
 * 'assert' argument is not consulted. */
int
MPID_Win_fence(int       assert,
               MPID_Win *win)
{
  struct MPIDI_Win_sync *const counters = &win->mpid.sync;
  int mpi_errno = MPI_SUCCESS;

  MPID_PROGRESS_WAIT_WHILE(counters->total != counters->complete);

  /* start the next epoch from a clean slate */
  counters->total    = 0;
  counters->started  = 0;
  counters->complete = 0;

  /* mpi_errno doubles as the barrier's error-flag argument; the value
   * returned to the caller is the barrier's return code. */
  return MPIR_Barrier_impl(win->comm_ptr, &mpi_errno);
}
Exemplo n.º 6
0
int
MPID_Win_free(MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPID_Win *win = *win_ptr;
    size_t rank = win->comm_ptr->rank;
    mpir_errflag_t errflag = MPIR_ERR_NONE;

    if(win->mpid.sync.origin_epoch_type != win->mpid.sync.target_epoch_type ||
            (win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_NONE &&
             win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_REFENCE)) {
        MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, return mpi_errno, "**rmasync");
    }

    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**mpi_bcast");

    if (win->create_flavor == MPI_WIN_FLAVOR_SHARED)
        mpi_errno=MPIDI_SHM_Win_free(win_ptr);



    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE)
        MPIU_Free(win->base);

    struct MPIDI_Win_info *winfo = &win->mpid.info[rank];
#ifdef USE_PAMI_RDMA
    if (win->size != 0)
    {
        pami_result_t rc;
Exemplo n.º 7
0
/* Node-aware (SMP) barrier on an intracommunicator.
 *
 * Common part: a barrier on comm_ptr->node_comm, a barrier on
 * comm_ptr->node_roots_comm, then a one-byte broadcast from rank 0 of
 * node_comm.  Each phase is skipped when its communicator pointer is NULL.
 * A failing phase stores its error class in *errflag, is folded into the
 * accumulated return code, and does not stop the remaining phases.
 *
 * With FINEGRAIN_MPI defined, the ranks of comm_ptr->osproc_colocated_comm
 * additionally synchronize through the shared co_barrier_vars fields before
 * and after the common part:
 *   - a non-leader increments coproclet_counter; the one that brings it to
 *     (local_size - 1) sets leader_signal; every non-leader then yields until
 *     coproclet_signal differs from the value it sampled on entry;
 *   - the leader (rank 0) yields until leader_signal becomes nonzero, runs
 *     the common part, and afterwards clears leader_signal and
 *     coproclet_counter and flips coproclet_signal.
 *
 * Returns the accumulated error code if any phase failed; otherwise a
 * "**coll_fail" code built from *errflag when that is not MPIR_ERR_NONE;
 * otherwise the status of the last phase that ran.
 */
static int barrier_smp_intra(MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag)
{
    int mpi_errno=MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;

    MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER &&
                MPIR_Comm_is_node_aware(comm_ptr));

#if defined(FINEGRAIN_MPI)
    int colocated_size = -1;
    int colocated_sense = -1;
    /* do  barrier on osproc_colocated_comm */
    if (comm_ptr->osproc_colocated_comm != NULL)
    {
        colocated_size = comm_ptr->osproc_colocated_comm->local_size;
        MPIU_Assert( (comm_ptr->osproc_colocated_comm->co_shared_vars != NULL) && (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars != NULL) );
        MPIU_Assert(colocated_size > 1 );
        /* sample the current release value; it is flipped by the leader at
           the end of the barrier */
        colocated_sense = comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal;

        if( comm_ptr->osproc_colocated_comm->rank != 0 ) { /* non-leader */
            /* NOTE(review): plain (non-atomic) increment of a shared counter;
               presumably safe because colocated ranks only switch at
               FG_Yield() -- confirm */
            (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter)++;
            if (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter == (colocated_size-1)){ /* excluding the leader */
                /* last non-leader to arrive wakes the leader */
                comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 1;
            }
            /* wait here until the leader flips coproclet_signal */
            while(comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal == colocated_sense) {
                FG_Yield();
            }
        }
        else { /* leader */
            /* wait until every non-leader has checked in */
            while(comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal == 0) {
                FG_Yield();
            }
        }

#if 0 /* Non-optimized version */
        mpi_errno = MPIR_Barrier_impl(comm_ptr->osproc_colocated_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
#endif
    }
#endif

    /* NOTE(review): released non-leaders also fall through to the phases
       below; presumably node_comm / node_roots_comm are NULL for them --
       confirm */

    /* do the intranode barrier on all nodes */
    if (comm_ptr->node_comm != NULL)
    {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* do the barrier across roots of all nodes */
    if (comm_ptr->node_roots_comm != NULL) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* release the local processes on each node with a 1-byte
       broadcast (0-byte broadcast just returns without doing
       anything) */
    if (comm_ptr->node_comm != NULL)
    {
        int i=0;
        mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

#if defined(FINEGRAIN_MPI)
    if (comm_ptr->osproc_colocated_comm != NULL)
    {
        if (comm_ptr->osproc_colocated_comm->rank == 0) { /* leader */
            /* reset the arrival state for the next barrier, then flip the
               release value the non-leaders are waiting on */
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 0;
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter = 0;
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal = 1 - comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal;
        }

#if 0 /* Non-optimized version */
        /* release the colocated processes in each OS-process with a 1-byte
           broadcast (0-byte broadcast just returns without doing
           anything) */
        int i=0;
        mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->osproc_colocated_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
#endif
    }
#endif

 fn_exit:
    /* prefer the accumulated error; otherwise report a pending error flag */
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Exemplo n.º 8
0
/*@

MPI_Barrier - Blocks until all processes in the communicator have
reached this routine.  

Input Parameters:
. comm - communicator (handle) 

Notes:
Blocks the caller until all processes in the communicator have called it; 
that is, the call returns at any process only after all members of the
communicator have entered the call.

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
@*/
/* Implementation outline: validate the handle, convert it to a communicator
 * object, validate the object, call MPIR_Barrier_impl with a fresh error
 * flag, and on any failure route the code through MPIR_Err_return_comm
 * before returning it.  The enter/exit macro pairs below bracket the whole
 * body. */
int MPI_Barrier( MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_BARRIER);

    MPIR_ERRTEST_INITIALIZED_ORDIE();
    
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_BARRIER);
    
    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    /* NOTE(review): presumably jumps to fn_fail on a bad handle --
	       the macro body is not visible here */
	    MPIR_ERRTEST_COMM(comm, mpi_errno);
	}
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr( comm, comm_ptr );
    
    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    /* Validate communicator */
            MPID_Comm_valid_ptr( comm_ptr, mpi_errno, FALSE );
            if (mpi_errno) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */

    /* errflag starts as MPIR_ERR_NONE; only the return code is inspected */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
    if (mpi_errno) goto fn_fail;
    
    /* ... end of body of routine ... */

  fn_exit:
    /* undo the enter macros in reverse order */
    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_BARRIER);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* FCNAME is not defined inside this function; presumably it is a macro
       defined just before it -- confirm */
#   ifdef HAVE_ERROR_CHECKING
    {
	mpi_errno = MPIR_Err_create_code(
	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, 
	    "**mpi_barrier", "**mpi_barrier %C", comm);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemplo n.º 9
0
/**
 * \brief Shut down the system
 *
 * Runs a barrier on MPIR_Process.comm_world (its result is not checked),
 * then tears things down in this order: optional statistics output,
 * optional dynamic-tasking cleanup, receive-queue finalization,
 * PAMIX_Finalize, release of the predefined communicators, destruction of
 * the PAMI contexts and client, and finally release of several optional
 * bookkeeping buffers. Which optional steps run depends on the
 * MPIDI_STATISTICS, DYNAMIC_TASKING, MPID_NEEDS_ICOMM_WORLD, MPIDI_TRACE,
 * OUT_OF_ORDER_HANDLING and TOKEN_FLOW_CONTROL build settings.
 *
 * Failures of the PAMI destroy calls trip MPID_assert_always; an error from
 * MPIDI_PG_Finalize is only traced.
 *
 * \return MPI_SUCCESS (always)
*/
int MPID_Finalize()
{
  pami_result_t rc;
  int mpierrno = MPI_SUCCESS;
  mpir_errflag_t errflag=MPIR_ERR_NONE;
  /* synchronize all of comm_world before tearing anything down; both the
     return code and errflag are ignored */
  MPIR_Barrier_impl(MPIR_Process.comm_world, &errflag);

#ifdef MPIDI_STATISTICS
  if (MPIDI_Process.mp_statistics) {
      MPIDI_print_statistics();
  }
  MPIDI_close_pe_extension();
#endif

#ifdef DYNAMIC_TASKING
  mpidi_finalized = 1;
  if(mpidi_dynamic_tasking) {
    /* Tell the process group code that we're done with the process groups.
       This will notify PMI (with PMI_Finalize) if necessary.  It
       also frees all PG structures, including the PG for COMM_WORLD, whose
       pointer is also saved in MPIDI_Process.my_pg */
    mpierrno = MPIDI_PG_Finalize();
    if (mpierrno) {
	/* traced only; finalization continues and still returns MPI_SUCCESS */
	TRACE_ERR("MPIDI_PG_Finalize returned with mpierrno=%d\n", mpierrno);
    }

    MPIDI_FreeParentPort();
  }
  if(_conn_info_list) 
    MPIU_Free(_conn_info_list);
  MPIDI_free_all_tranid_node();
#endif


  /* ------------------------- */
  /* shutdown request queues   */
  /* ------------------------- */
  MPIDI_Recvq_finalize();

  PAMIX_Finalize(MPIDI_Client);

#ifdef MPID_NEEDS_ICOMM_WORLD
    MPIR_Comm_release_always(MPIR_Process.icomm_world, 0);
#endif

  /* release the predefined communicators */
  MPIR_Comm_release_always(MPIR_Process.comm_self,0);
  MPIR_Comm_release_always(MPIR_Process.comm_world,0);

  /* contexts are destroyed before the client they belong to */
  rc = PAMI_Context_destroyv(MPIDI_Context, MPIDI_Process.avail_contexts);
  MPID_assert_always(rc == PAMI_SUCCESS);

  rc = PAMI_Client_destroy(&MPIDI_Client);
  MPID_assert_always(rc == PAMI_SUCCESS);

#ifdef MPIDI_TRACE
 /* free the per-task trace buffers, then the table that holds them */
 {  int i;
  for (i=0; i< MPIDI_Process.numTasks; i++) {
      if (MPIDI_Trace_buf[i].R)
          MPIU_Free(MPIDI_Trace_buf[i].R);
      if (MPIDI_Trace_buf[i].PR)
          MPIU_Free(MPIDI_Trace_buf[i].PR);
      if (MPIDI_Trace_buf[i].S)
          MPIU_Free(MPIDI_Trace_buf[i].S);
  }
 }
 MPIU_Free(MPIDI_Trace_buf);
#endif

#ifdef OUT_OF_ORDER_HANDLING
  MPIU_Free(MPIDI_In_cntr);
  MPIU_Free(MPIDI_Out_cntr);
#endif

 if (TOKEN_FLOW_CONTROL_ON)
   {
     #if TOKEN_FLOW_CONTROL
     extern char *EagerLimit;

     if (EagerLimit) MPIU_Free(EagerLimit);
     MPIU_Free(MPIDI_Token_cntr);
     MPIDI_close_mm();
     #else
     /* flow control switched on at run time without compiled-in support */
     MPID_assert_always(0);
     #endif
   }

  return MPI_SUCCESS;
}
Exemplo n.º 10
0
/* This function produces topology aware trees for reduction and broadcasts, with different
 * K values. This is a heavy-weight function as it allocates shared memory, generates topology
 * information, builds a package-level tree (for package leaders), and a per-package tree.
 * These are combined in shared memory for other ranks to read out from.
 *
 * Flow: every rank writes its CPU affinity mask into a shared segment; the root converts the
 * masks into a per-rank binding map, groups ranks by package, and generates the broadcast tree
 * into the segment; every rank copies its part out; the root then regenerates the segment for
 * the reduce tree and every rank copies that out as well. Barriers on comm_ptr separate the
 * phases.
 *
 * Parameters:
 *   comm_ptr              communicator whose ranks take part
 *   root                  rank that builds the trees
 *   bcast_k, reduce_k     K values forwarded to MPIDI_SHM_gen_tree for each tree
 *   bcast_tree,           per-rank output trees filled in by MPIDI_SHM_copy_tree
 *   reduce_tree
 *   bcast_topotree_fail,  forwarded to MPIDI_SHM_copy_tree (presumably a failure
 *   reduce_topotree_fail  indicator for the respective tree -- confirm)
 *   errflag               in/out collective error flag
 *
 * Errors in the collective steps are recorded and execution continues; the combined error code
 * is returned at the end. Allocation failures on the root jump straight to the exit path.
 * */
int MPIDI_SHM_topology_tree_init(MPIR_Comm * comm_ptr, int root, int bcast_k,
                                 MPIR_Treealgo_tree_t * bcast_tree, int *bcast_topotree_fail,
                                 int reduce_k, MPIR_Treealgo_tree_t * reduce_tree,
                                 int *reduce_topotree_fail, MPIR_Errflag_t * errflag)
{
    int *shared_region = NULL;
    MPL_shm_hnd_t fd;
    int num_ranks, rank;
    int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS;
    size_t shm_size;
    int **bind_map = NULL;
    int *max_entries_per_level = NULL;
    int **ranks_per_package = NULL;
    int *package_ctr = NULL;
    size_t topo_depth = 0;
    int package_level = 0, i, max_ranks_per_package = 0;
    bool mapfail_flag = false;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);

    num_ranks = MPIR_Comm_size(comm_ptr);
    rank = MPIR_Comm_rank(comm_ptr);

    /* Calculate the size of shared memory that would be needed */
    shm_size = sizeof(int) * 5 * num_ranks + num_ranks * sizeof(cpu_set_t);

    /* STEP 1. Create shared memory region for exchanging topology information (root only) */
    mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, shm_size, &fd, (void **) &shared_region,
                                            &mapfail_flag);
    if (mpi_errno || mapfail_flag) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        /* NOTE(review): execution continues and shared_region is written below even though
         * the segment may not be usable after a failure here -- confirm this is intended */
    }
    /* STEP 2. Collect cpu_sets for each rank at the root */
    cpu_set_t my_cpu_set;
    CPU_ZERO(&my_cpu_set);
    sched_getaffinity(0, sizeof(my_cpu_set), &my_cpu_set);
    ((cpu_set_t *) (shared_region))[rank] = my_cpu_set;
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* STEP 3. Root has all the cpu_set information, now build tree */
    /* NOTE(review): the allocation checks below jump to fn_fail on the root only, skipping the
     * barriers the other ranks still execute and the cleanup at the end -- confirm */
    if (rank == root) {
        topo_depth = hwloc_topology_get_depth(MPIR_Process.hwloc_topology);
        bind_map = (int **) MPL_malloc(num_ranks * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!bind_map, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < num_ranks; ++i) {
            bind_map[i] = (int *) MPL_calloc(topo_depth, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!bind_map[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_SHM_hwloc_init_bindmap(num_ranks, topo_depth, shared_region, bind_map);
        /* Done building the topology information */

        /* STEP 3.1. Count the maximum entries at each level - used for breaking the tree into
         * intra/inter socket */
        /* the array holds ints, so size the elements as int */
        max_entries_per_level = (int *) MPL_calloc(topo_depth, sizeof(int), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!max_entries_per_level, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_level =
            MPIDI_SHM_topotree_get_package_level(topo_depth, max_entries_per_level, num_ranks,
                                                 bind_map);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            fprintf(stderr, "Breaking topology at :: %d (default= %d)\n", package_level,
                    MPIDI_SHM_TOPOTREE_CUTOFF);

        /* STEP 3.2. allocate space for the entries that go in each package based on hwloc info */
        ranks_per_package =
            (int
             **) MPL_malloc(max_entries_per_level[package_level] * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!ranks_per_package, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_ctr =
            (int *) MPL_calloc(max_entries_per_level[package_level], sizeof(int), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!package_ctr, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            package_ctr[i] = 0;
            ranks_per_package[i] = (int *) MPL_calloc(num_ranks, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!ranks_per_package[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        /* sort the ranks into packages based on the binding information */
        for (i = 0; i < num_ranks; ++i) {
            int package = bind_map[i][package_level];
            ranks_per_package[package][package_ctr[package]++] = i;
        }
        max_ranks_per_package = 0;
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            max_ranks_per_package = MPL_MAX(max_ranks_per_package, package_ctr[i]);
        }
        /* At this point we have done the common work in extracting topology information
         * and restructuring it to our needs. Now we generate the tree. */

        /* For Bcast, package leaders are added before the package local ranks, and the per_package
         * tree is left_skewed */
        mpi_errno = MPIDI_SHM_gen_tree(bcast_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 1 /*package_leaders_first */ ,
                                       0 /*left_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag =
                MPIX_ERR_PROC_FAILED ==
                MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }
    /* Wait until the root has written the bcast tree into shared memory */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* Every rank copies their tree out from shared memory */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, bcast_tree, bcast_topotree_fail);
    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("BCAST", comm_ptr->context_id, rank, bcast_tree);

    /* Wait until shared memory is available */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* Generate the reduce tree */
    /* For Reduce, package leaders are added after the package local ranks, and the per_package
     * tree is right_skewed (children are added in the reverse order */
    if (rank == root) {
        memset(shared_region, 0, shm_size);
        mpi_errno = MPIDI_SHM_gen_tree(reduce_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 0 /*package_leaders_last */ ,
                                       1 /*right_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag =
                MPIX_ERR_PROC_FAILED ==
                MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* Wait until the root has written the reduce tree into shared memory */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* each rank copy the reduce tree out */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, reduce_tree, reduce_topotree_fail);

    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("REDUCE", comm_ptr->context_id, rank, reduce_tree);
    /* Wait for all ranks to copy out the tree */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* Cleanup */
    if (rank == root) {
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            MPL_free(ranks_per_package[i]);
        }
        MPL_free(ranks_per_package);
        MPL_free(package_ctr);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            for (i = 0; i < topo_depth; ++i) {
                fprintf(stderr, "Level :: %d, Max :: %d\n", i, max_entries_per_level[i]);
            }
        for (i = 0; i < num_ranks; ++i) {
            MPL_free(bind_map[i]);
        }
        MPL_free(max_entries_per_level);
        MPL_free(bind_map);
    }
    MPIDIU_destroy_shm_segment(shm_size, &fd, (void **) &shared_region);

  fn_exit:
    /* Errors recorded along the way take precedence over the status of the last call, so a
     * failure in an early phase is not hidden by a later success. */
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    if (rank == root && MPIDI_SHM_TOPOTREE_DEBUG)
        fprintf(stderr, "Done creating tree for %d\n", num_ranks);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    return mpi_errno;
  fn_fail:
    /* fold the error that caused the jump into the accumulated code */
    MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    goto fn_exit;
}
Exemplo n.º 11
0
/* Allocate the memory of a window inside a shared memory segment that is
 * shared by all processes of the window's node communicator.
 *
 * Parameters:
 *   size      - number of bytes this process contributes to the window.
 *   disp_unit - forwarded unchanged to the no-shm fallback and to
 *               MPIDI_CH3I_Win_gather_info.
 *   info      - forwarded unchanged to the same two routines.
 *   comm_ptr  - forwarded unchanged to the same two routines.
 *   base_ptr  - treated as a "void **"; on the shared memory path it receives
 *               (*win_ptr)->base, i.e. this process' slice of the segment
 *               (NULL when the combined segment length is zero).
 *   win_ptr   - window object being set up.  (*win_ptr)->comm_ptr and
 *               (*win_ptr)->info_args.alloc_shared_noncontig are read here.
 *
 * Returns MPI_SUCCESS or the error code of the first failing step.
 *
 * Outline of the shared memory path:
 *   1. allgather every node-local process' size over node_comm;
 *   2. node rank 0 creates the data segment, broadcasts its serialized
 *      handle, and the other ranks attach to it;
 *   3. the same handshake is repeated for the interprocess mutex segment;
 *   4. each process' base address inside the data segment is computed;
 *   5. window information is gathered via MPIDI_CH3I_Win_gather_info and the
 *      window is appended to shm_wins_list.
 *
 * The broadcasts and barriers in the "node_rank == 0" and "else" branches
 * must stay pairwise matched: every process of node_comm issues the same
 * sequence of collectives. */
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    /* Without a node communicator there is nobody to share memory with:
     * hand the whole request to the non-shared-memory implementation. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* When allocating the shared memory segment, we need the communicator of
     * the processes that are on the same node as this process (node_comm).
     * The node_comm == NULL case has already been handed off to the
     * non-shared-memory path above, so node_comm must be non-NULL here. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses of all processes on the node
     * (one pointer per node-local process) */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows of all processes on the node.
     * allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    /* In-place allgather: each process contributes sizeof(MPI_Aint) raw bytes
     * (its own entry of node_sizes). */
    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Total segment length is the sum of all contributions; in noncontig mode
     * each contribution is rounded up with MPIDI_CH3_ROUND_UP_PAGESIZE. */
    (*win_ptr)->shm_segment_len = 0;

    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    /* Nobody asked for memory: no segment is created.  Note that in this
     * branch the entries of shm_base_addrs are not filled in. */
    if ((*win_ptr)->shm_segment_len == 0) {
        (*win_ptr)->base = NULL;
    }

    else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                                (*win_ptr)->shm_segment_len,
                                                (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

        }
        else {
            /* zero-initialized so that strlen() below finds a terminator
             * whenever the received handle is shorter than the buffer */
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
                                     (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* matches rank 0's "wait for other processes to attach" barrier */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocate the interprocess mutex segment (same create / broadcast /
         * attach / barrier / unlink handshake as for the data segment). */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                                sizeof(MPIDI_CH3I_SHM_MUTEX),
                                                (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* only rank 0 initializes the mutex, before anyone else can
             * learn the segment handle */
            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                     sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex,
                                     0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* matches rank 0's "wait for other processes to attach" barrier */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            /* Entry 0 is always the start of the segment, whether or not
             * rank 0 contributed any bytes.  cur_base / cur_rank track the
             * most recent rank that was assigned a non-NULL base address. */
            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous
                     * process that has allocated non-zero bytes of shared
                     * memory.  We can not simply use "i-1" for the
                     * previous process because rank "i-1" might not have
                     * allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    }
                    else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                }
                else {
                    /* processes with a zero-sized contribution get no address */
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    /* hand this process' base address back to the caller */
    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via shared memory region. */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    /* runs on success and on failure; pairs with MPIR_CHKLMEM_MALLOC(node_sizes)
     * NOTE(review): also reached from the early no-shm exit, before any
     * CHKLMEM allocation was made - presumably a no-op then; confirm. */
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    /* pairs with MPIR_CHKPMEM_MALLOC(shm_base_addrs)
     * NOTE(review): shared memory segments created or attached before the
     * failure are not detached here - confirm the caller cleans them up. */
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemplo n.º 12
0
/* Gather the basic window information (base address, size, displacement
 * unit, window handle) of every process in the window's communicator and
 * publish it in a table that lives in a node-local shared memory segment.
 *
 * Parameters:
 *   base      - this process' window base address.
 *   size      - this process' window size.
 *   disp_unit - this process' displacement unit.
 *   info      - only forwarded to MPIDI_CH3U_Win_gather_info on the fallback
 *               path.
 *   comm_ptr  - only forwarded to MPIDI_CH3U_Win_gather_info on the fallback
 *               path; the shared memory path uses (*win_ptr)->comm_ptr.
 *   win_ptr   - window object; on success its basic_info_table points at the
 *               shared table with one MPIDI_Win_basic_info_t per rank.
 *
 * Returns MPI_SUCCESS or the error code of the first failing step.  A
 * collective that reports a failure through errflag is treated as an error
 * ("**coll_fail") for every collective issued here.
 *
 * All processes of node_comm must issue the same sequence of broadcasts and
 * barriers, so the two branches of the "node_rank == 0" test below have to
 * stay pairwise matched. */
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                                      MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    MPIR_Comm *node_comm_ptr = NULL;
    int node_rank;
    int comm_rank, comm_size;
    MPI_Aint *tmp_buf = NULL;
    int i, k;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    /* No node communicator: use the implementation that does not rely on
     * shared memory. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
        goto fn_exit;
    }

    comm_size = (*win_ptr)->comm_ptr->local_size;
    comm_rank = (*win_ptr)->comm_ptr->rank;

    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_rank = node_comm_ptr->rank;

    /* one table entry per rank of the window's communicator */
    (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t);

    mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->info_shm_segment_handle);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        char *serialized_hnd_ptr = NULL;

        /* create shared memory region for all processes in win and map. */
        mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
                                                    (*win_ptr)->info_shm_segment_len,
                                                    (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* serialize handle and broadcast it to the other processes in win */
        mpi_errno =
            MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
                                                &serialized_hnd_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* wait for other processes to attach to win */
        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink shared memory region so it gets deleted when all processes exit */
        mpi_errno = MPL_shm_seg_remove((*win_ptr)->info_shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        /* zero-initialized so that strlen() below finds a terminator
         * whenever the received handle is shorter than the buffer */
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get serialized handle from rank 0 and deserialize it */
        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
                                              strlen(serialized_hnd));
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* attach to shared memory region created by rank 0 */
        mpi_errno =
            MPL_shm_seg_attach((*win_ptr)->info_shm_segment_handle,
                                 (*win_ptr)->info_shm_segment_len,
                                 (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* matches rank 0's "wait for other processes to attach" barrier */
        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    }

    /* the table itself lives at the start of the shared segment */
    (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);

    /* four MPI_Aint slots per rank: base address, size, disp_unit, handle */
    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    tmp_buf[4 * comm_rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * comm_rank + 1] = size;
    tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    /* in-place exchange over the whole window communicator (not just the node) */
    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
    /* do not publish a table built from a failed exchange; same check as
     * for every other collective in this function */
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    if (node_rank == 0) {
        /* only node_rank == 0 writes results to basic_info_table on shared memory region. */
        k = 0;
        for (i = 0; i < comm_size; i++) {
            (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
            (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
        }
    }

    /* Make sure that all local processes see the results written by node_rank == 0 */
    mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
    /* a failed barrier gives no guarantee that the table is visible */
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

  fn_exit:
    /* runs on success and on failure; pairs with MPIR_CHKLMEM_MALLOC(tmp_buf) */
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemplo n.º 13
0
/* Release a window object and the resources attached to it.
 *
 * win_ptr - address of the window to free.  The window must be outside of
 *           any exposure epoch and its access state must be NONE,
 *           FENCE_ISSUED or FENCE_GRANTED; otherwise MPI_ERR_RMA_SYNC
 *           ("**rmasync") is raised and nothing is freed.
 *
 * Returns MPI_SUCCESS or the error code of the first failing step.
 *
 * (*win_ptr) is deliberately re-read at every use instead of being cached in
 * a local, because the lower-layer win_free hook is handed win_ptr itself. */
int MPID_Win_free(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int still_in_use;
    MPIR_Comm *win_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_WIN_FREE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_WIN_FREE);

    /* Reject the call while a synchronization epoch is still open. */
    MPIR_ERR_CHKANDJUMP(!((*win_ptr)->states.access_state == MPIDI_RMA_NONE ||
                          (*win_ptr)->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
                          (*win_ptr)->states.access_state == MPIDI_RMA_FENCE_GRANTED) ||
                        ((*win_ptr)->states.exposure_state != MPIDI_RMA_NONE),
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Drive the progress engine until this target is completely idle:
     *  - every passive lock has been released (some UNLOCK messages are not
     *    acknowledged to the origin, so the lock state must be observed
     *    here);
     *  - the AT completion counter is back to zero (it is incremented for
     *    each GET-like operation, and such operations may still be running
     *    when the target enters Win_free);
     *  - no lock request is left waiting in the lock queue;
     *  - no lock data bytes and no sync requests are outstanding. */
    for (;;) {
        if ((*win_ptr)->current_lock_type == MPID_LOCK_NONE &&
            (*win_ptr)->at_completion_counter == 0 &&
            (*win_ptr)->target_lock_queue_head == NULL &&
            (*win_ptr)->current_target_lock_data_bytes == 0 &&
            (*win_ptr)->sync_request_cnt == 0) {
            break;
        }

        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    /* Synchronize with the other processes of the window before tearing
     * anything down. */
    mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Let the lower layer release whatever it attached to the window. */
    if (MPIDI_CH3U_Win_hooks.win_free) {
        mpi_errno = MPIDI_CH3U_Win_hooks.win_free(win_ptr);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    /* Take the window off the global inactive list. */
    MPIR_Assert((*win_ptr)->active == FALSE);
    MPL_DL_DELETE(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    /* That was the last window: the RMA progress hook is no longer needed. */
    if (!MPIDI_RMA_Win_inactive_list_head && !MPIDI_RMA_Win_active_list_head) {
        mpi_errno = MPID_Progress_deregister_hook(MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    win_comm = (*win_ptr)->comm_ptr;
    mpi_errno = MPIR_Comm_free_impl(win_comm);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Per-window bookkeeping storage. */
    if ((*win_ptr)->basic_info_table != NULL) {
        MPL_free((*win_ptr)->basic_info_table);
    }
    MPL_free((*win_ptr)->op_pool_start);
    MPL_free((*win_ptr)->target_pool_start);
    MPL_free((*win_ptr)->slots);
    MPL_free((*win_ptr)->target_lock_entry_pool_start);

    MPIR_Assert((*win_ptr)->current_target_lock_data_bytes == 0);

    /* For ALLOCATE and SHARED flavors the window memory is freed here, but
     * only when it was not placed in shared memory and is non-empty. */
    if (((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
         (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) &&
        (*win_ptr)->shm_allocated == FALSE && (*win_ptr)->size > 0) {
        MPL_free((*win_ptr)->base);
    }

    MPIR_Object_release_ref(*win_ptr, &still_in_use);
    /* MPI windows have no reference count semantics, so the reference just
     * dropped must have been the last one. */
    MPIR_Assert(!still_in_use);
    MPIR_Handle_obj_free(&MPIR_Win_mem, *win_ptr);

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_WIN_FREE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}