Example #1
File: mpid_1s.c Project: dbrowneup/pmap
void
MPIDI_Win_datatype_map(MPIDI_Datatype * dt)
{
  if (dt->contig)
    {
      dt->num_contig = 1;
      dt->map = &dt->__map;
      dt->map[0].DLOOP_VECTOR_BUF = (void*)(size_t)dt->true_lb;
      dt->map[0].DLOOP_VECTOR_LEN = dt->size;
    }
  else
    {
      unsigned map_size = dt->pointer->max_contig_blocks*dt->count + 1;
      dt->num_contig = map_size;
      dt->map = (DLOOP_VECTOR*)MPIU_Malloc(map_size * sizeof(DLOOP_VECTOR));
      MPID_assert(dt->map != NULL);

      DLOOP_Offset last = dt->pointer->size*dt->count;
      MPID_Segment seg;
      MPID_Segment_init(NULL, dt->count, dt->type, &seg, 0);
      MPID_Segment_pack_vector(&seg, 0, &last, dt->map, &dt->num_contig);
      MPID_assert((unsigned)dt->num_contig <= map_size);
#ifdef TRACE_ON
      TRACE_ERR("dt->pointer->size=%d  num_contig:  orig=%u  new=%d\n", dt->pointer->size, map_size, dt->num_contig);
      int i;
      for(i=0; i<dt->num_contig; ++i)
        TRACE_ERR("     %d:  BUF=%zu  LEN=%zu\n", i, (size_t)dt->map[i].DLOOP_VECTOR_BUF, (size_t)dt->map[i].DLOOP_VECTOR_LEN);
#endif
    }
}
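The function above flattens a (possibly non-contiguous) datatype into an array of (address, length) pairs: one entry when the type is contiguous, up to max_contig_blocks*count+1 entries otherwise. Below is a standalone sketch of how such a map is consumed, using a hypothetical iovec_chunk_t in place of the real DLOOP_VECTOR type; the names are illustrative, not the pamid ones.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for DLOOP_VECTOR: one contiguous chunk. */
typedef struct { void *buf; size_t len; } iovec_chunk_t;

/* Walk a chunk map, e.g. to issue one transfer per contiguous block. */
static size_t total_bytes(const iovec_chunk_t *map, int num_contig)
{
  size_t sum = 0;
  int i;
  for (i = 0; i < num_contig; ++i)
    sum += map[i].len;   /* each entry describes one contiguous block */
  return sum;
}

int main(void)
{
  char data[64];
  iovec_chunk_t map[2] = { { data, 16 }, { data + 32, 16 } };
  printf("%zu bytes in %d blocks\n", total_bytes(map, 2), 2);
  return 0;
}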
Example #2
static pami_result_t
MPIDI_Fetch_and_op_using_pami_rmw(pami_context_t   context,
                                  void           * _req)
{
    MPIDI_Win_request *req = (MPIDI_Win_request*)_req;
    pami_result_t rc;
    int  target_rank;  
  
    MPID_assert(req != NULL);
    target_rank = req->target.rank;

    pami_rmw_t params;
    params           = zero_rmw_parms;
    params.dest      = req->dest;
    params.cookie    = (void *)req;
    params.done_fn   = MPIDI_Win_DoneCB;
    params.type      = req->pami_datatype;
    params.operation = req->pami_op;
    params.local     = req->user_buffer;  /* result */
    params.remote    = req->win->mpid.info[target_rank].base_addr + req->offset + (size_t)req->origin.dt.map[0].DLOOP_VECTOR_BUF;
    params.value     = req->buffer;       /* value supplied by the origin */

    rc = PAMI_Rmw(context, &params);
    MPID_assert(rc == PAMI_SUCCESS);
    return rc;
}
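At the MPI level this code path backs MPI_Fetch_and_op when the operation maps onto a native PAMI read-modify-write. A minimal user-level sketch of the call it implements (standard MPI-3 API, independent of pamid; run with two or more ranks):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, win_buf = 0, result = -1, one = 1;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Win_create(&win_buf, sizeof(int), sizeof(int),
                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);

  MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
  /* Atomically add 1 to rank 0's counter; the old value lands in 'result'. */
  MPI_Fetch_and_op(&one, &result, MPI_INT, 0, 0, MPI_SUM, win);
  MPI_Win_unlock(0, win);

  printf("rank %d fetched %d\n", rank, result);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}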
Example #3
static inline int
MPIDI_Get_use_pami_get(pami_context_t context, MPIDI_Win_request * req)
{
  pami_result_t rc;
  pami_get_simple_t params;

  params=zero_get_parms;

  params.rma.dest=req->dest;
  params.rma.hints.use_rdma          = PAMI_HINT_DEFAULT;
#ifndef OUT_OF_ORDER_HANDLING
  params.rma.hints.no_long_header = 1;
#endif
  params.rma.bytes   = 0;
  params.rma.cookie  = req;
  params.rma.done_fn = MPIDI_Win_DoneCB;
  params.addr.local=req->buffer;
  params.addr.remote= req->win->mpid.info[req->target.rank].base_addr;

  struct MPIDI_Win_sync* sync = &req->win->mpid.sync;
  TRACE_ERR("Start       index=%u/%d  l-addr=%p  r-base=%p  r-offset=%zu (sync->started=%u  sync->complete=%u)\n",
	    req->state.index, req->target.dt.num_contig, req->buffer, req->win->mpid.info[req->target.rank].base_addr, req->offset, sync->started, sync->complete);
  while (req->state.index < req->target.dt.num_contig) {
    if (sync->started > sync->complete + MPIDI_Process.rma_pending)
      {
	TRACE_ERR("Bailing out;  index=%u/%d  sync->started=%u  sync->complete=%u\n",
		  req->state.index, req->target.dt.num_contig, sync->started, sync->complete);
	return PAMI_EAGAIN;
      }
    ++sync->started;


    params.rma.bytes          =                       req->target.dt.map[req->state.index].DLOOP_VECTOR_LEN;
    params.addr.local          = req->buffer+req->state.local_offset;
    params.addr.remote         = req->win->mpid.info[req->target.rank].base_addr+ req->offset + (size_t)req->target.dt.map[req->state.index].DLOOP_VECTOR_BUF;

#ifdef TRACE_ON
    unsigned* buf = (unsigned*)(req->buffer + req->state.local_offset);
#endif
    TRACE_ERR("  Sub     index=%u  bytes=%zu  l-offset=%zu  buf=%p  *(int*)buf=0x%08x\n",
	      req->state.index, params.rma.bytes, req->state.local_offset, buf, *buf);
    
    /** sync->total will be updated with every RMA, and sync->complete
	will not change until that RMA has completed; in the meantime
	the rest of the RMAs will have memory leaks */
    rc = PAMI_Get(context, &params);
    MPID_assert(rc == PAMI_SUCCESS);
    if (req->target.dt.num_contig - req->state.index == 1)
        return PAMI_SUCCESS;   /* last chunk issued */
    req->state.local_offset += params.rma.bytes;
    ++req->state.index;
  }
  return PAMI_SUCCESS;
}
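The started/complete pair above implements a sliding window: at most rma_pending operations may be in flight, and the function bails out with PAMI_EAGAIN so it can be re-posted once completions catch up. A standalone sketch of the same counter discipline follows; the names are illustrative, and the subtraction form used here is the wraparound-safe variant of the `started > complete + pending` test in the source.

#include <stdio.h>

/* Illustrative sliding-window check: unsigned subtraction is reduced
   mod 2^32, so the comparison stays correct across counter wraparound. */
static int window_full(unsigned started, unsigned complete, unsigned max_pending)
{
  return (started - complete) > max_pending;  /* in-flight count exceeds limit */
}

int main(void)
{
  unsigned started = 4000000010u, complete = 3999999990u;
  printf("in flight: %u, full: %d\n",
         started - complete, window_full(started, complete, 16));
  return 0;
}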
Example #4
static pami_result_t
MPIDI_Accumulate(pami_context_t   context,
                 void           * _req)
{
  MPIDI_Win_request *req = (MPIDI_Win_request*)_req;
  pami_result_t rc;
  pami_send_t params;

  params = zero_send_parms;
  params.send.header.iov_len = sizeof(MPIDI_Win_MsgInfo);
  params.send.dispatch = MPIDI_Protocols_WinAccum;
  params.send.dest = req->dest;
  params.events.cookie = req;
  params.events.remote_fn = MPIDI_Win_DoneCB;

  struct MPIDI_Win_sync* sync = &req->win->mpid.sync;
  TRACE_ERR("Start       index=%u/%d  l-addr=%p  r-base=%p  r-offset=%zu (sync->started=%u  sync->complete=%u)\n",
            req->state.index, req->target.dt.num_contig, req->buffer, req->win->mpid.info[req->target.rank].base_addr, req->offset, sync->started, sync->complete);
  while (req->state.index < req->target.dt.num_contig) {
    if (sync->started > sync->complete + MPIDI_Process.rma_pending)
      {
        TRACE_ERR("Bailing out;  index=%u/%d  sync->started=%u  sync->complete=%u\n",
                req->state.index, req->target.dt.num_contig, sync->started, sync->complete);
        return PAMI_EAGAIN;
      }
    ++sync->started;


    params.send.header.iov_base = &(((MPIDI_Win_MsgInfo *)req->accum_headers)[req->state.index]);
    params.send.data.iov_len    = req->target.dt.map[req->state.index].DLOOP_VECTOR_LEN;
    params.send.data.iov_base   = req->buffer + req->state.local_offset;

#ifdef TRACE_ON
    void    *  buf = params.send.data.iov_base;
    unsigned* ibuf = (unsigned*)buf;
    double  * dbuf = (double  *)buf;
    TRACE_ERR("  Sub     index=%u  bytes=%zu  l-offset=%zu  r-addr=%p  l-buf=%p  *(int*)buf=0x%08x  *(double*)buf=%g\n",
              req->state.index, params.send.data.iov_len, req->state.local_offset, req->accum_headers[req->state.index].addr, buf, *ibuf, *dbuf);
#endif
    /** sync->total will be updated with every RMA, and sync->complete
	will not change until that RMA has completed; in the meantime
	the rest of the RMAs will have memory leaks */
      rc = PAMI_Send(context, &params);
      MPID_assert(rc == PAMI_SUCCESS);
      if (req->target.dt.num_contig - req->state.index == 1)
          return PAMI_SUCCESS;   /* last chunk issued */
      req->state.local_offset += params.send.data.iov_len;
      ++req->state.index;
  }


  return PAMI_SUCCESS;
}
Example #5
void
MPIDI_WinAtomicCB(pami_context_t    context,
		  void            * cookie,
		  const void      * _hdr,
		  size_t            size,
		  const void      * sndbuf,
		  size_t            sndlen,
		  pami_endpoint_t   sender,
		  pami_recv_t     * recv)
{
  MPIDI_AtomicHeader_t *ahdr = (MPIDI_AtomicHeader_t *) _hdr;
  MPID_assert (ahdr != NULL);
  MPID_assert (sizeof(MPIDI_AtomicHeader_t) == size);
  MPIDI_AtomicHeader_t ack_hdr = *ahdr;

  void *dest_addr = ahdr->remote_addr; 
  int len;       
  len = MPID_Datatype_get_basic_size (ahdr->datatype);

  if (ahdr->atomic_type == MPIDI_WIN_REQUEST_COMPARE_AND_SWAP) {

    //overwrite value with result in ack_hdr
    MPIU_Memcpy(ack_hdr.buf, dest_addr, len);
    
    if (MPIR_Compare_equal (&ahdr->test, dest_addr, ahdr->datatype))
      MPIU_Memcpy(dest_addr, ahdr->buf, len);      
  }    
  else if (ahdr->atomic_type == MPIDI_WIN_REQUEST_FETCH_AND_OP) {
    //overwrite value with result
    MPIU_Memcpy(ack_hdr.buf, dest_addr, len);

    MPI_User_function *uop;
    int one = 1;
    uop = MPIR_OP_HDL_TO_FN(ahdr->op);

    if (ahdr->op == MPI_REPLACE)
      MPIU_Memcpy(dest_addr, ahdr->buf, len);
    else if (ahdr->op == MPI_NO_OP)
      ; /* MPI_NO_OP: leave the target value unchanged */
    else
      (*uop) ((void *)ahdr->buf, dest_addr, &one, &ahdr->datatype);
  }
  else
    MPID_abort();

  pami_send_immediate_t params = {
    .dispatch = MPIDI_Protocols_WinAtomicAck,
    .dest     = sender,
    .header   = {
      .iov_base = &ack_hdr,
      .iov_len  = sizeof(MPIDI_AtomicHeader_t),
    },
    .data     = {
       .iov_base = NULL,
       .iov_len  = 0,
     },
    .hints = {0},
  };

  /* NOTE: the snippet is truncated here in the source listing; the assumed
     continuation is that the ack header is sent back immediately: */
  pami_result_t rc = PAMI_Send_immediate(context, &params);
  MPID_assert(rc == PAMI_SUCCESS);
}
Example #6
/**
 * \brief The callback for a new "zero byte" RZV RTS
 * \param[in]  context      The context on which the message is being received.
 * \param[in]  cookie       Unused
 * \param[in]  _msginfo     The extended header information
 * \param[in]  msginfo_size The size of the extended header information
 * \param[in]  sndbuf       Unused
 * \param[in]  sndlen       Unused
 * \param[in]  sender       The origin endpoint
 * \param[out] recv         Unused
 */
void
MPIDI_RecvRzvCB_zerobyte(pami_context_t    context,
                         void            * cookie,
                         const void      * _msginfo,
                         size_t            msginfo_size,
                         const void      * sndbuf,
                         size_t            sndlen,
                         pami_endpoint_t   sender,
                         pami_recv_t     * recv)
{
  MPID_assert(recv == NULL);
  MPID_assert(sndlen == 0);
  MPIDI_RecvRzvCB_impl (context, sender, _msginfo, msginfo_size, 1);
}
Example #7
/* MSGQUEUE lock is not held */
void
MPIDI_Callback_process_userdefined_dt(pami_context_t      context,
                                      const void        * sndbuf,
                                      size_t              sndlen,
                                      MPID_Request      * rreq)
{
  unsigned dt_contig, dt_size;
  MPID_Datatype *dt_ptr;
  MPI_Aint dt_true_lb;
  MPIDI_Datatype_get_info(rreq->mpid.userbufcount,
                          rreq->mpid.datatype,
                          dt_contig,
                          dt_size,
                          dt_ptr,
                          dt_true_lb);

  /* ----------------------------- */
  /*  Test for truncated message.  */
  /* ----------------------------- */
  if (unlikely(sndlen > dt_size))
    {
#if ASSERT_LEVEL > 0
      MPIDI_Callback_process_trunc(context, rreq, NULL, sndbuf);
      return;
#else
      sndlen = dt_size;
#endif
    }

  /*
   * This is to test that the fields don't need to be
   * initialized.  Remove after this doesn't fail for a while.
   */
  if (likely (dt_contig))
    {
      MPID_assert(rreq->mpid.uebuf    == NULL);
      MPID_assert(rreq->mpid.uebuflen == 0);
      void* rcvbuf = rreq->mpid.userbuf + dt_true_lb;

      memcpy(rcvbuf, sndbuf, sndlen);
      MPIDI_Request_complete(rreq);
      return;
    }

  MPIDI_Request_setCA(rreq, MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE);
  rreq->mpid.uebuflen = sndlen;
  rreq->mpid.uebuf    = (void*)sndbuf;
  MPIDI_RecvDoneCB(context, rreq, PAMI_SUCCESS);
  MPID_Request_release(rreq);
}
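The contiguous fast path above copies into userbuf + dt_true_lb because a datatype's first byte need not sit at offset zero. A small standalone example that queries the true lower bound of a derived type, using only the standard MPI API:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  MPI_Datatype t;
  MPI_Aint true_lb, true_extent;

  MPI_Init(&argc, &argv);
  /* An indexed type whose first block starts 2 ints into the buffer. */
  int blocklen = 3, displ = 2;
  MPI_Type_indexed(1, &blocklen, &displ, MPI_INT, &t);
  MPI_Type_commit(&t);

  MPI_Type_get_true_extent(t, &true_lb, &true_extent);
  printf("true_lb=%ld true_extent=%ld\n", (long)true_lb, (long)true_extent);
  /* A receive into (buf + true_lb) places data exactly where the type begins. */

  MPI_Type_free(&t);
  MPI_Finalize();
  return 0;
}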
Example #8
/* MSGQUEUE lock is not held */
void
MPIDI_Callback_process_trunc(pami_context_t  context,
                             MPID_Request   *rreq,
                             pami_recv_t    *recv,
                             const void     *sndbuf)
{
  rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;

  /* -------------------------------------------------------------- */
  /*  The data is already available, so we can just unpack it now.  */
  /* -------------------------------------------------------------- */
  if (recv)
    {
      MPIDI_Request_setCA(rreq, MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE);
      rreq->mpid.uebuflen = MPIR_STATUS_GET_COUNT(rreq->status);
      rreq->mpid.uebuf    = MPIU_Malloc(MPIR_STATUS_GET_COUNT(rreq->status));
      MPID_assert(rreq->mpid.uebuf != NULL);
      rreq->mpid.uebuf_malloc = mpiuMalloc;

      recv->addr = rreq->mpid.uebuf;
    }
  else
    {
      MPIDI_Request_setCA(rreq, MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE);
      rreq->mpid.uebuflen = MPIR_STATUS_GET_COUNT(rreq->status);
      rreq->mpid.uebuf    = (void*)sndbuf;
      MPIDI_RecvDoneCB(context, rreq, PAMI_SUCCESS);
      MPID_Request_release(rreq);
    }
}
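What this handler turns into at the application level: a receive whose buffer is smaller than the incoming message completes with an MPI_ERR_TRUNCATE-class error. A minimal reproduction (standard MPI; run with two ranks):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, data[4] = {1, 2, 3, 4}, small[2];
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

  if (rank == 0) {
    MPI_Send(data, 4, MPI_INT, 1, 0, MPI_COMM_WORLD);
  } else if (rank == 1) {
    /* Only room for 2 ints: the 4-int message is truncated. */
    int rc = MPI_Recv(small, 2, MPI_INT, 0, 0, MPI_COMM_WORLD,
                      MPI_STATUS_IGNORE);
    printf("recv returned %s\n",
           rc == MPI_SUCCESS ? "MPI_SUCCESS" : "an error (truncation)");
  }
  MPI_Finalize();
  return 0;
}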
Example #9
void
MPIDI_WinCtrlSend(pami_context_t       context,
                  MPIDI_Win_control_t *control,
                  int                  rank,
                  MPID_Win            *win)
{
  pami_task_t  taskid;
  MPIDI_WinLock_info *winLock;
  control->win = win->mpid.info[rank].win;
  control->rank = win->comm_ptr->rank;
  taskid = MPID_VCR_GET_LPID(win->comm_ptr->vcr, rank);

  pami_endpoint_t dest;
  pami_result_t rc;
  rc = PAMI_Endpoint_create(MPIDI_Client,taskid, 0, &dest);
  MPID_assert(rc == PAMI_SUCCESS);

  if ((control->type == MPIDI_WIN_MSGTYPE_UNLOCK) ||
      (control->type == MPIDI_WIN_MSGTYPE_UNLOCKALL)) {
    pami_send_t params = {
      .send   = {
        .dispatch = MPIDI_Protocols_WinCtrl,
        .dest     = dest,
        .header   = {
          .iov_base = control,
          .iov_len  = sizeof(MPIDI_Win_control_t),
        },
      },
      .events = {
        .cookie   = win,
        .local_fn = NULL,
        .remote_fn= MPIDI_WinUnlockDoneCB,
      },
    };
Example #10
void
MPIDI_WinLockReq_proc(pami_context_t              context,
                      const MPIDI_Win_control_t * info,
                      unsigned                    peer)
{
  MPID_Win * win = info->win;
  struct MPIDI_Win_lock* lock = MPL_calloc0(1, struct MPIDI_Win_lock);
  if (info->type == MPIDI_WIN_MSGTYPE_LOCKREQ)
       lock->mtype = MPIDI_REQUEST_LOCK;
  else if (info->type == MPIDI_WIN_MSGTYPE_LOCKALLREQ) {
       lock->mtype = MPIDI_REQUEST_LOCKALL;
       lock->flagAddr = (void *) info->flagAddr;
  }
  lock->rank = info->rank;
  lock->type = info->data.lock.type;

  struct MPIDI_Win_queue* q = &win->mpid.sync.lock.local.requested;
  MPID_assert( (q->head != NULL) ^ (q->tail == NULL) );
  if (q->tail == NULL)
    q->head = lock;
  else
    q->tail->next = lock;
  q->tail = lock;

  MPIDI_WinLockAdvance(context, win);
}
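The requested-lock queue above is a plain singly-linked FIFO with a head/tail pair; the assert checks the invariant that head and tail are either both NULL or both set (the XOR form in the source is equivalent). The same enqueue pattern in isolation, with a hypothetical node type:

#include <assert.h>
#include <stddef.h>

typedef struct node { struct node *next; int payload; } node_t;
typedef struct { node_t *head, *tail; } queue_t;

static void enqueue(queue_t *q, node_t *n)
{
  /* Invariant: head and tail are both NULL (empty) or both non-NULL. */
  assert((q->head != NULL) == (q->tail != NULL));
  n->next = NULL;
  if (q->tail == NULL)
    q->head = n;          /* first element */
  else
    q->tail->next = n;    /* append after current tail */
  q->tail = n;
}

int main(void)
{
  queue_t q = { NULL, NULL };
  node_t a = { NULL, 1 }, b = { NULL, 2 };
  enqueue(&q, &a);
  enqueue(&q, &b);
  return q.head->payload + q.tail->payload == 3 ? 0 : 1;
}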
Example #11
void MPIDI_Request_allocate_pool()
{
  int i;
  MPID_Request *prev, *cur;
  /* batch allocate a linked list of requests */
  MPIU_THREAD_CS_ENTER(HANDLEALLOC,);
  prev = MPIU_Handle_obj_alloc_unsafe(&MPID_Request_mem);
  MPID_assert(prev != NULL);
  prev->mpid.next = NULL;
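  /* note: the loop below assumes MPID_REQUEST_TLS_MAX >= 2; otherwise 'cur' would be used uninitialized */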
  for (i = 1; i < MPID_REQUEST_TLS_MAX; ++i) {
    cur = MPIU_Handle_obj_alloc_unsafe(&MPID_Request_mem);
    MPID_assert(cur != NULL);
    cur->mpid.next = prev;
    prev = cur;
  }
  MPIU_THREAD_CS_EXIT(HANDLEALLOC,);
  MPIDI_Process.request_handles[MPIDI_THREAD_ID()].head = cur;
  MPIDI_Process.request_handles[MPIDI_THREAD_ID()].count += MPID_REQUEST_TLS_MAX;
}
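The pool routine amortizes lock acquisition by grabbing MPID_REQUEST_TLS_MAX objects under one critical section and chaining them into a thread-local free list. A self-contained sketch of the same batching idea, with a generic allocator and illustrative names:

#include <stdlib.h>
#include <stdio.h>

typedef struct req { struct req *next; } req_t;

enum { BATCH = 8 };

/* Build a linked free list of BATCH freshly allocated objects.
   Mirrors the pattern above: allocate, then chain each new node in front. */
static req_t *batch_allocate(void)
{
  req_t *prev = malloc(sizeof *prev), *cur = prev;
  int i;
  if (!prev) return NULL;
  prev->next = NULL;
  for (i = 1; i < BATCH; ++i) {
    cur = malloc(sizeof *cur);
    if (!cur) return prev;   /* a partial batch is still usable */
    cur->next = prev;
    prev = cur;
  }
  return cur;  /* head of the list; valid because BATCH >= 2 */
}

int main(void)
{
  int n = 0;
  req_t *head = batch_allocate(), *p;
  for (p = head; p; p = p->next) ++n;
  printf("%d requests in pool\n", n);
  return 0;
}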
Example #12
int
MPID_Win_set_info(MPID_Win     *win, MPID_Info    *info)
{
    int mpi_errno = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;

    mpi_errno = MPIDI_Win_set_info(win, info);
    MPID_assert(mpi_errno == MPI_SUCCESS);
    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
    return mpi_errno;
}
Example #13
void
MPIDI_RecvShortSyncCB(pami_context_t    context,
                      void            * cookie,
                      const void      * _msginfo,
                      size_t            msginfo_size,
                      const void      * sndbuf,
                      size_t            sndlen,
                      pami_endpoint_t   sender,
                      pami_recv_t     * recv)
{
  MPID_assert(recv == NULL);
  MPID_assert(msginfo_size == sizeof(MPIDI_MsgInfo));
  MPIDI_RecvShortCB(context,
                    _msginfo,
                    sndbuf,
                    sndlen,
                    sender,
                    1);
}
Example #14
static inline int
MPID_Cancel_send_rsm(MPID_Request * sreq)
{
  int flag;
  MPID_assert(sreq != NULL);

  /* ------------------------------------------------- */
  /* Check if we already have a cancel request pending */
  /* ------------------------------------------------- */
  MPIDI_DCMF_Request_cancel_pending(sreq, &flag);
  if (flag)
    return MPI_SUCCESS;

  /* ------------------------------------ */
  /* Try to cancel a send request to self */
  /* ------------------------------------ */
  if (MPID_Request_isSelf(sreq))
    {
      int source     = MPID_Request_getMatchRank(sreq);
      int tag        = MPID_Request_getMatchTag (sreq);
      int context_id = MPID_Request_getMatchCtxt(sreq);
      MPID_Request * rreq = MPIDI_Recvq_FDUR(sreq, source, tag, context_id);
      if (rreq)
        {
          MPID_assert(rreq->partner_request == sreq);
          MPID_Request_release(rreq);
          sreq->status.cancelled = TRUE;
          sreq->cc = 0;
        }
      return MPI_SUCCESS;
    }
  else
    {
      if(!sreq->comm)
        return MPI_SUCCESS;

      MPID_Request_increment_cc(sreq);
      MPIDI_DCMF_postCancelReq(sreq);

      return MPI_SUCCESS;
    }
}
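User-level counterpart of the cancel path above: cancelling a pending send and testing whether the cancellation succeeded. This uses only standard MPI calls (cancelling sends is deprecated as of MPI-4, so treat this purely as an illustration of the semantics; run with at least two ranks):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, payload = 42, flag;
  MPI_Request req;
  MPI_Status status;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (rank == 0) {
    /* No matching receive is ever posted, so the send can be cancelled. */
    MPI_Isend(&payload, 1, MPI_INT, 1, 99, MPI_COMM_WORLD, &req);
    MPI_Cancel(&req);
    MPI_Wait(&req, &status);
    MPI_Test_cancelled(&status, &flag);
    printf("send cancelled: %d\n", flag);
  }
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}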
Example #15
static void
MPIDI_Win_GetAccumSendAck(pami_context_t   context,
			  void           * _info,
			  pami_result_t    result)
{
  MPIDI_Win_GetAccMsgInfo *msginfo = (MPIDI_Win_GetAccMsgInfo *) _info;  
  pami_result_t rc = PAMI_SUCCESS;

  //Copy from msginfo->addr to a contiguous buffer
  char *buffer = NULL;

  buffer      = MPIU_Malloc(msginfo->size);
  MPID_assert(buffer != NULL);
  
  if (msginfo->num_contig == 1)
    memcpy(buffer, msginfo->addr, msginfo->size);
  else
    {
      int mpi_errno = 0;
      mpi_errno = MPIR_Localcopy(msginfo->addr,
                                 msginfo->count,
                                 msginfo->type,
                                 buffer,
                                 msginfo->size,
                                 MPI_CHAR);
      MPID_assert(mpi_errno == MPI_SUCCESS);      
    }

  //Schedule sends to source to result buffer and trigger completion
  //callback there
  pami_send_t params = {
    .send = {
      .header = {
	 .iov_base = msginfo,
	 .iov_len = sizeof(MPIDI_Win_GetAccMsgInfo),
       },
      .dispatch = MPIDI_Protocols_WinGetAccumAck,
      .dest     = msginfo->src_endpoint,
    },
    .events = {
Example #16
int
MPIDI_Win_allgather( MPI_Aint size, MPID_Win **win_ptr )
{
  int mpi_errno = MPI_SUCCESS;
  MPID_Win *win;
  int rank;
  MPID_Comm *comm_ptr;
  size_t length_out = 0;
  pami_result_t rc;
  MPIDI_Win_info  *winfo;
  static char FCNAME[] = "MPIDI_Win_allgather";

  win = *win_ptr;
  comm_ptr = win->comm_ptr;
  rank = comm_ptr->rank;
  winfo = &win->mpid.info[rank];

  if (size != 0 && win->create_flavor != MPI_WIN_FLAVOR_SHARED)
    {
#ifndef USE_PAMI_RDMA
      if (!MPIDI_Process.mp_s_use_pami_get)
        {
#endif
          /* --------------------------------------- */
          /*  Setup the PAMI sections of the window  */
          /* --------------------------------------- */
          rc = PAMI_Memregion_create(MPIDI_Context[0], win->mpid.info[rank].base_addr, win->size, &length_out, &winfo->memregion);
#ifdef USE_PAMI_RDMA
          MPIU_ERR_CHKANDJUMP((rc != PAMI_SUCCESS), mpi_errno, MPI_ERR_OTHER, "**nomem");
          MPIU_ERR_CHKANDJUMP((win->size < length_out), mpi_errno, MPI_ERR_OTHER, "**nomem");
#else
          if (rc == PAMI_SUCCESS)
            {
              winfo->memregion_used = 1;
              MPID_assert(win->size == length_out);
            }
        }
#endif
    }

  mpir_errflag_t errflag = MPIR_ERR_NONE;
  mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE,
                                  0,
                                  MPI_DATATYPE_NULL,
                                  win->mpid.info,
                                  sizeof(struct MPIDI_Win_info),
                                  MPI_BYTE,
                                  comm_ptr,
                                  &errflag);

fn_fail:
   return mpi_errno;
}
Example #17
void
MPIDI_WinAccumCB(pami_context_t    context,
                 void            * cookie,
                 const void      * _msginfo,
                 size_t            msginfo_size,
                 const void      * sndbuf,
                 size_t            sndlen,
                 pami_endpoint_t   sender,
                 pami_recv_t     * recv)
{
  MPID_assert(recv   != NULL);
  MPID_assert(sndbuf == NULL);
  MPID_assert(msginfo_size == sizeof(MPIDI_Win_MsgInfo));
  MPID_assert(_msginfo != NULL);
  const MPIDI_Win_MsgInfo * msginfo = (const MPIDI_Win_MsgInfo*)_msginfo;

  int null=0;
  pami_type_t         pami_type;
  pami_data_function  pami_op;
  MPIDI_Datatype_to_pami(msginfo->type, &pami_type, msginfo->op, &pami_op, &null);

#ifdef TRACE_ON
  void    *  buf = msginfo->addr;
  unsigned* ibuf = (unsigned*)buf;
  double  * dbuf = (double  *)buf;
  TRACE_ERR("New accum msg:  len=%zu  type=%x  op=%x  l-buf=%p  *(int*)buf=0x%08x  *(double*)buf=%g\n", sndlen, msginfo->type, msginfo->op, buf, *ibuf, *dbuf);
  TRACE_ERR("                PAMI:    type=%p  op=%p\n", pami_type, pami_op);
#endif

  *recv = zero_recv_parms;
  recv->cookie      = NULL;
  recv->local_fn    = NULL;
  recv->addr        = msginfo->addr;
  recv->type        = pami_type;
  recv->offset      = 0;
  recv->data_fn     = pami_op;
  recv->data_cookie = NULL;
}
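The callback above lands accumulate data directly through a PAMI data function. At the application level, the operation it serves looks like this minimal sketch (standard MPI, fence synchronization):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, size, counter = 0, one = 1;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  MPI_Win_create(&counter, sizeof(int), sizeof(int),
                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);

  MPI_Win_fence(0, win);
  /* Every rank adds 1 to the counter exposed by rank 0. */
  MPI_Accumulate(&one, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_SUM, win);
  MPI_Win_fence(0, win);

  if (rank == 0)
    printf("counter=%d (expected %d)\n", counter, size);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}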
Example #18
/**
 * \brief MPI-PAMI glue for the MPI_Win_allocate function
 *
 * Create a window object. Allocates an MPID_Win object and initializes it,
 * then allocates the collective info array, initializes our entry, and
 * performs an Allgather to distribute/collect the rest of the array entries.
 * On each process, it allocates memory of at least size bytes, returns a
 * pointer to it, and returns a window object that can be used by all processes
 * in comm to perform RMA operations. The returned memory consists of size
 * bytes local to each process, starting at address base_ptr, and is associated
 * with the window as if the user called 'MPI_Win_create' on existing memory.
 * The size argument may be different at each process and size = 0 is valid;
 * however, a library might allocate and expose more memory in order to create
 * a fast, globally symmetric allocation.
 *
 * Input Parameters:
 * \param[in]  size      size of window in bytes (nonnegative integer)
 * \param[in]  disp_unit local unit size for displacements, in bytes (positive integer)
 * \param[in]  info      info argument (handle)
 * \param[in]  comm_ptr  Communicator (handle)
 * \param[out] base_ptr  base address of the window in local memory
 * \param[out] win_ptr   window object returned by the call (handle)
 * \return MPI_SUCCESS, MPI_ERR_ARG, MPI_ERR_COMM, MPI_ERR_INFO, MPI_ERR_OTHER,
 *         MPI_ERR_SIZE
 */
int
MPID_Win_allocate(MPI_Aint     size,
                  int          disp_unit,
                  MPID_Info  * info,
                  MPID_Comm  * comm_ptr,
                  void *base_ptr,
                  MPID_Win  ** win_ptr)
{
  int mpi_errno  = MPI_SUCCESS;
  int rc = MPI_SUCCESS;
  mpir_errflag_t errflag = MPIR_ERR_NONE;
  void *baseP; 
  static char FCNAME[] = "MPID_Win_allocate";
  MPIDI_Win_info  *winfo;
  MPID_Win   *win;
  int        rank;

  rc=MPIDI_Win_init(size,disp_unit,win_ptr, info, comm_ptr, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
  win = *win_ptr;

  if (size > 0) {
      baseP = MPIU_Malloc(size);
  #ifndef MPIDI_NO_ASSERT
      MPID_assert(baseP != NULL);
  #else
      MPIU_ERR_CHKANDJUMP((baseP == NULL), mpi_errno, MPI_ERR_BUFFER, "**bufnull");
  #endif

  } else if (size == 0) {
      baseP = NULL;
  } else {
      MPIU_ERR_CHKANDSTMT(size < 0, mpi_errno, MPI_ERR_SIZE,
                          return mpi_errno, "**rmasize");
  }

  win->base = baseP;
  rank = comm_ptr->rank;
  winfo = &win->mpid.info[rank];
  winfo->base_addr = baseP;
  winfo->win = win;
  winfo->disp_unit = disp_unit;

  rc= MPIDI_Win_allgather(size,win_ptr);
  if (rc != MPI_SUCCESS)
      return rc;
  *(void**) base_ptr = (void *) win->base;
  mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);

  fn_fail:
  return mpi_errno;
}
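For reference, the call this glue implements, as seen from an application (standard MPI-3):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, *base = NULL;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* The library allocates the memory and creates the window in one call. */
  MPI_Win_allocate(4 * sizeof(int), sizeof(int), MPI_INFO_NULL,
                   MPI_COMM_WORLD, &base, &win);
  base[0] = rank;  /* window memory is directly usable locally */

  MPI_Win_fence(0, win);
  printf("rank %d: base=%p base[0]=%d\n", rank, (void*)base, base[0]);
  MPI_Win_fence(0, win);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}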
Example #19
/**
 * Insert a request in the OutOfOrderList, make sure this list is
 * arranged in the ascending order.
 */
void MPIDI_Recvq_enqueue_ool(pami_task_t src, MPID_Request *req)
{
  MPID_Request  *q;
  void *head;
  int insert,i;
  MPIDI_In_cntr_t *in_cntr;

  in_cntr=&MPIDI_In_cntr[src];
  if (in_cntr->n_OutOfOrderMsgs != 0) {
    head=in_cntr->OutOfOrderList;
    q=in_cntr->OutOfOrderList;
    insert=0;
    MPID_assert(q->mpid.nextR != NULL);
    while(q->mpid.nextR != head) {
      if (((int)(MPIDI_Request_getMatchSeq(q) - MPIDI_Request_getMatchSeq(req))) > 0) {
        insert=1;
        break;
      }
      q=q->mpid.nextR;
    }
    if (insert) {
      MPIDI_Recvq_insert_ool(q,req);
      if (q == head) { /* 1st element in the list */
        in_cntr->OutOfOrderList=req;
      }
    } else {
      if (((int)(MPIDI_Request_getMatchSeq(q) - MPIDI_Request_getMatchSeq(req))) > 0) {
        MPIDI_Recvq_insert_ool(q,req);
        if (q == head) { /* 1st element in the list */
          in_cntr->OutOfOrderList=req;
        }
      } else {
        MPIDI_Recvq_insert_ool((MPID_Request *)q->mpid.nextR,req);
      }
    }
  } else {   /*  empty list    */
    in_cntr->OutOfOrderList=req;
    req->mpid.prevR=req;
    req->mpid.nextR=req;
  }
  in_cntr->n_OutOfOrderMsgs++;
#if (MPIDI_STATISTICS)
  MPID_NSTAT(mpid_statp->unorderedMsgs);
#endif
} /* void MPIDI_Recvq_enqueue_ool(pami_task_t src, MPID_Request *req) */
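The ordering test `(int)(MPIDI_Request_getMatchSeq(q) - MPIDI_Request_getMatchSeq(req)) > 0` used above is the classic wraparound-safe comparison for fixed-width sequence numbers: the unsigned subtraction is reduced mod 2^32, so the signed cast tells which value is "ahead" as long as the two are less than 2^31 apart. In isolation:

#include <stdio.h>

/* Returns nonzero if sequence number a is "after" b, tolerating
   wraparound of the 32-bit counter (valid while |a-b| < 2^31). */
static int seq_after(unsigned a, unsigned b)
{
  return (int)(a - b) > 0;
}

int main(void)
{
  printf("%d\n", seq_after(5, 3));            /* 1: plain case     */
  printf("%d\n", seq_after(2, 4294967290u));  /* 1: wrapped around */
  printf("%d\n", seq_after(4294967290u, 2));  /* 0: b is ahead     */
  return 0;
}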
Example #20
File: mpid_1s.c Project: dbrowneup/pmap
void
MPIDI_Win_DoneCB(pami_context_t  context,
                 void          * cookie,
                 pami_result_t   result)
{
  MPIDI_Win_request *req = (MPIDI_Win_request*)cookie;
  ++req->win->mpid.sync.complete;

  if ((req->buffer_free) && (req->type == MPIDI_WIN_REQUEST_GET))
    {
      ++req->origin.completed;
      if (req->origin.completed == req->target.dt.num_contig)
        {
          int mpi_errno;
          mpi_errno = MPIR_Localcopy(req->buffer,
                                     req->origin.dt.size,
                                     MPI_CHAR,
                                     req->origin.addr,
                                     req->origin.count,
                                     req->origin.datatype);
          MPID_assert(mpi_errno == MPI_SUCCESS);
          MPID_Datatype_release(req->origin.dt.pointer);
          MPIU_Free(req->buffer);
          req->buffer_free = 0;
        }
    }

  if (req->win->mpid.sync.total == req->win->mpid.sync.complete)
    {
      if (req->buffer_free)
        MPIU_Free(req->buffer);
      if (req->accum_headers)
        MPIU_Free(req->accum_headers);
      MPIU_Free(req);
    }
  MPIDI_Progress_signal();
}
Example #21
void
MPIDI_RecvMsg_Unexp(MPID_Request  * rreq,
                    void          * buf,
                    int             count,
                    MPI_Datatype    datatype)
{
  /* ------------------------------------------------------------ */
  /* message was found in unexpected queue                        */
  /* ------------------------------------------------------------ */
  /* We must acknowledge synchronous send requests                */
  /* The recvnew callback will acknowledge the posted messages    */
  /* Recv functions will ack the messages that are unexpected     */
  /* ------------------------------------------------------------ */
#ifdef MPIDI_TRACE
  MPIDI_In_cntr[(rreq->mpid.partner_id)].R[(rreq->mpid.idx)].matchedInUQ=1;
#endif

  if (MPIDI_Request_isRzv(rreq))
    {
      const unsigned is_sync = MPIDI_Request_isSync(rreq);
      const unsigned is_zero = (rreq->mpid.envelope.length==0);

      /* -------------------------------------------------------- */
      /* Received an expected flow-control rendezvous RTS.        */
      /*     This is very similar to the found/incomplete case    */
      /* -------------------------------------------------------- */
      if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN)
        {
          MPID_Datatype_get_ptr(datatype, rreq->mpid.datatype_ptr);
          MPID_Datatype_add_ref(rreq->mpid.datatype_ptr);
        }

      if (likely((is_sync+is_zero) == 0))
        MPIDI_Context_post(MPIDI_Context_local(rreq), &rreq->mpid.post_request, MPIDI_RendezvousTransfer, rreq);
      else if (is_sync != 0)
        MPIDI_Context_post(MPIDI_Context_local(rreq), &rreq->mpid.post_request, MPIDI_RendezvousTransfer_SyncAck, rreq);
      else
        MPIDI_Context_post(MPIDI_Context_local(rreq), &rreq->mpid.post_request, MPIDI_RendezvousTransfer_zerobyte, rreq);
    }
  else 
    {
     if (MPID_cc_is_complete(&rreq->cc))
     {
      if (unlikely(MPIDI_Request_isSync(rreq)))
      {
        /* Post this to the context for asynchronous progress. We cannot do
         * the send-immediate inline here because we may not have the
         * context locked (it is being asynchronously advanced).
         * Must "uncomplete" the message (increment the ref and completion counts) so we
         * hold onto this request object until this send has completed.  When MPIDI_SyncAck_handoff
         * finishes sending the ack, it will complete the request, decrementing the ref and
         * completion counts.
         */
        MPIDI_Request_uncomplete(rreq);
        MPIDI_Send_post(MPIDI_SyncAck_handoff, rreq);
      }
      /* -------------------------------- */
      /* request is complete              */
      /* -------------------------------- */
      if (rreq->mpid.uebuf != NULL)
        {
          if (likely(rreq->status.cancelled == FALSE))
            {
              MPIDI_msg_sz_t _count=0;
              MPIDI_Buffer_copy(rreq->mpid.uebuf,
                                rreq->mpid.uebuflen,
                                MPI_CHAR,
                                &rreq->status.MPI_ERROR,
                                buf,
                                count,
                                datatype,
                                &_count,
                                &rreq->status.MPI_ERROR);
              rreq->status.count = _count;
            }
        }
      else
        {
          MPID_assert(rreq->mpid.uebuflen == 0);
          rreq->status.count = 0;
        }
     }
     else
     {
      /* -------------------------------- */
      /* request is incomplete            */
      /* -------------------------------- */
      if (unlikely(MPIDI_Request_isSync(rreq)))
        {
          /* Post this to the context for asynchronous progress. We cannot do
           * the send-immediate inline here because we may not have the
           * context locked (it is being asynchronously advanced).
           * Must "uncomplete" the message (increment the ref and completion counts) so we
           * hold onto this request object until this send has completed.  When MPIDI_SyncAck_handoff
           * finishes sending the ack, it will complete the request, decrementing the ref and
           * completion counts.
           */
          MPIDI_Request_uncomplete(rreq);
          MPIDI_Send_post(MPIDI_SyncAck_handoff, rreq);
        }
      if(rreq->status.cancelled == FALSE)
        {
          MPIDI_Request_setCA(rreq, MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE);
        }
      if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN)
        {
          MPID_Datatype_get_ptr(datatype, rreq->mpid.datatype_ptr);
          MPID_Datatype_add_ref(rreq->mpid.datatype_ptr);
        }
     }
    }
}
Example #22
/**
 * \brief MPID buffer copy
 *
 * Implements non-contiguous buffers correctly.
 *
 * \param[in]  sbuf       The address of the input buffer
 * \param[in]  scount     The number of elements in that buffer
 * \param[in]  sdt        The datatype of those elements
 * \param[out] smpi_errno Returns errors
 * \param[in]  rbuf       The address of the output buffer
 * \param[in]  rcount     The number of elements in that buffer
 * \param[in]  rdt        The datatype of those elements
 * \param[out] rsz        The size of the output data
 * \param[out] rmpi_errno Returns errors
 */
void MPIDI_Buffer_copy(
    const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt,                       int * smpi_errno,
          void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz, int * rmpi_errno)
{
    int sdt_contig;
    int rdt_contig;
    MPI_Aint sdt_true_lb, rdt_true_lb;
    MPIDI_msg_sz_t sdata_sz;
    MPIDI_msg_sz_t rdata_sz;
    MPID_Datatype * sdt_ptr;
    MPID_Datatype * rdt_ptr;

    MPI_Aint  sdt_extent;
    MPI_Aint  rdt_extent;

    *smpi_errno = MPI_SUCCESS;
    *rmpi_errno = MPI_SUCCESS;

    /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */
    /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */

    MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb);
    MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb);

    /* --BEGIN ERROR HANDLING-- */
    if (sdata_sz > rdata_sz)
    {
        *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", sdata_sz, rdata_sz );
        sdata_sz = rdata_sz;
    }
    /* --END ERROR HANDLING-- */

    if (sdata_sz == 0)
    {
        *rsz = 0;
        goto fn_exit;
    }

    if (sdt_contig && rdt_contig)
    {
#if CUDA_AWARE_SUPPORT
      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
      {
        cudaError_t cudaerr = cudaMemcpy((char*)rbuf + rdt_true_lb, (const char*)sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice);
      }
      else
#endif
        memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz);
      *rsz = sdata_sz;
    }
    else if (sdt_contig)
    {
#if CUDA_AWARE_SUPPORT
      // This will need to be done in two steps:
      // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it.
      // 2 - Copy unpacked data into user buffer from temp buffer.
      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
      {
        MPID_Datatype_get_extent_macro(rdt, rdt_extent);
        char *buf =  MPL_malloc(rdt_extent * rcount);
        memset(buf, 0, rdt_extent * rcount);        
        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(buf, rcount, rdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

        *rsz = last;

        cudaError_t cudaerr = cudaMemcpy((char*)rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice);

        MPL_free(buf);

        goto fn_exit;

      }
#endif

        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(rbuf, rcount, rdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

        *rsz = last;
    }
    else if (rdt_contig)
    {
        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(sbuf, scount, sdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

        *rsz = last;
    }
    else
    {
        char * buf;
        MPIDI_msg_sz_t buf_off;
        MPID_Segment sseg;
        MPIDI_msg_sz_t sfirst;
        MPID_Segment rseg;
        MPIDI_msg_sz_t rfirst;

        buf = MPL_malloc(MPIDI_COPY_BUFFER_SZ);
        /* --BEGIN ERROR HANDLING-- */
        if (buf == NULL)
        {
            *smpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
            *rmpi_errno = *smpi_errno;
            *rsz = 0;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */

        MPID_Segment_init(sbuf, scount, sdt, &sseg, 0);
        MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0);

        sfirst = 0;
        rfirst = 0;
        buf_off = 0;

        for(;;)
        {
            DLOOP_Offset last;
            char * buf_end;

            if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off)
            {
                last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off);
            }
            else
            {
                last = sdata_sz;
            }

            MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > sfirst);
            /* --END ERROR HANDLING-- */

            buf_end = buf + buf_off + (last - sfirst);
            sfirst = last;

            MPID_Segment_unpack(&rseg, rfirst, &last, buf);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > rfirst);
            /* --END ERROR HANDLING-- */

            rfirst = last;

            if (rfirst == sdata_sz)
            {
                /* successful completion */
                break;
            }

            /* --BEGIN ERROR HANDLING-- */
            if (sfirst == sdata_sz)
            {
                /* datatype mismatch -- remaining bytes could not be unpacked */
                *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
                break;
            }
            /* --END ERROR HANDLING-- */

            buf_off = sfirst - rfirst;
            if (buf_off > 0)
            {
                memmove(buf, buf_end - buf_off, buf_off);
            }
        }

        *rsz = rfirst;
        MPL_free(buf);
    }

  fn_exit:
    return;
}
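MPIDI_Buffer_copy is effectively a pipelined pack/unpack through a bounded scratch buffer. The single-shot version of the same idea, expressed with the public MPI API, is MPI_Pack followed by MPI_Unpack:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  /* Source: every other int out of 8; destination: 4 contiguous ints. */
  int src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[4] = {0};
  MPI_Datatype strided;
  MPI_Type_vector(4, 1, 2, MPI_INT, &strided);
  MPI_Type_commit(&strided);

  char scratch[64];
  int pos = 0;
  MPI_Pack(src, 1, strided, scratch, sizeof scratch, &pos, MPI_COMM_SELF);
  pos = 0;
  MPI_Unpack(scratch, sizeof scratch, &pos, dst, 4, MPI_INT, MPI_COMM_SELF);

  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 0 2 4 6 */

  MPI_Type_free(&strided);
  MPI_Finalize();
  return 0;
}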
Example #23
void MPIDI_Coll_comm_create(MPID_Comm *comm)
{
   volatile int geom_init = 1;
   int i;
   MPIDI_Post_geom_create_t geom_post;

  TRACE_ERR("MPIDI_Coll_comm_create enter\n");
  if (!MPIDI_Process.optimized.collectives)
    return;

  if(comm->comm_kind != MPID_INTRACOMM) return;
  /* Create a geometry */

  comm->coll_fns = MPIU_Calloc0(1, MPID_Collops);
  MPID_assert(comm->coll_fns != NULL);

   if(comm->mpid.geometry != MPIDI_Process.world_geometry)
   {
      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
         fprintf(stderr,"world geom: %p parent geom: %p\n", MPIDI_Process.world_geometry, comm->mpid.parent);
      TRACE_ERR("Creating subgeom\n");
      /* Change to this at some point */

      comm->mpid.tasks = NULL;
      for(i=1;i<comm->local_size;i++)
      {
        /* only if sequential tasks should we use a (single) range.
           Multi or reordered ranges are inefficient */
        if(MPID_VCR_GET_LPID(comm->vcr, i) != (MPID_VCR_GET_LPID(comm->vcr, i-1) + 1)) {
        /* not sequential, use tasklist */
          MPID_VCR_GET_LPIDS(comm, comm->mpid.tasks);
          break;
        }
      }
      /* Should we use a range? (no task list set) */
      if(comm->mpid.tasks == NULL)
      {
         /* one range, {first rank ... last rank} */
         comm->mpid.range.lo = MPID_VCR_GET_LPID(comm->vcr, 0);
         comm->mpid.range.hi = MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1);
      }

      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
         fprintf(stderr,"create geometry tasks %p {%u..%u}\n", comm->mpid.tasks, MPID_VCR_GET_LPID(comm->vcr, 0),MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1));

      pami_configuration_t config[3];
      size_t numconfigs = 0;
#ifdef HAVE_PAMI_GEOMETRY_NONCONTIG
      config[0].name = PAMI_GEOMETRY_NONCONTIG;
      if(MPIDI_Process.optimized.memory & MPID_OPT_LVL_NONCONTIG) 
         config[0].value.intval = 0; // Disable non-contig, pamid doesn't use pami for non-contig data collectives
      else
         config[0].value.intval = 1; // Enable non-contig even though pamid doesn't use pami for non-contig data collectives, 
                                     // we still possibly want those collectives for other reasons.
      ++numconfigs;
#endif
      if(MPIDI_Process.optimized.subcomms)
      {
         config[numconfigs].name = PAMI_GEOMETRY_OPTIMIZE;
         config[numconfigs].value.intval = 1; 
         ++numconfigs;
      }
#ifdef HAVE_PAMI_GEOMETRY_MEMORY_OPTIMIZE
      if(MPIDI_Process.optimized.memory) 
      {
         config[numconfigs].name = PAMI_GEOMETRY_MEMORY_OPTIMIZE;
         config[numconfigs].value.intval = MPIDI_Process.optimized.memory; /* level of optimization */
         ++numconfigs;
      }
#endif

      if((MPIDI_Process.optimized.memory  & MPID_OPT_LVL_IRREG) && (comm->local_size & (comm->local_size-1)))
      {
         /* Don't create irregular geometries.  Fallback to MPICH only collectives */
         geom_init = 0;
         comm->mpid.geometry = PAMI_GEOMETRY_NULL;
      }
      else if(comm->mpid.tasks == NULL)
      {   
         geom_post.client = MPIDI_Client;
         geom_post.configs = config;
         geom_post.context_offset = 0; /* TODO BES investigate */
         geom_post.num_configs = numconfigs;
         geom_post.newgeom = &comm->mpid.geometry;
         geom_post.parent = PAMI_GEOMETRY_NULL;
         geom_post.id     = comm->context_id;
         geom_post.ranges = &comm->mpid.range;
         geom_post.tasks  = NULL;
         geom_post.count  = (size_t)1;
         geom_post.fn = geom_create_cb_done;
         geom_post.cookie = (void*)&geom_init;

         TRACE_ERR("%s geom_rangelist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
         MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                            geom_rangelist_create_wrapper, (void *)&geom_post);
      }
      else
      {
         geom_post.client = MPIDI_Client;
         geom_post.configs = config;
         geom_post.context_offset = 0; /* TODO BES investigate */
         geom_post.num_configs = numconfigs;
         geom_post.newgeom = &comm->mpid.geometry;
         geom_post.parent = PAMI_GEOMETRY_NULL;
         geom_post.id     = comm->context_id;
         geom_post.ranges = NULL;
         geom_post.tasks = comm->mpid.tasks;
         geom_post.count = (size_t)comm->local_size;
         geom_post.fn = geom_create_cb_done;
         geom_post.cookie = (void*)&geom_init;

         TRACE_ERR("%s geom_tasklist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
         MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                            geom_tasklist_create_wrapper, (void *)&geom_post);
      }

      TRACE_ERR("Waiting for geom create to finish\n");
      MPID_PROGRESS_WAIT_WHILE(geom_init);

      if(comm->mpid.geometry == PAMI_GEOMETRY_NULL)
      {
         if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
            fprintf(stderr,"Created unoptimized communicator id=%u, size=%u\n", (unsigned) comm->context_id,comm->local_size);
         MPIU_TestFree(&comm->coll_fns);
         return;
      }
   }
   /* Initialize the async flow control in case it will be used. */
   comm->mpid.num_requests = MPIDI_Process.optimized.num_requests;

   TRACE_ERR("Querying protocols\n");
   /* Determine what protocols are available for this comm/geom */
   /* These two functions moved to mpid_collselect.c */
   MPIDI_Comm_coll_query(comm);
   MPIDI_Comm_coll_envvars(comm);
   if(MPIDI_Process.optimized.select_colls)
      MPIDI_Comm_coll_select(comm);
   TRACE_ERR("mpir barrier\n");
   int mpierrno = FALSE;
   /* Switch to comm->coll_fns->fn() */
   MPIDO_Barrier(comm, &mpierrno);


  TRACE_ERR("MPIDI_Coll_comm_create exit\n");
}
Example #24
/* MSGQUEUE lock must be held by caller */
void
MPIDI_Callback_process_unexp(MPID_Request *newreq,
			     pami_context_t        context,
                             const MPIDI_MsgInfo * msginfo,
                             size_t                sndlen,
                             pami_endpoint_t       sender,
                             const void          * sndbuf,
                             pami_recv_t         * recv,
                             unsigned              isSync)
{
  MPID_Request *rreq = NULL;

  /* ---------------------------------------------------- */
  /*  Fallback position:                                  */
  /*     + Request was not posted, or                     */
  /*     + Request was long & not contiguous.             */
  /*  We must allocate enough space to hold the message.  */
  /*  The temporary buffer will be unpacked later.        */
  /* ---------------------------------------------------- */
  unsigned rank       = msginfo->MPIrank;
  unsigned tag        = msginfo->MPItag;
  unsigned context_id = msginfo->MPIctxt;
#ifndef OUT_OF_ORDER_HANDLING
  rreq = MPIDI_Recvq_AEU(newreq, rank, tag, context_id);
#else
  unsigned msg_seqno  = msginfo->MPIseqno;
  rreq = MPIDI_Recvq_AEU(newreq, rank, PAMIX_Endpoint_query(sender), tag, context_id, msg_seqno);
#endif
  /* ---------------------- */
  /*  Copy in information.  */
  /* ---------------------- */
  rreq->status.MPI_SOURCE = rank;
  rreq->status.MPI_TAG    = tag;
  MPIR_STATUS_SET_COUNT(rreq->status, sndlen);
  MPIDI_Request_setCA          (rreq, MPIDI_CA_COMPLETE);
  MPIDI_Request_cpyPeerRequestH(rreq, msginfo);
  MPIDI_Request_setSync        (rreq, isSync);

  /* Set the rank of the sender if a sync msg. */
#ifndef OUT_OF_ORDER_HANDLING
  if (isSync)
    {
#endif
      MPIDI_Request_setPeerRank_comm(rreq, rank);
      MPIDI_Request_setPeerRank_pami(rreq, PAMIX_Endpoint_query(sender));
#ifndef OUT_OF_ORDER_HANDLING
    }
#endif

  MPID_assert(!sndlen || rreq->mpid.uebuf != NULL);
  TRACE_MEMSET_R(PAMIX_Endpoint_query(sender),msg_seqno,recv_status);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),msgid,msginfo->MPIseqno);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),rtag,tag);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),rctx,msginfo->MPIctxt);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),rlen,sndlen);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),fl.f.sync,isSync);
  TRACE_SET_R_VAL(PAMIX_Endpoint_query(sender),(msginfo->MPIseqno & SEQMASK),rsource,PAMIX_Endpoint_query(sender));
  TRACE_SET_REQ_VAL(rreq->mpid.idx,(msginfo->MPIseqno & SEQMASK));

  if (recv != NULL)
    {
      recv->local_fn = MPIDI_RecvDoneCB_mutexed;
      recv->cookie   = rreq;
      /* -------------------------------------------------- */
      /*  Let PAMI know where to put the rest of the data.  */
      /* -------------------------------------------------- */
      recv->addr = rreq->mpid.uebuf;
    }
  else
    {
      /* ------------------------------------------------- */
      /*  We have the data; copy it and complete the msg.  */
      /* ------------------------------------------------- */
      memcpy(rreq->mpid.uebuf, sndbuf,   sndlen);
      MPIDI_RecvDoneCB(context, rreq, PAMI_SUCCESS);
      /* caller must release rreq, after unlocking MSGQUEUE */
    }
}
Example #25
/**
 * \brief The callback for a new RZV RTS
 * \note  Because this is a short message, the data is already received
 * \param[in]  context      The context on which the message is being received.
 * \param[in]  sender       The origin endpoint
 * \param[in]  _msginfo     The extended header information
 * \param[in]  msginfo_size The size of the extended header information
 * \param[in]  is_zero_byte The rendezvous message is zero bytes in length.
 */
void
MPIDI_RecvRzvCB_impl(pami_context_t    context,
                     pami_endpoint_t   sender,
                     const void      * _msginfo,
                     size_t            msginfo_size,
                     const unsigned    is_zero_byte)
{
  MPID_assert(_msginfo != NULL);
  MPID_assert(msginfo_size == sizeof(MPIDI_MsgEnvelope));
  const MPIDI_MsgEnvelope * envelope = (const MPIDI_MsgEnvelope *)_msginfo;
  const MPIDI_MsgInfo * msginfo = (const MPIDI_MsgInfo *)&envelope->msginfo;

  MPID_Request * rreq = NULL;
  int found;
  pami_task_t source;
#if TOKEN_FLOW_CONTROL
  int  rettoks=0;
#endif

  /* -------------------- */
  /*  Match the request.  */
  /* -------------------- */
  unsigned rank       = msginfo->MPIrank;
  unsigned tag        = msginfo->MPItag;
  unsigned context_id = msginfo->MPIctxt;

  MPID_Request *newreq = MPIDI_Request_create2();
  MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
  source = PAMIX_Endpoint_query(sender);
  MPIDI_Receive_tokens(msginfo,source);
#ifndef OUT_OF_ORDER_HANDLING
  rreq = MPIDI_Recvq_FDP_or_AEU(newreq, rank, tag, context_id, &found);
#else
  rreq = MPIDI_Recvq_FDP_or_AEU(newreq, rank, source, tag, context_id, msginfo->MPIseqno, &found);
#endif
  TRACE_ERR("RZV CB for req=%p remote-mr=0x%llx bytes=%zu (%sfound)\n",
            rreq,
            *(unsigned long long*)&envelope->envelope.memregion,
            envelope->envelope.length,
            found?"":"not ");

  /* ---------------------- */
  /*  Copy in information.  */
  /* ---------------------- */
  rreq->status.MPI_SOURCE = rank;
  rreq->status.MPI_TAG    = tag;
  MPIR_STATUS_SET_COUNT(rreq->status, envelope->length);
  MPIDI_Request_setPeerRank_comm(rreq, rank);
  MPIDI_Request_setPeerRank_pami(rreq, source);
  MPIDI_Request_cpyPeerRequestH (rreq, msginfo);
  MPIDI_Request_setSync         (rreq, msginfo->isSync);
  MPIDI_Request_setRzv          (rreq, 1);

  /* ----------------------------------------------------- */
  /* Save the rendezvous information for when the target   */
  /* node calls a receive function and the data is         */
  /* retrieved from the origin node.                       */
  /* ----------------------------------------------------- */
  if (is_zero_byte)
    {
      rreq->mpid.envelope.length = 0;
      rreq->mpid.envelope.data   = NULL;
    }
  else
    {
#ifdef USE_PAMI_RDMA
      memcpy(&rreq->mpid.envelope.memregion,
             &envelope->memregion,
             sizeof(pami_memregion_t));
#else
      rreq->mpid.envelope.memregion_used = envelope->memregion_used;
      if(envelope->memregion_used)
        {
          memcpy(&rreq->mpid.envelope.memregion,
                 &envelope->memregion,
                 sizeof(pami_memregion_t));
        }
      rreq->mpid.envelope.data   = envelope->data;
#endif
      rreq->mpid.envelope.length = envelope->length;
     TRACE_SET_R_VAL(source,(rreq->mpid.idx),req,rreq);
     TRACE_SET_R_VAL(source,(rreq->mpid.idx),rlen,envelope->length);
     TRACE_SET_R_VAL(source,(rreq->mpid.idx),fl.f.sync,msginfo->isSync);
     TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.rzv);
     if (TOKEN_FLOW_CONTROL_ON)
       {
         #if TOKEN_FLOW_CONTROL
         MPIDI_Must_return_tokens(context,source);
         #else
         MPID_assert_always(0);
         #endif
       }
    }
  /* ----------------------------------------- */
  /* figure out target buffer for request data */
  /* ----------------------------------------- */
  if (found)
    {
#if (MPIDI_STATISTICS)
       MPID_NSTAT(mpid_statp->earlyArrivalsMatched);
#endif
      /* --------------------------- */
      /* if synchronized, post ack.  */
      /* --------------------------- */
      if (unlikely(MPIDI_Request_isSync(rreq)))
        MPIDI_SyncAck_post(context, rreq, MPIDI_Request_getPeerRank_pami(rreq));

      MPIU_THREAD_CS_EXIT(MSGQUEUE,0);

      if (is_zero_byte)
        MPIDI_RecvRzvDoneCB_zerobyte(context, rreq, PAMI_SUCCESS);
      else
        {
          MPIDI_RendezvousTransfer(context, rreq);
          TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.sync_com_in_HH);
          TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.matchedInHH);
          TRACE_SET_R_VAL(source,(rreq->mpid.idx),bufadd,rreq->mpid.userbuf);
        }
      MPID_Request_discard(newreq);
    }

  /* ------------------------------------------------------------- */
  /* Request was not posted. */
  /* ------------------------------------------------------------- */
  else
    {
#if (MPIDI_STATISTICS)
       MPID_NSTAT(mpid_statp->earlyArrivals);
#endif
      /*
       * This is to test that the fields don't need to be
       * initialized.  Remove after this doesn't fail for a while.
       */
      MPID_assert(rreq->mpid.uebuf    == NULL);
      MPID_assert(rreq->mpid.uebuflen == 0);
      /* rreq->mpid.uebuf = NULL; */
      /* rreq->mpid.uebuflen = 0; */
#ifdef OUT_OF_ORDER_HANDLING
      if (MPIDI_In_cntr[source].n_OutOfOrderMsgs > 0) {
        MPIDI_Recvq_process_out_of_order_msgs(source, context);
      }
#endif
      MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
    }
  /* ---------------------------------------- */
  /*  Signal that the recv has been started.  */
  /* ---------------------------------------- */
  MPIDI_Progress_signal();
}
Example #26
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   const size_t BCAST_LIMIT =      0x40000000;
   int data_contig, rc;
   void *data_buffer    = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case its total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
			   data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
     fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
	     count,data_size_one, (size_t)data_size_one*(size_t)count, root,buffer);
   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPID_assert(new_count > 0);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
         if ((rc = MPIDO_Bcast(new_buffer,
                               new_count,
                               datatype,
                               root,
                               comm_ptr,
                               mpierrno)) != MPI_SUCCESS)
            return rc;
         new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer,
                         new_count,
                         datatype,
                         root,
                         comm_ptr,
                         mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were 
      handled above so this cast to int should be fine here.  
   */
   const int data_size = (int)data_size_sz;

   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
            "Fatal:  Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
        if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
        {
          my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
          my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
          queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
        }
        else
        {
          return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
        }
      }

      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
         mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast =  mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         /* assign to the outer `result` (a shadowing redeclaration here
            would discard the check result before the bitmask test below) */
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      }
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range if restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((data_size < my_md->range_lo) ||
               (data_size > my_md->range_hi))
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {
                  fprintf(stderr,"message size (%d) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
            /* else: in range, algorithm remains selected */
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

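   /* The root packed its buffer before the bcast; every other rank
      received packed bytes and must unpack them into the user datatype. */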
   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer,         count,     datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("leaving bcast\n");
   return 0;
}
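A minimal standalone sketch of the chunking arithmetic used in the BCAST_LIMIT path above; BCAST_LIMIT and the element size here are illustrative values, not the ones from the actual build:

#include <assert.h>
#include <stdio.h>

/* Hypothetical stand-in for the cap used by MPIDO_Bcast above. */
#define BCAST_LIMIT (1024UL*1024UL)   /* assumed 1 MiB per-bcast limit */

int main(void)
{
  int    data_size_one = 8;       /* bytes per element (e.g. MPI_DOUBLE) */
  int    count         = 300000;  /* user's element count                */
  size_t total         = (size_t)data_size_one * (size_t)count;

  if (total > BCAST_LIMIT) {
    int new_count   = (int)(BCAST_LIMIT / data_size_one); /* elements per chunk  */
    int full_chunks = count / new_count;                  /* loop iterations     */
    int remainder   = count % new_count;                  /* 0 is ok: no-op call */
    assert(new_count > 0);
    printf("%d full chunks of %d elements + %d remainder\n",
           full_chunks, new_count, remainder);
  }
  return 0;
}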
Example #27
0
int
MPIDI_Win_init( MPI_Aint length,
                int disp_unit,
                MPID_Win  **win_ptr,
                MPID_Info  *info,
                MPID_Comm *comm_ptr,
                int create_flavor,
                int model)
{
  int mpi_errno=MPI_SUCCESS;
  size_t rank, size;
  MPIDI_Win_info *winfo;
  static char FCNAME[] = "MPIDI_Win_init";

  /* ----------------------------------------- */
  /*  Setup the common sections of the window  */
  /* ----------------------------------------- */
  MPID_Win *win = (MPID_Win*)MPIU_Handle_obj_alloc(&MPID_Win_mem);

  MPIU_ERR_CHKANDSTMT(win == NULL, mpi_errno, MPI_ERR_NO_MEM,
                     return mpi_errno, "**nomem");

  *win_ptr = win;
  memset(&win->mpid, 0, sizeof(struct MPIDI_Win));
  win->comm_ptr = comm_ptr; MPIR_Comm_add_ref(comm_ptr);
  size = comm_ptr->local_size;
  rank = comm_ptr->rank;

  win->mpid.info = MPIU_Malloc(size * sizeof(struct MPIDI_Win_info));
  MPID_assert(win->mpid.info != NULL);
  memset((void *) win->mpid.info,0,(size * sizeof(struct MPIDI_Win_info)));
  winfo = &win->mpid.info[rank];
  win->errhandler          = NULL;
  win->base                = NULL;
  win->size                = length;
  win->disp_unit           = disp_unit;
  win->create_flavor       = create_flavor;
  win->model               = model;
  win->copyCreateFlavor    = 0;
  win->copyModel           = 0;
  win->attributes          = NULL;
  win->comm_ptr            = comm_ptr;
  if ((info != NULL) && ((int *)info != (int *) MPI_INFO_NULL)) {
      mpi_errno= MPIDI_Win_set_info(win, info);
      MPID_assert(mpi_errno == 0);
  }
  MPID_assert(mpi_errno == 0);


  /* Initialize the info (hint) flags per window */
  win->mpid.info_args.no_locks            = 0;
  win->mpid.info_args.accumulate_ordering =
      (MPIDI_ACCU_ORDER_RAR | MPIDI_ACCU_ORDER_RAW | MPIDI_ACCU_ORDER_WAR | MPIDI_ACCU_ORDER_WAW);
  win->mpid.info_args.accumulate_ops      = MPIDI_ACCU_SAME_OP_NO_OP; /*default */
  win->mpid.info_args.same_size           = 0;
  win->mpid.info_args.alloc_shared_noncontig = 0;

  win->copyDispUnit=0;
  win->copySize=0;
  winfo->memregion_used = 0;
  winfo->disp_unit = disp_unit;

  return mpi_errno;
}
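A hedged sketch of how a window-creation path might call MPIDI_Win_init. The flavor/model constants are the standard MPI-3 values, and the post-call assignments are assumptions inferred from the fields this code initializes and the RMA paths' use of info[rank].base_addr, not code from this project:

/* Hypothetical caller; `length`, `disp_unit`, `base`, `info`, and
   `comm_ptr` are assumed to be in scope. */
MPID_Win *win = NULL;
int rc = MPIDI_Win_init(length, disp_unit, &win, info, comm_ptr,
                        MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
if (rc == MPI_SUCCESS)
{
  win->base = base;                                /* local window memory      */
  win->mpid.info[comm_ptr->rank].base_addr = base; /* assumed: the RMA paths
                                                      read info[rank].base_addr */
}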
Example #28
0
static inline void
MPIDI_RecvShortCB(pami_context_t    context,
                  const void      * _msginfo,
                  const void      * sndbuf,
                  size_t            sndlen,
                  pami_endpoint_t   sender,
                  unsigned          isSync)
{
  MPID_assert(_msginfo != NULL);

  const MPIDI_MsgInfo *msginfo = (const MPIDI_MsgInfo *)_msginfo;
  MPID_Request * rreq = NULL;
  pami_task_t source;
#if TOKEN_FLOW_CONTROL
  int          rettoks=0;
#endif

  /* -------------------- */
  /*  Match the request.  */
  /* -------------------- */
  unsigned rank       = msginfo->MPIrank;
  unsigned tag        = msginfo->MPItag;
  unsigned context_id = msginfo->MPIctxt;

  MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
  source = PAMIX_Endpoint_query(sender);
  MPIDI_Receive_tokens(msginfo,source);
#ifndef OUT_OF_ORDER_HANDLING
  rreq = MPIDI_Recvq_FDP(rank, tag, context_id);
#else
  rreq = MPIDI_Recvq_FDP(rank, source, tag, context_id, msginfo->MPIseqno);
#endif

  /* Match not found */
  if (unlikely(rreq == NULL))
    {
#if (MPIDI_STATISTICS)
         MPID_NSTAT(mpid_statp->earlyArrivals);
#endif
      MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
      MPID_Request *newreq = MPIDI_Request_create2();
      MPID_assert(newreq != NULL);
      if (sndlen)
      {
        newreq->mpid.uebuflen = sndlen;
        if (!TOKEN_FLOW_CONTROL_ON)
          {
            newreq->mpid.uebuf = MPL_malloc(sndlen);
            newreq->mpid.uebuf_malloc = mpiuMalloc;
          }
        else
          {
            #if TOKEN_FLOW_CONTROL
            MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
            newreq->mpid.uebuf = MPIDI_mm_alloc(sndlen);
            newreq->mpid.uebuf_malloc = mpidiBufMM;
            MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
            #else
            MPID_assert_always(0);
            #endif
          }
        MPID_assert(newreq->mpid.uebuf != NULL);
      }
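      /* Re-acquire the queue lock and search again: a matching receive
         may have been posted while the lock was dropped for the
         allocation above. */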
      MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
#ifndef OUT_OF_ORDER_HANDLING
      rreq = MPIDI_Recvq_FDP(rank, tag, context_id);
#else
      rreq = MPIDI_Recvq_FDP(rank, PAMIX_Endpoint_query(sender), tag, context_id, msginfo->MPIseqno);
#endif
      
      if (unlikely(rreq == NULL))
      {
        MPIDI_Callback_process_unexp(newreq, context, msginfo, sndlen, sender, sndbuf, NULL, isSync);
        /* request is always complete now */
        if (TOKEN_FLOW_CONTROL_ON && sndlen)
          {
            #if TOKEN_FLOW_CONTROL
            MPIDI_Token_cntr[source].unmatched++;
            #else
            MPID_assert_always(0);
            #endif
          }
        MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
        MPID_Request_release(newreq);
        goto fn_exit_short;
      }
      else
      {       
        MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
        MPID_Request_discard(newreq);
      }         
    }
  else
    {
#if (MPIDI_STATISTICS)
     MPID_NSTAT(mpid_statp->earlyArrivalsMatched);
#endif
      if (TOKEN_FLOW_CONTROL_ON && sndlen)
        {
          #if TOKEN_FLOW_CONTROL
          MPIDI_Update_rettoks(source);
          MPIDI_Must_return_tokens(context,source);
          #else
          MPID_assert_always(0);
          #endif
        }
      MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
    }

  /* the receive queue processing has been completed and we found a match */

  /* ---------------------- */
  /*  Copy in information.  */
  /* ---------------------- */
  rreq->status.MPI_SOURCE = rank;
  rreq->status.MPI_TAG    = tag;
  MPIR_STATUS_SET_COUNT(rreq->status, sndlen);
  MPIDI_Request_setCA          (rreq, MPIDI_CA_COMPLETE);
  MPIDI_Request_cpyPeerRequestH(rreq, msginfo);
  MPIDI_Request_setSync        (rreq, isSync);
  MPIDI_Request_setRzv         (rreq, 0);

  /* ----------------------------- */
  /*  Request was already posted.  */
  /* ----------------------------- */
  if (unlikely(isSync))
    MPIDI_SyncAck_post(context, rreq, PAMIX_Endpoint_query(sender));

  if (unlikely(HANDLE_GET_KIND(rreq->mpid.datatype) != HANDLE_KIND_BUILTIN))
    {
      MPIDI_Callback_process_userdefined_dt(context, sndbuf, sndlen, rreq);
      goto fn_exit_short;
    }

  size_t dt_size = rreq->mpid.userbufcount * MPID_Datatype_get_basic_size(rreq->mpid.datatype);

  /* ----------------------------- */
  /*  Test for truncated message.  */
  /* ----------------------------- */
  if (unlikely(sndlen > dt_size))
    {
#if ASSERT_LEVEL > 0
      MPIDI_Callback_process_trunc(context, rreq, NULL, sndbuf);
      goto fn_exit_short;
#else
      sndlen = dt_size;
#endif
    }

  MPID_assert(rreq->mpid.uebuf    == NULL);
  MPID_assert(rreq->mpid.uebuflen == 0);
  void* rcvbuf = rreq->mpid.userbuf;

  if (sndlen > 0)
  {
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
    {
      cudaError_t cudaerr = CudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice);
    }
    else
#endif
      memcpy(rcvbuf, sndbuf, sndlen);
  }
  TRACE_SET_R_VAL(source,(rreq->mpid.idx),rlen,sndlen);
  TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.comp_in_HH);
  TRACE_SET_R_VAL(source,(rreq->mpid.idx),bufadd,rreq->mpid.userbuf);
  MPIDI_Request_complete(rreq);

 fn_exit_short:
#ifdef OUT_OF_ORDER_HANDLING
  MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
  if (MPIDI_In_cntr[source].n_OutOfOrderMsgs>0)  {
    MPIDI_Recvq_process_out_of_order_msgs(source, context);
  }
  MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
#endif

  /* ---------------------------------------- */
  /*  Signal that the recv has been started.  */
  /* ---------------------------------------- */
  MPIDI_Progress_signal();
}
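The unexpected-arrival branch above drops the message-queue lock to allocate a request, then re-acquires the lock and searches again before deciding whether the allocation is kept (enqueued as unexpected) or discarded. A self-contained generic sketch of that pattern; the queue and request types are simplified stand-ins, not the MPIDI ones:

#include <pthread.h>
#include <stdlib.h>

typedef struct request { int key; struct request *next; } request_t;

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static request_t *posted_q;      /* receives posted by the application */
static request_t *unexpected_q;  /* arrivals with no posted receive    */

static request_t *find_and_unlink(request_t **q, int key)
{
  for (request_t **p = q; *p; p = &(*p)->next)
    if ((*p)->key == key) { request_t *r = *p; *p = r->next; return r; }
  return NULL;
}

request_t *handle_arrival(int key)
{
  pthread_mutex_lock(&q_lock);
  request_t *r = find_and_unlink(&posted_q, key);
  if (r) { pthread_mutex_unlock(&q_lock); return r; }  /* fast path */

  pthread_mutex_unlock(&q_lock);          /* drop the lock to allocate  */
  request_t *fresh = malloc(sizeof *fresh);

  pthread_mutex_lock(&q_lock);
  r = find_and_unlink(&posted_q, key);    /* re-check: a receive may    */
  if (r) {                                /* have been posted meanwhile */
    pthread_mutex_unlock(&q_lock);
    free(fresh);                          /* lost the race: discard     */
    return r;
  }
  fresh->key  = key;                      /* keep as unexpected arrival */
  fresh->next = unexpected_q;
  unexpected_q = fresh;
  pthread_mutex_unlock(&q_lock);
  return fresh;
}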
Example #29
0
static inline int
MPIDI_Put_use_pami_rput(pami_context_t context, MPIDI_Win_request * req,int *freed)
{
  pami_result_t rc;
  void  *map;
  pami_rput_simple_t params;
  /* params must be zeroed out to avoid passing garbage to PAMI */
  params=zero_rput_parms;

  params.rma.dest=req->dest;
  params.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
  params.rma.hints.use_rdma          = PAMI_HINT_ENABLE;
  params.rma.bytes   = 0;
  params.rma.cookie  = req;
  params.rma.done_fn = NULL;
  params.rdma.local.mr=&req->origin.memregion;
  params.rdma.remote.mr=&req->win->mpid.info[req->target.rank].memregion;
  params.rdma.remote.offset= req->offset;
  params.put.rdone_fn= MPIDI_Win_DoneCB;

  struct MPIDI_Win_sync* sync = &req->win->mpid.sync;
  TRACE_ERR("Start       index=%u/%d  l-addr=%p  r-base=%p  r-offset=%zu (sync->started=%u  sync->complete=%u)\n",
	    req->state.index, req->target.dt.num_contig, req->buffer, req->win->mpid.info[req->target.rank].base_addr, req->offset, sync->started, sync->complete);
  while (req->state.index < req->target.dt.num_contig) {
    if (sync->started > sync->complete + MPIDI_Process.rma_pending)
      {
	TRACE_ERR("Bailing out;  index=%u/%d  sync->started=%u  sync->complete=%u\n",
		  req->state.index, req->target.dt.num_contig, sync->started, sync->complete);
	return PAMI_EAGAIN;
      }
    ++sync->started;


    params.rma.bytes          =                       req->target.dt.map[req->state.index].DLOOP_VECTOR_LEN;
    params.rdma.remote.offset = req->offset + (size_t)req->target.dt.map[req->state.index].DLOOP_VECTOR_BUF;
    params.rdma.local.offset  = req->state.local_offset;
#ifdef TRACE_ON
    unsigned* buf = (unsigned*)(req->buffer + params.rdma.local.offset);
#endif
    TRACE_ERR("  Sub     index=%u  bytes=%zu  l-offset=%zu  r-offset=%zu  buf=%p  *(int*)buf=0x%08x\n",
	      req->state.index, params.rma.bytes, params.rdma.local.offset, params.rdma.remote.offset, buf, *buf);

    /** The done-callback only advances sync->complete as each RMA
	finishes, so the datatype map must stay valid until the last
	chunk is issued; it is freed exactly once, below, to avoid
	leaking it once the request is retired. */
    if (req->target.dt.num_contig - req->state.index == 1) {
         map=NULL;
         if (req->target.dt.map != &req->target.dt.__map) {
             map=(void *) req->target.dt.map;
         }
         rc = PAMI_Rput(context, &params);
         MPID_assert(rc == PAMI_SUCCESS);
         if (map) {
             MPIU_Free(map);
         }
         *freed=1;
         return PAMI_SUCCESS;
    } else {
          rc = PAMI_Rput(context, &params);
          MPID_assert(rc == PAMI_SUCCESS);
          req->state.local_offset += params.rma.bytes;
          ++req->state.index;
    }
  }
  return PAMI_SUCCESS;
}
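As in the Get path shown earlier, the loop above throttles in-flight RMAs with a started/complete counter pair and bails out with PAMI_EAGAIN when the window is full. A minimal sketch of that flow-control check; the names and the cap are illustrative assumptions:

/* Illustrative flow-control window, mirroring sync->started/complete and
   MPIDI_Process.rma_pending above. */
typedef struct {
  unsigned started;   /* RMAs issued so far                 */
  unsigned complete;  /* RMAs whose done-callback has fired */
} rma_sync_t;

enum { RMA_PENDING = 1000 };  /* assumed cap on in-flight RMAs */

/* Nonzero means: stop issuing and retry later (the analogue of the
   PAMI_EAGAIN return above). */
static inline int rma_window_full(const rma_sync_t *sync)
{
  return sync->started > sync->complete + RMA_PENDING;
}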