示例#1
0
/** Optimized implementation of the ARMCI IOV operation that uses a single
  * lock/unlock pair.
  */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
    MPI_Datatype type, int proc) {

  int i;
  gmr_t *mreg;
  void *shr_ptr;

  switch(op) {
    case ARMCII_OP_PUT:
      shr_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      shr_ptr = src[0];
      break;
    case ARMCII_OP_ACC:
      shr_ptr = dst[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(shr_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (i = 0; i < count; i++) {

    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0 
        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
        && i > 0 )
    {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch(op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
示例#2
0
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (remote)
  * @param[in] dst      Destination address (local)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void  *src_buf;
  int    count, type_size, scaled;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed.  We hold the DLA lock if (src_buf == src && src_mreg != NULL). */

  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

  if (scaled) {
      MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);
      ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region */
  if (   (src_buf == src) /* buf_prepare didn't make a copy */
      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) )
  {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0, 
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */

  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_flush(dst_mreg, proc, 1); /* flush_local */

  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}