/** One-sided accumulate operation with typed arguments. Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer (may be MPI_BOTTOM when
  *                      dst_type carries absolute displacements)
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] dst_type  MPI datatype of the destination elements
  * @param[in] proc      Absolute process id of target process
  * @return              0 on success, non-zero on failure
  */
int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
  int grp_proc;
  gmr_size_t disp;
  MPI_Aint lb, extent;

  /* Translate the absolute (world) rank into a rank in the region's group */
  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM)
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  /* MPI_SUM is the only reduction operation used by ARMCI accumulate */
  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window);

  return 0;
}
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (local)
  * @param[in] dst      Destination address (remote, on process proc)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void *src_buf;
  int count, type_size, scaled;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed. We hold the DLA lock if (src_buf == src && src_mreg != NULL).
   */
  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);
  if (scaled) {
    /* Scaling always writes into a fresh private buffer */
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region */
  if ( (src_buf == src) /* buf_prepare didn't make a copy */
       && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) ) {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0,
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */
  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_flush(dst_mreg, proc, 1); /* flush_local */

  /* Release the temporary buffer if scaling or copying allocated one */
  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}
/** Declare the start of a local access epoch.  This allows direct access to
  * data in local memory.
  *
  * @param[in] ptr Pointer to the allocation that will be accessed directly
  */
void ARMCI_Access_begin(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");
  ARMCII_Assert_msg((region->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
                    "Direct local access is not permitted in the current access mode");

  gmr_dla_lock(region);
}
/** Perform an I/O vector operation. Local buffers must be private. * * @param[in] op Operation to be performed (ARMCII_OP_PUT, ...) * @param[in] src Array of source pointers * @param[in] dst Array of destination pointers * @param[in] count Length of pointer arrays * @param[in] size Size of each transfer * @param[in] datatype Data type for accumulate op (ignored for all others) * @param[in] overlapping Do remote regions overlap? * @param[in] same_alloc Do remote regions correspond to the same allocation? * @param[in] proc Target process * @return Zero on success, error code otherwise */ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size, int datatype, int overlapping, int same_alloc, int proc) { MPI_Datatype type; int type_count, type_size; if (op == ARMCII_OP_ACC) { ARMCII_Acc_type_translate(datatype, &type, &type_size); type_count = size/type_size; ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } else { type = MPI_BYTE; MPI_Type_size(type, &type_size); type_count = size/type_size; ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } // CONSERVATIVE CASE: If remote pointers overlap or remote pointers correspond to // multiple allocations, use the safe implementation to avoid invalid MPI // use. if (overlapping || !same_alloc || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_CONSRV) { if (overlapping) ARMCII_Warning("IOV remote buffers overlap\n"); if (!same_alloc) ARMCII_Warning("IOV remote buffers are not within the same allocation\n"); return ARMCII_Iov_op_safe(op, src, dst, count, type_count, type, proc); } // OPTIMIZED CASE: It's safe for us to issue all the operations under a // single lock. 
else if ( ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO ) { if (ARMCII_GLOBAL_STATE.no_mpi_bottom == 1) { return ARMCII_Iov_op_datatype_no_bottom(op, src, dst, count, type_count, type, proc); } else { return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc); } } else if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED) { return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc); } else { ARMCII_Error("unknown iov method (%d)\n", ARMCII_GLOBAL_STATE.iov_method); return 1; } }
/** Query the access mode for the given allocation.  Non-collective.
  *
  * @param[in] ptr Pointer within the allocation.
  * @return        Current access mode.
  */
int ARMCIX_Mode_get(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");

  return region->access_mode;
}
/** Declare the end of a local access epoch.
  *
  * \note MPI-2 does not allow multiple locks at once, so you can have only one
  * access epoch open at a time and cannot do put/get/acc while in an access
  * region.
  *
  * NOTE(review): ARMCI_Access_begin acquires the DLA lock (gmr_dla_lock), but
  * this routine calls gmr_sync() rather than gmr_dla_unlock() -- confirm these
  * two entry points belong to the same code path / runtime version.
  *
  * @param[in] ptr Pointer to the allocation that was accessed directly
  */
void PARMCI_Access_end(void *ptr) {
  gmr_t *mreg;

  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_sync(mreg);
}
/** Optimized implementation of the ARMCI IOV operation that issues all
  * transfers within a single lock/unlock pair, periodically cycling the lock
  * when a batching limit is configured. */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count,
    int elem_count, MPI_Datatype type, int proc) {
  gmr_t *mreg;
  void *remote_ptr;
  int idx;

  /* The remote side of a PUT/ACC is dst; for a GET it is src. */
  switch (op) {
    case ARMCII_OP_PUT:
    case ARMCII_OP_ACC:
      remote_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      remote_ptr = src[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(remote_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (idx = 0; idx < count; idx++) {
    const int limit = ARMCII_GLOBAL_STATE.iov_batched_limit;

    /* Cycle the lock every 'limit' operations to bound the epoch length */
    if (limit > 0 && idx > 0 && idx % limit == 0) {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch (op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[idx], dst[idx], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[idx], dst[idx], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[idx], dst[idx], elem_count, type, proc);
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.  For swap it also
  *                   supplies the value written to the remote location.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int is_long;
  gmr_t *mreg;

  mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG)
    is_long = 1;
  else
    is_long = 0;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    long swap_val_l;
    int swap_val_i;

    /* Atomicity is provided by the region's RMW mutex, not by MPI itself:
     * fetch the old remote value, then overwrite it with *ploc, all while
     * holding the mutex. */
    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &swap_val_l : (void*) &swap_val_i, is_long ? sizeof(long) : sizeof(int), proc);
    PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    /* Only after the remote update completes is the old value stored in ploc */
    if (is_long)
      *(long*) ploc = swap_val_l;
    else
      *(int*) ploc = swap_val_i;
  }
  else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) {
    long fetch_val_l, new_val_l;
    int fetch_val_i, new_val_i;

    /* Classic mutex-protected fetch-and-add: read, add locally, write back */
    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i, is_long ? sizeof(long) : sizeof(int), proc);

    if (is_long)
      new_val_l = fetch_val_l + value;
    else
      new_val_i = fetch_val_i + value;

    PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i, prem, is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;
  }
  else {
    ARMCII_Error("invalid operation (%d)", op);
  }

  return 0;
}
/** Convert an ARMCI strided access description into an MPI subarray datatype.
  *
  * @param[in]  stride_array  Array of strides (in bytes)
  * @param[in]  count         Array of transfer counts; count[0] is in bytes
  * @param[in]  stride_levels Number of levels of striding
  * @param[in]  old_type      Type of the data element described by count and stride_array
  * @param[out] new_type      New MPI type for the given strided access
  */
void ARMCII_Strided_to_dtype(int stride_array[/*stride_levels*/], int count[/*stride_levels+1*/],
    int stride_levels, MPI_Datatype old_type, MPI_Datatype *new_type)
{
  int sizes   [stride_levels+1];
  int subsizes[stride_levels+1];
  int starts  [stride_levels+1];
  int i, old_type_size;

  MPI_Type_size(old_type, &old_type_size);

  /* Eliminate counts that don't count (all 1 counts at the end) */
  for (i = stride_levels+1; i > 0 && stride_levels > 0 && count[i-1] == 1; i--)
    stride_levels--;

  /* A correct strided spec should be monotonic increasing and stride_array[i+1]
     should be a multiple of stride_array[i]. */
  if (stride_levels > 0) {
    for (i = 1; i < stride_levels; i++)
      ARMCII_Assert(stride_array[i] >= stride_array[i-1] && stride_array[i] % stride_array[i-1] == 0);
  }

  /* Test for a contiguous transfer */
  if (stride_levels == 0) {
    int elem_count = count[0]/old_type_size;

    ARMCII_Assert(count[0] % old_type_size == 0);
    MPI_Type_contiguous(elem_count, old_type, new_type);
  }

  /* Transfer is non-contiguous */
  else {
    for (i = 0; i < stride_levels+1; i++)
      starts[i] = 0;

    /* Innermost (fastest-varying) dimension is expressed in elements */
    sizes   [stride_levels] = stride_array[0]/old_type_size;
    subsizes[stride_levels] = count[0]/old_type_size;

    ARMCII_Assert(stride_array[0] % old_type_size == 0 && count[0] % old_type_size == 0);

    for (i = 1; i < stride_levels; i++) {
      /* Convert strides into dimensions by dividing out contributions from lower dims */
      sizes   [stride_levels-i] = stride_array[i]/stride_array[i-1];
      subsizes[stride_levels-i] = count[i];

      ARMCII_Assert_msg(stride_array[i] % stride_array[i-1] == 0, "Invalid striding");
    }

    /* Outermost dimension: full extent equals the transfer count */
    sizes   [0] = count[stride_levels];
    subsizes[0] = count[stride_levels];

    MPI_Type_create_subarray(stride_levels+1, sizes, subsizes, starts, MPI_ORDER_C, old_type, new_type);
  }
}
/** Set the access mode for the given allocation.  Collective across the
  * allocation's group.  Waits for all processes, finishes all communication,
  * and then sets the new access mode.
  *
  * @param[in] new_mode The new access mode.
  * @param[in] ptr      Pointer within the allocation.
  * @param[in] group    Group of the allocation; must match the region's group.
  * @return             Zero upon success, error code otherwise.
  */
int ARMCIX_Mode_set(int new_mode, void *ptr, ARMCI_Group *group) {
  gmr_t *mreg;

  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  ARMCII_Assert(group->comm == mreg->group.comm);

  /* Two asserts so the diagnostic identifies the DLA case specifically; the
   * second (fully unlocked) condition subsumes the first. */
  ARMCII_Assert_msg(mreg->lock_state != GMR_LOCK_DLA,
      "Cannot change the access mode; window is locked for local access.");
  ARMCII_Assert_msg(mreg->lock_state == GMR_LOCK_UNLOCKED,
      "Cannot change the access mode on a window that is locked.");

  // Wait for all processes to complete any outstanding communication before we
  // do the mode switch
  MPI_Barrier(mreg->group.comm);

  mreg->access_mode = new_mode;

  return 0;
}
/** One-sided put operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int ARMCI_Put(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); dst_mreg = gmr_lookup(dst, target); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank) { if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_lock(dst_mreg); if (src_mreg) gmr_dla_lock(src_mreg); } ARMCI_Copy(src, dst, size); if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_unlock(dst_mreg); if (src_mreg) gmr_dla_unlock(src_mreg); } } /* Origin buffer is private */ else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) { gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src, dst, size, target); gmr_unlock(dst_mreg, target); } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *src_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); gmr_dla_lock(src_mreg); ARMCI_Copy(src, src_buf, size); gmr_dla_unlock(src_mreg); gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src_buf, dst, size, target); gmr_unlock(dst_mreg, target); MPI_Free_mem(src_buf); } return 0; }
/** Free a shared memory allocation.  Collective.
  *
  * @param[in] ptr   Pointer to the local patch of the allocation (may be NULL
  *                  when this process contributed zero bytes)
  * @param[in] group Group on which to perform the free
  */
int ARMCI_Free_group(void *ptr, ARMCI_Group *group) {
  gmr_t *mreg = NULL;

  if (ptr == NULL) {
    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "given NULL\n");
  } else {
    mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
  }

  /* gmr_destroy accepts a NULL region and resolves it collectively */
  gmr_destroy(mreg, group);

  return 0;
}
/** Unlock a memory region that was locked for direct local access.
  *
  * @param[in] mreg Memory region
  */
void gmr_dla_unlock(gmr_t *mreg) {
  const int grp_proc =
      ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert(grp_proc >= 0);
  ARMCII_Assert(mreg->lock_state == GMR_LOCK_DLA);
  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
                    "Direct local access is not allowed in the current access mode");

  /* DLA locks nest; only the outermost release drops the MPI lock */
  if (--mreg->dla_lock_count == 0) {
    MPI_Win_unlock(grp_proc, mreg->window);
    mreg->lock_state = GMR_LOCK_UNLOCKED;
  }
}
/** Create a group of ARMCI mutexes.  Collective on the ARMCI group.
  *
  * @param[in] my_count Number of mutexes on the local process.
  * @param[in] pgroup   ARMCI group on which to create mutexes
  * @return             Handle to the mutex group.
  */
armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int my_count, ARMCI_Group *pgroup) {
  int rank, nproc, max_count, i;
  armcix_mutex_hdl_t hdl;

  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
  ARMCII_Assert(hdl != NULL);

  ARMCIX_Group_dup(pgroup, &hdl->grp);
  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  hdl->my_count = my_count;

  /* Find the max. count to determine how many windows we need. */
  MPI_Allreduce(&my_count, &max_count, 1, MPI_INT, MPI_MAX, hdl->grp.comm);
  ARMCII_Assert_msg(max_count > 0, "Invalid number of mutexes");
  hdl->max_count = max_count;

  hdl->windows = malloc(sizeof(MPI_Win)*max_count);
  ARMCII_Assert(hdl->windows != NULL);  /* was previously unchecked */

  if (my_count > 0) {
    hdl->bases = malloc(sizeof(uint8_t*)*my_count);
    ARMCII_Assert(hdl->bases != NULL);  /* was previously unchecked */
  } else {
    hdl->bases = NULL;
  }

  /* We need multiple windows here: one for each mutex.  Otherwise performance
     will suffer due to exclusive access epochs. */
  for (i = 0; i < max_count; i++) {
    int   size = 0;
    void *base = NULL;

    /* Processes with fewer than max_count mutexes attach a zero-byte window */
    if (i < my_count) {
      MPI_Alloc_mem(nproc, MPI_INFO_NULL, &hdl->bases[i]);
      ARMCII_Assert(hdl->bases[i] != NULL);
      ARMCII_Bzero(hdl->bases[i], nproc);

      base = hdl->bases[i];
      size = nproc;
    }

    MPI_Win_create(base, size, sizeof(uint8_t), MPI_INFO_NULL, hdl->grp.comm, &hdl->windows[i]);
  }

  return hdl;
}
/** One-sided get operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Get(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, target); /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank); else dst_mreg = NULL; ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) { ARMCI_Copy(src, dst, size); } /* Origin buffer is private */ else if (dst_mreg == NULL) { gmr_get(src_mreg, src, dst, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *dst_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); gmr_get(src_mreg, src, dst_buf, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ ARMCI_Copy(dst_buf, dst, size); MPI_Free_mem(dst_buf); } return 0; }
/** One-sided accumulate operation.  Source buffer must be private.
  *
  * @param[in] mreg  Memory region
  * @param[in] src   Source address (local)
  * @param[in] dst   Destination address (remote)
  * @param[in] count Number of elements of the given type to transfer
  * @param[in] type  MPI type of the given buffers
  * @param[in] proc  Absolute process id of the target
  * @return          0 on success, non-zero on failure
  */
int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc) {
  ARMCII_Assert_msg(src != NULL, "Invalid local address");

  /* Delegate to the typed variant with identical count/type on both sides */
  return gmr_accumulate_typed(mreg, src, count, type, dst, count, type, proc);
}
/** One-sided get operation.  Destination buffer must be private.
  *
  * @param[in] mreg Memory region
  * @param[in] src  Source address (remote)
  * @param[in] dst  Destination address (local)
  * @param[in] size Number of bytes to transfer
  * @param[in] proc Absolute process id of target process
  * @return         0 on success, non-zero on failure
  */
int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc) {
  ARMCII_Assert_msg(dst != NULL, "Invalid local address");

  /* Delegate to the typed variant as a raw byte transfer */
  return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc);
}
/** Destroy/free a shared memory region.  Collective on the allocation's group.
  *
  * @param[in] mreg  Memory region to destroy.  May be NULL when this process
  *                  allocated zero bytes; the region is then located
  *                  collectively via a <base, proc> exchange.
  * @param[in] group Group on which to perform the free.
  */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int alloc_me, alloc_nproc;
  int world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it.
   */

  if (mreg == NULL)
    search_proc_in = -1;
  else {
    search_proc_in = world_me;
    search_base = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address (search_base is overwritten on non-root ranks) */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      /* Best-effort recovery: drop the DLA lock so the window can be freed */
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the (doubly-linked) list of mem regions */
  if (mreg->prev == NULL) {
    ARMCII_Assert(gmr_list == mreg);
    gmr_list = mreg->next;

    if (mreg->next != NULL)
      mreg->next->prev = NULL;
  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] buf Original set of buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Pointer to the new buffer or buf */ void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale) { int j, nelem; int type_size = -1; MPI_Datatype type; switch (datatype) { case ARMCI_ACC_INT: MPI_Type_size(MPI_INT, &type_size); type = MPI_INT; nelem= size/type_size; { int *src_i = (int*) buf_in; int *scl_i = (int*) buf_out; const int s = *((int*) scale); for (j = 0; j < nelem; j++) scl_i[j] = src_i[j]*s; } break; case ARMCI_ACC_LNG: MPI_Type_size(MPI_LONG, &type_size); type = MPI_LONG; nelem= size/type_size; { long *src_l = (long*) buf_in; long *scl_l = (long*) buf_out; const long s = *((long*) scale); for (j = 0; j < nelem; j++) scl_l[j] = src_l[j]*s; } break; case ARMCI_ACC_FLT: MPI_Type_size(MPI_FLOAT, &type_size); type = MPI_FLOAT; nelem= size/type_size; { float *src_f = (float*) buf_in; float *scl_f = (float*) buf_out; const float s = *((float*) scale); for (j = 0; j < nelem; j++) scl_f[j] = src_f[j]*s; } break; case ARMCI_ACC_DBL: MPI_Type_size(MPI_DOUBLE, &type_size); type = MPI_DOUBLE; nelem= size/type_size; { double *src_d = (double*) buf_in; double *scl_d = (double*) buf_out; const double s = *((double*) scale); for (j = 0; j < nelem; j++) scl_d[j] = src_d[j]*s; } break; case ARMCI_ACC_CPL: MPI_Type_size(MPI_FLOAT, &type_size); type = MPI_FLOAT; nelem= size/type_size; { float *src_fc = (float*) buf_in; float *scl_fc = (float*) buf_out; const float s_r = ((float*)scale)[0]; const float s_c = ((float*)scale)[1]; for (j = 0; j < 
nelem; j += 2) { // Complex multiplication: (a + bi)*(c + di) const float src_fc_j = src_fc[j]; const float src_fc_j_1 = src_fc[j+1]; /* scl_fc[j] = src_fc[j]*s_r - src_fc[j+1]*s_c; scl_fc[j+1] = src_fc[j+1]*s_r + src_fc[j]*s_c; */ scl_fc[j] = src_fc_j*s_r - src_fc_j_1*s_c; scl_fc[j+1] = src_fc_j_1*s_r + src_fc_j*s_c; } } break; case ARMCI_ACC_DCP: MPI_Type_size(MPI_DOUBLE, &type_size); type = MPI_DOUBLE; nelem= size/type_size; { double *src_dc = (double*) buf_in; double *scl_dc = (double*) buf_out; const double s_r = ((double*)scale)[0]; const double s_c = ((double*)scale)[1]; for (j = 0; j < nelem; j += 2) { // Complex multiplication: (a + bi)*(c + di) const double src_dc_j = src_dc[j]; const double src_dc_j_1 = src_dc[j+1]; /* scl_dc[j] = src_dc[j]*s_r - src_dc[j+1]*s_c; scl_dc[j+1] = src_dc[j+1]*s_r + src_dc[j]*s_c; */ scl_dc[j] = src_dc_j*s_r - src_dc_j_1*s_c; scl_dc[j+1] = src_dc_j_1*s_r + src_dc_j*s_c; } } break; default: ARMCII_Error("unknown data type (%d)", datatype); } ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of the datatype size"); }
/** Blocking operation that accumulates data from the local process into the
  * memory of the remote process.  The data transfer is strided and blocking.
  *
  * @param[in] datatype      Type of data to be transferred.
  * @param[in] scale         Pointer to the value that input data should be scaled by.
  * @param[in] src_ptr       Source starting address of the data block to put.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (destination).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_AccS(int datatype, void *scale,
                void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {
  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* DIRECT method: express both sides as MPI datatypes and issue a single
     * typed accumulate. */
    void *src_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type, mpi_datatype;
    int scaled, mpi_datatype_size;

    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

    /* SCALE: copy and scale if requested */
    if (scaled) {
      armci_giov_t iov;
      int i, nelem;

      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Total element count across all dimensions */
      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
        nelem *= count[i];

      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);

      /* Guard the source while scaling it into the contiguous copy */
      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);

      /* Shoehorn the strided information into an IOV */
      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);

      for (i = 0; i < iov.ptr_array_len; i++)
        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);

      free(iov.src_ptr_array);
      free(iov.dst_ptr_array);

      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);

      /* The staged copy is contiguous */
      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
    }

    /* COPY: Guard shared buffers */
    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, nelem;

        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
          nelem *= count[i];

        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
        ARMCII_Assert(src_buf != NULL);

        gmr_dla_lock(gmr_loc);
        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
        gmr_dla_unlock(gmr_loc);

        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
      }
    }

    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (src_buf == NULL) {
      src_buf = src_ptr;
      ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
    }

    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    /* Sanity check: both sides must describe the same amount of data */
    int src_size, dst_size;

    MPI_Type_size(src_type, &src_size);
    MPI_Type_size(dst_type, &dst_size);

    ARMCII_Assert(src_size == dst_size);

    mreg = gmr_lookup(dst_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    /* COPY/SCALE: Free temp buffer */
    if (src_buf != src_ptr)
      MPI_Free_mem(src_buf);

    err = 0;

  } else {
    /* IOV fallback: decompose the strided transfer into a vector accumulate */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter.  Does not use MPI_BOTTOM
  * on the origin side: local displacements are taken relative to the first
  * local buffer, remote displacements relative to the window base.
  *
  * @param[in] op         Operation (ARMCII_OP_PUT / _GET / _ACC)
  * @param[in] src        Array of source pointers
  * @param[in] dst        Array of destination pointers
  * @param[in] count      Length of the pointer arrays
  * @param[in] elem_count Number of elements of 'type' per transfer
  * @param[in] type       MPI datatype of each element
  * @param[in] proc       Target process
  * @return               Zero on success, error code otherwise
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {

  gmr_t *mreg;
  MPI_Datatype type_loc, type_rem;
  MPI_Aint disp_loc[count];
  int disp_rem[count];
  int block_len[count];
  void *dst_win_base;
  int dst_win_size, i, type_size;
  void **buf_rem, **buf_loc;
  MPI_Aint base_rem;
  MPI_Aint base_loc;
  void *base_loc_ptr;

  switch(op) {
    case ARMCII_OP_ACC:
    case ARMCII_OP_PUT:
      buf_rem = dst;
      buf_loc = src;
      break;
    case ARMCII_OP_GET:
      buf_rem = src;
      buf_loc = dst;
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  MPI_Type_size(type, &type_size);

  mreg = gmr_lookup(buf_rem[0], proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  dst_win_base = mreg->slices[proc].base;
  dst_win_size = mreg->slices[proc].size;

  MPI_Get_address(dst_win_base, &base_rem);

  /* Pick a base address for the start of the origin's datatype */
  base_loc_ptr = buf_loc[0];
  MPI_Get_address(base_loc_ptr, &base_loc);

  for (i = 0; i < count; i++) {
    MPI_Aint target_rem, target_loc;
    MPI_Get_address(buf_loc[i], &target_loc);
    MPI_Get_address(buf_rem[i], &target_rem);
    disp_loc[i]  = target_loc - base_loc;              /* bytes */
    disp_rem[i]  = (target_rem - base_rem)/type_size;  /* elements */
    block_len[i] = elem_count;                         /* elements */

    ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0, "Transfer size is not a multiple of type size");
    /* BUGFIX: disp_rem is in elements while dst_win_size is in bytes; scale by
     * type_size so the range check is exact rather than merely conservative. */
    ARMCII_Assert_msg(disp_rem[i] >= 0 && (MPI_Aint)disp_rem[i]*type_size < dst_win_size, "Invalid remote pointer");
    /* BUGFIX: block_len is an element count; convert to bytes before doing
     * byte-pointer arithmetic in the bounds check. */
    ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + (size_t)block_len[i]*type_size <= ((uint8_t*)dst_win_base) + dst_win_size, "Transfer exceeds buffer length");
  }

  /* Origin side: byte displacements; target side: element-indexed blocks */
  MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
  MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
  //MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem);

  MPI_Type_commit(&type_loc);
  MPI_Type_commit(&type_rem);

  gmr_lock(mreg, proc);

  switch(op) {
    case ARMCII_OP_ACC:
      gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_PUT:
      gmr_put_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_GET:
      gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, base_loc_ptr, 1, type_loc, proc);
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  gmr_unlock(mreg, proc);

  MPI_Type_free(&type_loc);
  MPI_Type_free(&type_rem);

  return 0;
}
/** Blocking operation that transfers data from the remote process to the * memory of the calling process. The data transfer is strided and blocking. * * @param[in] src_ptr Source starting address of the data block to put. * @param[in] src_stride_arr Source array of stride distances in bytes. * @param[in] dst_ptr Destination starting address to put data. * @param[in] dst_stride_ar Destination array of stride distances in bytes. * @param[in] count Block size in each dimension. count[0] should be the * number of bytes of contiguous data in leading dimension. * @param[in] stride_levels The level of strides. * @param[in] proc Remote process ID (destination). * * @return Zero on success, error code otherwise. */ int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], void *dst_ptr, int dst_stride_ar[/*stride_levels*/], int count[/*stride_levels+1*/], int stride_levels, int proc) { int err; if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) { void *dst_buf = NULL; gmr_t *mreg, *gmr_loc = NULL; MPI_Datatype src_type, dst_type; /* COPY: Guard shared buffers */ if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) { gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank); if (gmr_loc != NULL) { int i, size; for (i = 1, size = count[0]; i < stride_levels+1; i++) size *= count[i]; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); MPI_Type_contiguous(size, MPI_BYTE, &dst_type); } } /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source * buffer is going to be used directly. 
*/ if (dst_buf == NULL) { dst_buf = dst_ptr; ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type); } ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type); MPI_Type_commit(&src_type); MPI_Type_commit(&dst_type); mreg = gmr_lookup(src_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); gmr_lock(mreg, proc); gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc); gmr_unlock(mreg, proc); /* COPY: Finish the transfer */ if (dst_buf != dst_ptr) { gmr_dla_lock(gmr_loc); armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf); gmr_dla_unlock(gmr_loc); MPI_Free_mem(dst_buf); } MPI_Type_free(&src_type); MPI_Type_free(&dst_type); err = 0; } else { armci_giov_t iov; ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels); err = PARMCI_GetV(&iov, 1, proc); free(iov.src_ptr_array); free(iov.dst_ptr_array); } return err; }
/** Perform atomic read-modify-write on the given integer or long location and * return the location's original value. * * \note ARMCI RMW operations are atomic with respect to other RMW operations, * but not with respect to other one-sided operations (get, put, acc, etc). * * @param[in] op Operation to be performed: * ARMCI_FETCH_AND_ADD (int) * ARMCI_FETCH_AND_ADD_LONG * ARMCI_SWAP (int) * ARMCI_SWAP_LONG * @param[out] ploc Location to store the original value. * @param[in] prem Location on which to perform atomic operation. * @param[in] value Value to add to remote location (ignored for swap). * @param[in] proc Process rank for the target buffer. */ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { int is_swap = 0, is_long = 0; MPI_Datatype type; MPI_Op rop; gmr_t *src_mreg, *dst_mreg; /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) src_mreg = gmr_lookup(ploc, ARMCI_GROUP_WORLD.rank); else src_mreg = NULL; dst_mreg = gmr_lookup(prem, proc); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG) { is_long = 1; type = MPI_LONG; } else type = MPI_INT; if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) { is_swap = 1; rop = MPI_REPLACE; } else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) rop = MPI_SUM; else ARMCII_Error("invalid operation (%d)", op); /* We hold the DLA lock if (src_mreg != NULL). */ if (is_swap) { long out_val_l, src_val_l = *((long*)ploc); int out_val_i, src_val_i = *((int*)ploc); gmr_fetch_and_op(dst_mreg, is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */, is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */, prem /* dst */, type, rop, proc); gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. 
flush, local=remote */ if (is_long) *(long*) ploc = out_val_l; else *(int*) ploc = out_val_i; } else /* fetch-and-add */ { long fetch_val_l, add_val_l = value; int fetch_val_i, add_val_i = value; gmr_fetch_and_op(dst_mreg, is_long ? (void*) &add_val_l : (void*) &add_val_i /* src */, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i /* out */, prem /* dst */, type, rop, proc); gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */ if (is_long) *(long*) ploc = fetch_val_l; else *(int*) ploc = fetch_val_i; } return 0; }