Example #1
0
static int _ZMPI_Alltoall_int_proclists_put(int alloc_mem, int nphases, int *sendbuf, int nsprocs, int *sprocs, int *recvbuf, int nrprocs, int *rprocs, MPI_Comm comm)
{
  int i, p, size, rank, *rcounts_put;

  MPI_Win win;


  MPI_Comm_size(comm, &size);
  MPI_Comm_rank(comm, &rank);

  if (alloc_mem) MPI_Alloc_mem(size * sizeof(int), MPI_INFO_NULL, &rcounts_put);
  else rcounts_put = recvbuf;

  if (nrprocs >= 0)
    for (i = 0; i < nrprocs; ++i) rcounts_put[rprocs[i]] = DEFAULT_INT;
  else
    for (i = 0; i < size; ++i) rcounts_put[i] = DEFAULT_INT;

  MPI_Win_create(rcounts_put, size * sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);
  MPI_Win_fence(MPI_MODE_NOSTORE|MPI_MODE_NOPRECEDE, win);

  for (p = 0; p < nphases; ++p)
  {
/*    printf("%d: phase = %d of %d\n", rank, p, nphases);*/
  
    if (rank % nphases == p)
    {
      if (nsprocs >= 0)
      {
        for (i = 0; i < nsprocs; ++i)
          if (sendbuf[sprocs[i]] != DEFAULT_INT) MPI_Put(&sendbuf[sprocs[i]], 1, MPI_INT, sprocs[i], rank, 1, MPI_INT, win);

      } else
      {
        for (i = 0; i < size; ++i)
          if (sendbuf[i] != DEFAULT_INT) MPI_Put(&sendbuf[i], 1, MPI_INT, i, rank, 1, MPI_INT, win);
      }
    }

    if (p < nphases - 1) MPI_Win_fence(0, win);
  }

  MPI_Win_fence(MPI_MODE_NOPUT|MPI_MODE_NOSUCCEED, win);
  MPI_Win_free(&win);

  if (alloc_mem)
  {
    if (nrprocs >= 0)
      for (i = 0; i < nrprocs; ++i) recvbuf[rprocs[i]] = rcounts_put[rprocs[i]];
    else
      for (i = 0; i < size; ++i) recvbuf[i] = rcounts_put[i];

    MPI_Free_mem(rcounts_put);    
  }

  return MPI_SUCCESS;
}
Example #2
0
/*Run PUT with Fence */
void run_put_with_fence(int rank, WINDOW type)
{
    double t; 
    int size, i, j;
    MPI_Aint disp = 0;
    MPI_Win     win;

    int window_size = WINDOW_SIZE_LARGE;
    for (size = 1; size <= MAX_SIZE; size = size * 2) {
        allocate_memory(rank, sbuf_original, rbuf_original, &sbuf, &rbuf, &rbuf, size*window_size, type, &win);

#if MPI_VERSION >= 3
        if (type == WIN_DYNAMIC) {
            disp = disp_remote;
        }
#endif

        if(size > LARGE_MESSAGE_SIZE) {
            loop = LOOP_LARGE;
            skip = SKIP_LARGE;
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        if(rank == 0) {
            for (i = 0; i < skip + loop; i++) {
                if (i == skip) {
                    t_start = MPI_Wtime ();
                }
                MPI_CHECK(MPI_Win_fence(0, win));
                for(j = 0; j < window_size; j++) {
                    MPI_CHECK(MPI_Put(sbuf+(j*size), size, MPI_CHAR, 1, disp + (j * size), size, MPI_CHAR,
                            win));
                }
                MPI_CHECK(MPI_Win_fence(0, win));
            }
            t_end = MPI_Wtime ();
            t = t_end - t_start;
        } else {
            for (i = 0; i < skip + loop; i++) {
                MPI_CHECK(MPI_Win_fence(0, win));
                for(j = 0; j < window_size; j++) {
                    MPI_CHECK(MPI_Put(sbuf+(j*size), size, MPI_CHAR, 0, disp + (j * size), size, MPI_CHAR,
                            win));
                }
                MPI_CHECK(MPI_Win_fence(0, win));
            }
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        print_bibw(rank, size, t);

        free_memory (sbuf, rbuf, win, rank);
    }
}
Example #3
0
int do_test(int origin_count, MPI_Datatype origin_type, int result_count,
            MPI_Datatype result_type, int target_count, MPI_Datatype target_type)
{
    int errs = 0, ret, origin_type_size, result_type_size;

    ret = MPI_Put(origin_buf, origin_count, origin_type, 1, 0, target_count, target_type, win);
    if (ret)
        errs++;

    ret = MPI_Get(origin_buf, origin_count, origin_type, 1, 0, target_count, target_type, win);
    if (ret)
        errs++;

    ret = MPI_Accumulate(origin_buf, origin_count, origin_type, 1, 0, target_count,
                         target_type, MPI_SUM, win);
    if (ret)
        errs++;

    ret = MPI_Get_accumulate(origin_buf, origin_count, origin_type, result_buf, result_count,
                             result_type, 1, 0, target_count, target_type, MPI_SUM, win);
    if (ret)
        errs++;

    MPI_Type_size(origin_type, &origin_type_size);
    MPI_Type_size(result_type, &result_type_size);

    if (origin_count == 0 || origin_type_size == 0) {
        ret = MPI_Put(NULL, origin_count, origin_type, 1, 0, target_count, target_type, win);
        if (ret)
            errs++;

        ret = MPI_Get(NULL, origin_count, origin_type, 1, 0, target_count, target_type, win);
        if (ret)
            errs++;

        ret = MPI_Accumulate(NULL, origin_count, origin_type, 1, 0, target_count, target_type,
                             MPI_SUM, win);
        if (ret)
            errs++;

        ret = MPI_Get_accumulate(NULL, origin_count, origin_type, result_buf, result_count,
                                 result_type, 1, 0, target_count, target_type, MPI_SUM, win);
        if (ret)
            errs++;

        if (result_count == 0 || result_type_size == 0) {
            ret = MPI_Get_accumulate(NULL, origin_count, origin_type, NULL, result_count,
                                     result_type, 1, 0, target_count, target_type, MPI_SUM, win);
            if (ret)
                errs++;
        }
    }

    return errs;
}
Example #4
0
dart_ret_t dart_put(
  dart_gptr_t  gptr,
  const void * src,
  size_t       nbytes)
{
  MPI_Aint    disp_s,
              disp_rel;
  MPI_Win     win;
  dart_unit_t target_unitid_abs;
  uint64_t offset   = gptr.addr_or_offs.offset;
  int16_t  seg_id   = gptr.segid;
  target_unitid_abs = gptr.unitid;
  if (seg_id) {
    uint16_t index = gptr.flags;
    dart_unit_t target_unitid_rel;
    win = dart_win_lists[index];
    unit_g2l (index, target_unitid_abs, &target_unitid_rel);
    if (dart_adapt_transtable_get_disp(
          seg_id,
          target_unitid_rel,
          &disp_s) == -1) {
      return DART_ERR_INVAL;
    }
    disp_rel = disp_s + offset;
    MPI_Put(
      src,
      nbytes,
      MPI_BYTE,
      target_unitid_rel,
      disp_rel,
      nbytes,
      MPI_BYTE,
      win);
    DART_LOG_DEBUG("dart_put: nbytes:%zu (from collective allocation) "
                   "target unit: %d offset: %"PRIu64"",
                   nbytes, target_unitid_abs, offset);
  } else {
    win = dart_win_local_alloc;
    MPI_Put(
      src,
      nbytes,
      MPI_BYTE,
      target_unitid_abs,
      offset,
      nbytes,
      MPI_BYTE,
      win);
    DART_LOG_DEBUG("dart_put: nbytes:%zu (from local allocation) "
                   "target unit: %d offset: %"PRIu64"",
                   nbytes, target_unitid_abs, offset);
  }
  return DART_OK;
}
Example #5
0
int main(int argc, char **argv){
  int i, me, target;
  unsigned int size;
  double t, t_max;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Win_create(&send_buf, sizeof(char)*MAX_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);
  
  target = 1 - me;
  MPI_Win_lock_all(0, win);
  
  init_buf(send_buf, me);

  if(me==0) print_items();

  for(size=1;size<MAX_SIZE+1;size*=2){
    MPI_Barrier(MPI_COMM_WORLD);
    for(i=0;i<LOOP+WARMUP;i++){
      if(WARMUP == i)
	t = wtime();

      if(me == 0){
	MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
	MPI_Win_flush_local(target, win);

	while(send_buf[0] == '0' || send_buf[size-1] == '0'){ MPI_Win_flush(me, win); }
	send_buf[0] = '0'; send_buf[size-1] = '0';
      }
      else {
	while(send_buf[0] == '1' || send_buf[size-1] == '1'){ MPI_Win_flush(me, win); }
	send_buf[0] = '1'; send_buf[size-1] = '1';

	MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
	MPI_Win_flush_local(target, win);
      }
    } //end of LOOP

    t = wtime() - t;
    MPI_Reduce(&t, &t_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if(me == 0)
      print_results(size, t_max);
  }

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}
Example #6
0
void exchange(field * temperature, parallel_data * parallel)
{

    MPI_Win_fence(0, temperature->rma_window);
    // Put upwards
    MPI_Put(temperature->data[1], temperature->ny + 2, MPI_DOUBLE,
            parallel->nup, (temperature->ny + 2) * (temperature->nx + 1),
            temperature->ny + 2, MPI_DOUBLE, temperature->rma_window);
    // Put downwards
    MPI_Put(temperature->data[temperature->nx], temperature->ny + 2,
            MPI_DOUBLE, parallel->ndown, 0, temperature->ny + 2,
            MPI_DOUBLE, temperature->rma_window);
    MPI_Win_fence(0, temperature->rma_window);
}
Example #7
0
double message_rate (long * buffer, int size, int iterations, int me, int pairs, int nxtpe, MPI_Win win)
{
	int64_t begin, end; 
	int i, offset;

	/*
	 * Touch memory
	 */
	memset(buffer, size, MAX_MSG_SZ * ITERS_LARGE * sizeof(long));

	MPI_Barrier(MPI_COMM_WORLD);
	
	if (me < pairs) {
		begin = TIME();

		for (i = 0, offset = 0; i < iterations; i++, offset++) {
			MPI_Put ((buffer + offset*size), size, MPI_LONG, nxtpe, offset*size, size, MPI_LONG, win);
			//MPI_Win_flush_local (nxtpe, win);
		}
		//MPI_Win_flush_all(win);
		MPI_Win_flush(nxtpe, win);
		end = TIME();

		return ((double)iterations * 1e6) / ((double)end - (double)begin);
	}
	return 0;
}
Example #8
0
int main(int argc, char *argv[])
{
    int rank, nprocs, A[SIZE2], B[SIZE2], i;
    MPI_Win win;
    int errs = 0;

    MTest_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    if (nprocs != 2) {
        printf("Run this program with 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD,1);
    }

    if (rank == 0) {
        for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
        MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);

        for (i=0; i<SIZE1; i++) {
            MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
            MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
            MPI_Win_unlock(1, win);
        }

        for (i=0; i<SIZE1; i++) {
            MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
            MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
            MPI_Win_unlock(1, win);
        }

        MPI_Win_free(&win);

        for (i=0; i<SIZE1; i++)
            if (B[i] != (-4)*(i+SIZE1)) {
                printf("Get Error: B[%d] is %d, should be %d\n", i, B[i], (-4)*(i+SIZE1));
                errs++;
            }
    }

    else {  /* rank=1 */
        for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
        MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL,
                       MPI_COMM_WORLD, &win);

        MPI_Win_free(&win);

        for (i=0; i<SIZE1; i++) {
            if (B[i] != i) {
                printf("Put Error: B[%d] is %d, should be %d\n", i, B[i], i);
                errs++;
            }
        }
    }

    /*    if (rank==0) printf("Done\n");*/
    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
void MPIMutex::lock(int proc) {
	int rank, nproc, already_locked;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nproc);
	//std::cout << "trying to get lock" << std::endl;
	MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, win);
	byte *buff = (byte*)malloc(sizeof(byte)*nproc);
	buff[rank] = 1;
  MPI_Put(&(buff[rank]), 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, win);
  /* Get data to the left of rank */
  if (rank > 0) {
      MPI_Get(buff, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, win);
  }
  /* Get data to the right of rank */
  if (rank < nproc - 1) {
      MPI_Get(&(buff[rank+1]), nproc-1-rank, MPI_BYTE, proc, rank+1, nproc-1-rank,
              MPI_BYTE, win);
  }
  MPI_Win_unlock(proc, win);
	/* check if anyone has the lock*/
  for (int i = already_locked = 0; i < nproc; i++)
    if (buff[i] && i != rank)
        already_locked = 1;

  /* Wait for notification */
  if (already_locked) {
      MPI_Status status;
      //std::cout << "waiting for notification [proc = "<<proc<<"]" << std::endl;
      MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, MPI_MUTEX_TAG+id, comm, &status);
  }
	
	//std::cout << "lock acquired [proc = "<<proc<<"]" << std::endl;
  free(buff);
};
Example #10
0
bool MPIMutex::try_lock(int proc) {
	int rank, nproc, already_locked;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nproc);

	MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, win);
	byte *buff = (byte*)malloc(sizeof(byte)*nproc);
	buff[rank] = 1;
  MPI_Put(&(buff[rank]), 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, win);
  /* Get data to the left of rank */
  if (rank > 0) {
      MPI_Get(buff, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, win);
  }
  /* Get data to the right of rank */
  if (rank < nproc - 1) {
      MPI_Get(&(buff[rank+1]), nproc-1-rank, MPI_BYTE, proc, rank+1, nproc-1-rank,
              MPI_BYTE, win);
  }
  MPI_Win_unlock(proc, win);
	/* check if anyone has the lock*/
	already_locked = 1; //1 means succesfully got the lock
	for (int i = 0; i < nproc; i++)
    if (buff[i] && i != rank)
        already_locked = 0;

	free(buff);
	return already_locked;	
};
Example #11
0
void DO_OP_LOOP(int dst, int iter)
{
    int i, x;

    switch (OP_TYPE) {
    case OP_ACC:
        for (x = 0; x < iter; x++) {
            for (i = 0; i < NOP; i++)
                MPI_Accumulate(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE,
                               MPI_SUM, win);
            MPI_Win_flush(dst, win);
        }
        break;
    case OP_PUT:
        for (x = 0; x < iter; x++) {
            for (i = 0; i < NOP; i++)
                MPI_Put(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE, win);
            MPI_Win_flush(dst, win);
        }
        break;
    case OP_GET:
        for (x = 0; x < iter; x++) {
            for (i = 0; i < NOP; i++)
                MPI_Get(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE, win);
            MPI_Win_flush(dst, win);
        }
        break;
    }
}
Example #12
0
/** One-sided put operation with type arguments.  Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] src_type  MPI datatype of the destination elements
  * @param[in] size      Number of bytes to transfer
  * @param[in] proc      Absolute process id of target process
  * @return              0 on success, non-zero on failure
  */
int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {

  int        grp_proc;
  gmr_size_t disp;
  MPI_Aint lb, extent;

  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM) 
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  MPI_Put(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, mreg->window);

  return 0;
}
Example #13
0
int main(int argc, char *argv[])
{
    int errs = 0, err;
    int rank, size;
    int *buf, bufsize;
    int *result;
    int *rmabuf, rsize, rcount;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Request req;
    MPI_Datatype derived_dtp;

    MTest_Init(&argc, &argv);

    bufsize = 256 * sizeof(int);
    buf = (int *) malloc(bufsize);
    if (!buf) {
        fprintf(stderr, "Unable to allocated %d bytes\n", bufsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    result = (int *) malloc(bufsize);
    if (!result) {
        fprintf(stderr, "Unable to allocated %d bytes\n", bufsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    rcount = 16;
    rsize = rcount * sizeof(int);
    rmabuf = (int *) malloc(rsize);
    if (!rmabuf) {
        fprintf(stderr, "Unable to allocated %d bytes\n", rsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Type_contiguous(2, MPI_INT, &derived_dtp);
    MPI_Type_commit(&derived_dtp);

    /* The following loop is used to run through a series of communicators
     * that are subsets of MPI_COMM_WORLD, of size 1 or greater. */
    while (MTestGetIntracommGeneral(&comm, 1, 1)) {
        int count = 0;

        if (comm == MPI_COMM_NULL)
            continue;
        /* Determine the sender and receiver */
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        MPI_Win_create(buf, bufsize, 2 * sizeof(int), MPI_INFO_NULL, comm, &win);
        /* To improve reporting of problems about operations, we
         * change the error handler to errors return */
        MPI_Win_set_errhandler(win, MPI_ERRORS_RETURN);

        /** TEST OPERATIONS USING ACTIVE TARGET (FENCE) SYNCHRONIZATION **/
        MPI_Win_fence(0, win);

        TEST_FENCE_OP("Put", MPI_Put(rmabuf, count, MPI_INT, TARGET, 0, count, MPI_INT, win);
);

        TEST_FENCE_OP("Get", MPI_Get(rmabuf, count, MPI_INT, TARGET, 0, count, MPI_INT, win);
);
Example #14
0
void benchmark (long * msg_buffer, int me, int pairs, int nxtpe, MPI_Win win)
{
	static double mr, mr_sum;
	int iters;
	
	if (msg_buffer == NULL) {
		printf("Input buffer is NULL, no reason to proceed\n");
		exit(-1);
	}
	/*
	 * Warmup
	 */
	if (me < pairs) {
		for (int i = 0; i < ITERS_LARGE; i += 1) {
			MPI_Put ((msg_buffer + i*MAX_MSG_SZ), MAX_MSG_SZ, MPI_LONG, nxtpe, i*MAX_MSG_SZ, MAX_MSG_SZ, MPI_LONG, win);
			MPI_Win_flush_local (nxtpe, win);
		}
	}

	MPI_Win_flush_all(win);
	MPI_Barrier(MPI_COMM_WORLD);
	/*
	 * Benchmark
	 */
	for (long size = 1; size <= MAX_MSG_SZ; size <<= 1) {
        	iters = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;
		mr = message_rate(msg_buffer, size, iters, me, pairs, nxtpe, win);
		MPI_Reduce(&mr, &mr_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
		print_message_rate(size, mr_sum, me);
	}
}
static void _mpi_scalar_mput(const int target_rank, 
			     const _XMP_coarray_t *dst_desc, const void *src,
			     const size_t dst_offset, const size_t src_offset,
			     const int dst_dims, 
			     const _XMP_array_section_t *dst_info,
			     const bool is_dst_on_acc)
{
  int allelmt_dim = _XMP_get_dim_of_allelmts(dst_dims, dst_info);
  size_t element_size = dst_desc->elmt_size;
  size_t allelmt_size = (allelmt_dim == dst_dims)? element_size : dst_info[allelmt_dim].distance * dst_info[allelmt_dim].elmts;
  char *laddr = (allelmt_dim == dst_dims)? ((char*)src + src_offset) : _XMP_alloc(allelmt_size);
  char *raddr = get_remote_addr(dst_desc, target_rank, is_dst_on_acc) + dst_offset;
  MPI_Win win = get_window(dst_desc, is_dst_on_acc);

  XACC_DEBUG("scalar_mput(src_p=%p, size=%zd, target=%d, dst_p=%p, is_acc=%d)", laddr, element_size, target_rank, raddr, is_dst_on_acc);

  XACC_DEBUG("allelmt_dim=%d, dst_dims=%d", allelmt_dim, dst_dims);
  if(allelmt_dim != dst_dims){
    //mcopy
    _XMP_array_section_t info;
    info.start = 0;
    info.length = allelmt_size/element_size;
    info.stride = 1;
    info.elmts = info.length;
    info.distance = element_size;
    _XMP_stride_memcpy_1dim(laddr, (char*)src+src_offset, &info, element_size, _XMP_SCALAR_MCOPY);
    XACC_DEBUG("mcopy(%lld, %lld, %lld), %lld",info.start, info.length, info.stride, info.elmts);
  }
  long long idxs[allelmt_dim+1];
  for(int i = 0; i < allelmt_dim+1; i++) idxs[i]=0;

  while(1){
    size_t offset = 0;
    for(int i = 0; i < allelmt_dim; i++){
      offset += dst_info[i].distance * idxs[i+1] * dst_info[i].stride;
    }

    MPI_Put((void*)laddr, allelmt_size, MPI_BYTE, target_rank,
	    (MPI_Aint)(raddr+offset), allelmt_size, MPI_BYTE,
	    win);

    ++idxs[allelmt_dim];
    for(int i = allelmt_dim-1; i >= 0; i--){
      long long length = dst_info[i].length;
      if(idxs[i+1] >= length){
	idxs[i+1] -= length;
	++idxs[i];
      }else{
	break;
      }
    }
    if(idxs[0] > 0){
      break;
    }
  }
  _wait_puts(target_rank, win);
  if(allelmt_dim != dst_dims){
    _XMP_free(laddr);
  }
}
void SweptDiscretization2D::updateRemoteConstants(unsigned char *buffer)
{
	void *sendingBuffer = NULL;
	FILE *inFile = NULL;

	if(pg.rank == 0)
	{
		int bufferSize = this->remoteConstantsCount * n * n * pg.mpiSize * sizeof(double);
		MPI_Alloc_mem(bufferSize, MPI_INFO_NULL, &sendingBuffer);
		
		for(int r=0;r<pg.mpiSize;r++)
		{
			double *processing = (double*)sendingBuffer + (this->remoteConstantsCount * n * n * r);
			int jIndex = (r % (pg.xNodes*pg.yNodes)) / pg.xNodes;
			int iIndex = r % pg.xNodes;
			for(int j=0;j<n;j++)
			{
				for(int i=0;i<n;i++)
				{
					int iGlobal = n*iIndex + (i);
					int jGlobal = n*jIndex + (j);
					int index   = this->ijToConstantIndex(i,j);
					int globalIndex = this->remoteConstantsCount * (iGlobal + jGlobal * n * pg.xNodes);
					for(int k=0;k<this->remoteConstantsCount;k++)
					{
						processing[index + k] = ((double*)buffer)[k + globalIndex];
					}					
				}
			}		
		}
	}
	
	MPI_Win_fence(MPI_MODE_NOPRECEDE, this->constantsWindow);
	if(pg.rank == 0)
	{
		for(int r=0;r<pg.mpiSize;r++)
		{
			MPI_Put((unsigned char*)sendingBuffer + (r * remoteConstantsCount * n * n * sizeof(double)), remoteConstantsCount * n * n * sizeof(double), MPI_BYTE, r, 0, remoteConstantsCount * n * n * sizeof(double), MPI_BYTE, constantsWindow);			
		}
	}
	MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), this->constantsWindow);

	if(pg.rank == 0)
	{
		MPI_Free_mem(sendingBuffer);					
	}
	for(int i=1;i<n+1;i++)
	{
		for(int j=1;j<n+1;j++)
		{
			for(int k=0;k<this->remoteConstantsCount;k++)
			{
				int windowIndex    = this->ijToConstantIndex(i-1,j-1);
				int foundationIndex = this->ijToIndex(i,j);
				this->foundation[foundationIndex + k] = this->remoteConstants[windowIndex + k];
			}
		}
	}
}
Example #17
0
void add_scatter_constant_request(scatter_constant * sc, int remote_rank,
				  size_t remote_idx, size_t req_id)
{
	assert(sc->valid);
#pragma omp critical
	MPI_Put(sc->constant, 1, sc->datatype, remote_rank, remote_idx, 1,
		sc->datatype, sc->win);
}
void add_scatter_request(scatter* sc, const char* local_data, int remote_rank, size_t remote_idx, size_t req_id) {
  assert (sc->valid);
  assert (sc->request_count < sc->nrequests_max);
  memcpy(sc->send_data + sc->request_count * sc->elt_size, local_data, sc->elt_size);
#pragma omp critical
  MPI_Put(sc->send_data + sc->request_count * sc->elt_size, 1, sc->datatype, remote_rank, remote_idx, 1, sc->datatype, sc->win);
  ++sc->request_count;
}
Example #19
0
int main(int argc, char *argv[])
{
    int errs = 0, err;
    int rank, size;
    int *buf, bufsize;
    int *result;
    int *rmabuf, rsize, rcount;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Request req;

    MTest_Init(&argc, &argv);

    bufsize = 256 * sizeof(int);
    buf = (int *) malloc(bufsize);
    if (!buf) {
        fprintf(stderr, "Unable to allocated %d bytes\n", bufsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    result = (int *) malloc(bufsize);
    if (!result) {
        fprintf(stderr, "Unable to allocated %d bytes\n", bufsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    rcount = 16;
    rsize = rcount * sizeof(int);
    rmabuf = (int *) malloc(rsize);
    if (!rmabuf) {
        fprintf(stderr, "Unable to allocated %d bytes\n", rsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* The following illustrates the use of the routines to
     * run through a selection of communicators and datatypes.
     * Use subsets of these for tests that do not involve combinations
     * of communicators, datatypes, and counts of datatypes */
    while (MTestGetIntracommGeneral(&comm, 1, 1)) {
        if (comm == MPI_COMM_NULL)
            continue;
        /* Determine the sender and receiver */
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        MPI_Win_create(buf, bufsize, sizeof(int), MPI_INFO_NULL, comm, &win);
        /* To improve reporting of problems about operations, we
         * change the error handler to errors return */
        MPI_Win_set_errhandler(win, MPI_ERRORS_RETURN);

        /** TEST OPERATIONS USING ACTIVE TARGET (FENCE) SYNCHRONIZATION **/
        MPI_Win_fence(0, win);

        TEST_FENCE_OP("Put",
                      MPI_Put(rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount, MPI_INT, win);
);

        TEST_FENCE_OP("Get",
                      MPI_Get(rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount, MPI_INT, win);
);
Example #20
0
void Master::UpdateCriticalAttribute(Attribute attr, AgentId agent_id, AgentType agent_type, void* location) {
	AgentType type = GlobalToLocalType(agent_id);
	auto p = std::make_pair(type, attr);
	size_t target_disp = critical_agents_offsets_.at(agent_id) + critical_attributes_offsets_.at(p);
	MPI_Datatype attribute_type = attributes_MPI_types_.at(p);
	for (MasterId id=0; id<nb_masters_; id++) {
		MPI_Put(location, 1, attribute_type, id, target_disp, 1, attribute_type, critical_window_);
	}
}
Example #21
0
/** Lock a mutex.
  *
  * @param[in] hdl   Mutex group that the mutex belongs to
  * @param[in] mutex Desired mutex number [0..count-1]
  * @param[in] proc  Rank of process where the mutex lives
  * @return          MPI status
  */
int MPIX_Mutex_lock(MPIX_Mutex hdl, int mutex, int proc)
{
    int rank, nproc, already_locked, i;
    uint8_t *buf;

    assert(mutex >= 0 && mutex < hdl->max_count);

    MPI_Comm_rank(hdl->comm, &rank);
    MPI_Comm_size(hdl->comm, &nproc);

    assert(proc >= 0 && proc < nproc);

    buf = malloc(nproc * sizeof(uint8_t));
    assert(buf != NULL);

    buf[rank] = 1;

    /* Get all data from the lock_buf, except the byte belonging to
     * me. Set the byte belonging to me to 1. */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);

    MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

    /* Get data to the left of rank */
    if (rank > 0) {
        MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
    }

    /* Get data to the right of rank */
    if (rank < nproc - 1) {
        MPI_Get(&buf[rank + 1], nproc - 1 - rank, MPI_BYTE, proc, rank + 1, nproc - 1 - rank,
                MPI_BYTE, hdl->windows[mutex]);
    }

    MPI_Win_unlock(proc, hdl->windows[mutex]);

    assert(buf[rank] == 1);

    for (i = already_locked = 0; i < nproc; i++)
        if (buf[i] && i != rank)
            already_locked = 1;

    /* Wait for notification */
    if (already_locked) {
        MPI_Status status;
        debug_print("waiting for notification [proc = %d, mutex = %d]\n", proc, mutex);
        MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, MPIX_MUTEX_TAG + mutex, hdl->comm, &status);
    }

    debug_print("lock acquired [proc = %d, mutex = %d]\n", proc, mutex);
    free(buf);

    return MPI_SUCCESS;
}
Example #22
0
int main(int argc, char *argv[])
{
    int rank, nprocs, A[SIZE], B[SIZE], i;
    MPI_Comm CommDeuce;
    MPI_Win win;
    int errs = 0;

    MTest_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (nprocs < 2) {
        printf("Run this program with 2 or more processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);

    if (rank < 2) {
        if (rank == 0) {
            for (i = 0; i < SIZE; i++)
                A[i] = B[i] = i;
        }
        else {
            for (i = 0; i < SIZE; i++) {
                A[i] = (-3) * i;
                B[i] = (-4) * i;
            }
        }

        MPI_Win_create(B, SIZE * sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);

        MPI_Win_fence(0, win);

        if (rank == 0) {
            for (i = 0; i < SIZE - 1; i++)
                MPI_Put(A + i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
        }
        else {
            for (i = 0; i < SIZE - 1; i++)
                MPI_Get(A + i, 1, MPI_INT, 0, i, 1, MPI_INT, win);

            MPI_Accumulate(A + i, 1, MPI_INT, 0, i, 1, MPI_INT, MPI_SUM, win);
        }
        MPI_Win_fence(0, win);

        if (rank == 1) {
            for (i = 0; i < SIZE - 1; i++) {
                if (A[i] != B[i]) {
                    SQUELCH(printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]););
                    errs++;
                }
            }
        }
Example #23
0
int MPIX_Put_x(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
               int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win)
{
    int rc = MPI_SUCCESS;

    if (likely (origin_count <= bigmpi_int_max && target_count <= bigmpi_int_max)) {
        rc = MPI_Put(origin_addr, origin_count, origin_datatype,
                     target_rank, target_disp, target_count, target_datatype, win);
    } else {
        MPI_Datatype neworigin_datatype, newtarget_datatype;
        MPIX_Type_contiguous_x(origin_count, origin_datatype, &neworigin_datatype);
        MPIX_Type_contiguous_x(target_count, target_datatype, &newtarget_datatype);
        MPI_Type_commit(&neworigin_datatype);
        MPI_Type_commit(&newtarget_datatype);
        rc = MPI_Put(origin_addr, 1, neworigin_datatype,
                     target_rank, target_disp, 1, newtarget_datatype, win);
        MPI_Type_free(&neworigin_datatype);
        MPI_Type_free(&newtarget_datatype);
    }
    return rc;
}
Example #24
0
int main(int argc, char *argv[]) 
{ 
    int rank, destrank, nprocs, *A, *B, i;
    MPI_Comm CommDeuce;
    MPI_Group comm_group, group;
    MPI_Win win;
    int errs = 0;

    MTest_Init(&argc,&argv); 
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 

    if (nprocs < 2) {
        printf("Run this program with 2 or more processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);

    if (rank < 2)
    {

        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &A);
        if (i) {
            printf("Can't allocate memory in test program\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &B);
        if (i) {
            printf("Can't allocate memory in test program\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }

        MPI_Comm_group(CommDeuce, &comm_group);

        if (rank == 0) {
            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
            destrank = 1;
            MPI_Group_incl(comm_group, 1, &destrank, &group);
            MPI_Win_start(group, 0, win);
            for (i=0; i<SIZE1; i++)
                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
            for (i=0; i<SIZE1; i++)
                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);

            MPI_Win_complete(win);

            for (i=0; i<SIZE1; i++)
                if (B[i] != (-4)*(i+SIZE1)) {
                    SQUELCH( printf("Get Error: B[i] is %d, should be %d\n", B[i], (-4)*(i+SIZE1)); );
                    errs++;
                }
Example #25
0
/** Unlock a mutex.
  *
  * @param[in] hdl   Mutex group that the mutex belongs to.
  * @param[in] mutex Desired mutex number [0..count-1]
  * @param[in] proc  Rank of process where the mutex lives
  * @return          MPI status
  */
int MPIX_Mutex_unlock(MPIX_Mutex hdl, int mutex, int proc)
{
    int rank, nproc, i;
    uint8_t *buf;

    assert(mutex >= 0 && mutex < hdl->max_count);

    MPI_Comm_rank(hdl->comm, &rank);
    MPI_Comm_size(hdl->comm, &nproc);

    assert(proc >= 0 && proc < nproc);

    buf = malloc(nproc * sizeof(uint8_t));
    assert(buf != NULL);

    buf[rank] = 0;

    /* Get all data from the lock_buf, except the byte belonging to
     * me. Set the byte belonging to me to 0. */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);

    MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

    /* Get data to the left of rank */
    if (rank > 0) {
        MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
    }

    /* Get data to the right of rank */
    if (rank < nproc - 1) {
        MPI_Get(&buf[rank + 1], nproc - 1 - rank, MPI_BYTE, proc, rank + 1, nproc - 1 - rank,
                MPI_BYTE, hdl->windows[mutex]);
    }

    MPI_Win_unlock(proc, hdl->windows[mutex]);

    assert(buf[rank] == 0);

    /* Notify the next waiting process, starting to my right for fairness */
    for (i = 1; i < nproc; i++) {
        int p = (rank + i) % nproc;
        if (buf[p] == 1) {
            debug_print("notifying %d [proc = %d, mutex = %d]\n", p, proc, mutex);
            MPI_Send(NULL, 0, MPI_BYTE, p, MPI_MUTEX_TAG + mutex, hdl->comm);
            break;
        }
    }

    debug_print("lock released [proc = %d, mutex = %d]\n", proc, mutex);
    free(buf);

    return MPI_SUCCESS;
}
Example #26
0
void test_put(void)
{
    int me, nproc;
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Win dst_win;
    double *dst_buf;
    double src_buf[MAXELEMS];
    int i, j;

    MPI_Alloc_mem(sizeof(double) * nproc * MAXELEMS, MPI_INFO_NULL, &dst_buf);
    MPI_Win_create(dst_buf, sizeof(double) * nproc * MAXELEMS, 1, MPI_INFO_NULL, MPI_COMM_WORLD,
                   &dst_win);

    for (i = 0; i < MAXELEMS; i++)
        src_buf[i] = me + 1.0;

    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, dst_win);

    for (i = 0; i < nproc * MAXELEMS; i++)
        dst_buf[i] = 0.0;

    MPI_Win_unlock(me, dst_win);

    MPI_Barrier(MPI_COMM_WORLD);

    for (i = 0; i < nproc; i++) {
        int target = i;

        for (j = 0; j < COUNT; j++) {
            if (verbose)
                printf("%2d -> %2d [%2d]\n", me, target, j);
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, dst_win);
            MPI_Put(&src_buf[j], sizeof(double), MPI_BYTE, target,
                    (me * MAXELEMS + j) * sizeof(double), sizeof(double), MPI_BYTE, dst_win);
            MPI_Win_unlock(target, dst_win);
        }

        for (j = 0; j < COUNT; j++) {
            if (verbose)
                printf("%2d <- %2d [%2d]\n", me, target, j);
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, dst_win);
            MPI_Get(&src_buf[j], sizeof(double), MPI_BYTE, target,
                    (me * MAXELEMS + j) * sizeof(double), sizeof(double), MPI_BYTE, dst_win);
            MPI_Win_unlock(target, dst_win);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Win_free(&dst_win);
    MPI_Free_mem(dst_buf);
}
Example #27
0
/** Lock a mutex.
  * 
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int       rank, nproc, already_locked, i, proc;
  uint8_t *buf;

  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);

  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
  ARMCII_Assert(proc >= 0);

  buf = malloc(nproc*sizeof(uint8_t));
  ARMCII_Assert(buf != NULL);

  buf[rank] = 1;

  /* Get all data from the lock_buf, except the byte belonging to
   * me. Set the byte belonging to me to 1. */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);
  
  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

  /* Get data to the left of rank */
  if (rank > 0) {
    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
  }

  /* Get data to the right of rank */
  if (rank < nproc - 1) {
    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
  }
  
  MPI_Win_unlock(proc, hdl->windows[mutex]);

  ARMCII_Assert(buf[rank] == 1);

  for (i = already_locked = 0; i < nproc; i++)
    if (buf[i] && i != rank)
      already_locked = 1;

  /* Wait for notification */
  if (already_locked) {
    MPI_Status status;
    ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "waiting for notification [proc = %d, mutex = %d]\n", proc, mutex);
    MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm, &status);
  }

  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock acquired [proc = %d, mutex = %d]\n", proc, mutex);
  free(buf);
}
Example #28
0
/*
 * Class:     mpi_Win
 * Method:    put
 * Signature: (JLjava/lang/Object;IJIIIJI)V
 */
JNIEXPORT void JNICALL Java_mpi_Win_put(
        JNIEnv *env, jobject jthis, jlong win, jobject origin,
        jint orgCount, jlong orgType, jint targetRank, jint targetDisp,
        jint targetCount, jlong targetType, jint baseType)
{
    void *orgPtr = (*env)->GetDirectBufferAddress(env, origin);

    int rc = MPI_Put(orgPtr, orgCount, (MPI_Datatype)orgType,
                     targetRank, (MPI_Aint)targetDisp, targetCount,
                     (MPI_Datatype)targetType, (MPI_Win)win);

    ompi_java_exceptionCheck(env, rc);
}
Example #29
0
MTEST_THREAD_RETURN_TYPE run_test(void *arg)
{
    int i;

    for (i = 0; i < LOOPS; i++) {
        /* send a global variable, rather than a stack variable, so
         * other threads can access the address during flush */
        MPI_Put(&dummy, 1, MPI_INT, 0, 0, 1, MPI_INT, win);
        MPI_Win_flush(0, win);
    }

    return (MTEST_THREAD_RETURN_TYPE) NULL;
}
Example #30
0
/** Unlock a mutex.
  * 
  * @param[in] hdl   Mutex group that the mutex belongs to.
  * @param[in] mutex Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int      rank, nproc, i, proc;
  uint8_t *buf;

  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);

  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
  ARMCII_Assert(proc >= 0);

  buf = malloc(nproc*sizeof(uint8_t));

  buf[rank] = 0;

  /* Get all data from the lock_buf, except the byte belonging to
   * me. Set the byte belonging to me to 0. */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);
  
  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

  /* Get data to the left of rank */
  if (rank > 0) {
    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
  }

  /* Get data to the right of rank */
  if (rank < nproc - 1) {
    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
  }
  
  MPI_Win_unlock(proc, hdl->windows[mutex]);

  ARMCII_Assert(buf[rank] == 0);

  /* Notify the next waiting process, starting to my right for fairness */
  for (i = 1; i < nproc; i++) {
    int p = (rank + i) % nproc;
    if (buf[p] == 1) {
      ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "notifying %d [proc = %d, mutex = %d]\n", p, proc, mutex);
      MPI_Send(NULL, 0, MPI_BYTE, p, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm);
      break;
    }
  }

  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock released [proc = %d, mutex = %d]\n", proc, mutex);
  free(buf);
}