void _XMP_reduce_gpu_NODES_ENTIRE(_XMP_nodes_t *nodes, void *dev_addr, int count, int datatype, int op)
{
  if (count == 0) {
    return; // FIXME not good implementation
  }
  if (!nodes->is_member) {
    return;
  }

  // setup information
  MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL;
  size_t datatype_size = 0;
  MPI_Op mpi_op;
  _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype);
  _XMP_setup_reduce_op(&mpi_op, op);

  size_t size = datatype_size * count;
  void *host_buf = _XMP_alloc(size);
  cudaError_t e;

  // copy dev to host
  e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck(e);

  MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)nodes->comm));

  // copy host to dev
  e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice);
  cudaErrorCheck(e);

  _XMP_free(host_buf);
}
Exemple #2
0
void xmp_gather_(_XMP_array_t **x_d, _XMP_array_t **a_d, ... )
{
  int          i;
  va_list      valst;
  _XMP_array_t *idx_p;
  _XMP_array_t **idx_pp;
  _XMP_array_t **idx_array;
//  _XMP_array_t *x_p = *(_XMP_array_t **)x_d;
  _XMP_array_t *a_p = *(_XMP_array_t **)a_d;

  idx_array = (_XMP_array_t **)_XMP_alloc(sizeof(_XMP_array_t *)*a_p->dim);

  va_start( valst, a_d );

  for(i=0;i<a_p->dim;i++){
     idx_pp = va_arg( valst , _XMP_array_t** );
     idx_p  = *(_XMP_array_t **)idx_pp;
     idx_array[i] = idx_p;
  }

  va_end(valst);

   xmpf_gather(*x_d, *a_d, idx_array);

   _XMP_free(idx_array);
}
void _XMP_reduce_gpu_CLAUSE(void *dev_addr, int count, int datatype, int op) {
  // setup information
  MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL;
  size_t datatype_size = 0;
  MPI_Op mpi_op;
  _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype);
  _XMP_setup_reduce_op(&mpi_op, op);

  size_t size = datatype_size * count;
  void *host_buf = _XMP_alloc(size);
  cudaError_t e;

  // copy dev to host
  e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck(e);

  // reduce
  MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)(_XMP_get_execution_nodes())->comm));

  // copy host to dev
  e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice);
  cudaErrorCheck(e);

  _XMP_free(host_buf);
}
static void _mpi_scalar_mput(const int target_rank, 
			     const _XMP_coarray_t *dst_desc, const void *src,
			     const size_t dst_offset, const size_t src_offset,
			     const int dst_dims, 
			     const _XMP_array_section_t *dst_info,
			     const bool is_dst_on_acc)
{
  int allelmt_dim = _XMP_get_dim_of_allelmts(dst_dims, dst_info);
  size_t element_size = dst_desc->elmt_size;
  size_t allelmt_size = (allelmt_dim == dst_dims)? element_size : dst_info[allelmt_dim].distance * dst_info[allelmt_dim].elmts;
  char *laddr = (allelmt_dim == dst_dims)? ((char*)src + src_offset) : _XMP_alloc(allelmt_size);
  char *raddr = get_remote_addr(dst_desc, target_rank, is_dst_on_acc) + dst_offset;
  MPI_Win win = get_window(dst_desc, is_dst_on_acc);

  XACC_DEBUG("scalar_mput(src_p=%p, size=%zd, target=%d, dst_p=%p, is_acc=%d)", laddr, element_size, target_rank, raddr, is_dst_on_acc);

  XACC_DEBUG("allelmt_dim=%d, dst_dims=%d", allelmt_dim, dst_dims);
  if(allelmt_dim != dst_dims){
    //mcopy
    _XMP_array_section_t info;
    info.start = 0;
    info.length = allelmt_size/element_size;
    info.stride = 1;
    info.elmts = info.length;
    info.distance = element_size;
    _XMP_stride_memcpy_1dim(laddr, (char*)src+src_offset, &info, element_size, _XMP_SCALAR_MCOPY);
    XACC_DEBUG("mcopy(%lld, %lld, %lld), %lld",info.start, info.length, info.stride, info.elmts);
  }
  long long idxs[allelmt_dim+1];
  for(int i = 0; i < allelmt_dim+1; i++) idxs[i]=0;

  while(1){
    size_t offset = 0;
    for(int i = 0; i < allelmt_dim; i++){
      offset += dst_info[i].distance * idxs[i+1] * dst_info[i].stride;
    }

    MPI_Put((void*)laddr, allelmt_size, MPI_BYTE, target_rank,
	    (MPI_Aint)(raddr+offset), allelmt_size, MPI_BYTE,
	    win);

    ++idxs[allelmt_dim];
    for(int i = allelmt_dim-1; i >= 0; i--){
      long long length = dst_info[i].length;
      if(idxs[i+1] >= length){
	idxs[i+1] -= length;
	++idxs[i];
      }else{
	break;
      }
    }
    if(idxs[0] > 0){
      break;
    }
  }
  _wait_puts(target_rank, win);
  if(allelmt_dim != dst_dims){
    _XMP_free(laddr);
  }
}
void _XMP_create_TCA_handle(void *acc_addr, _XMP_array_t *adesc)
{
  if(adesc->set_handle)
    return;

  // 64KB align ?
  long tmp = ((long)acc_addr/65536)*65536;
  if(tmp != (long)acc_addr){
    _XMP_fatal("An array is not aligned at 64KB.");
    return;
  }

  size_t size = (size_t)(adesc->type_size * adesc->total_elmts);

#if 0
  printf("[%d] tcaCreateHandle size = %d addr=%p\n", _XMP_world_rank, size, acc_addr);
#endif
  tcaHandle tmp_handle;
  TCA_CHECK(tcaCreateHandle(&tmp_handle, acc_addr, size, tcaMemoryGPU));

  adesc->tca_handle = _XMP_alloc(sizeof(tcaHandle) * _XMP_world_size);
  MPI_Allgather(&tmp_handle, sizeof(tcaHandle), MPI_BYTE,
                adesc->tca_handle, sizeof(tcaHandle), MPI_BYTE, MPI_COMM_WORLD);

  adesc->set_handle = _XMP_N_INT_TRUE;
}
void xmpf_array_init_shadow__(_XMP_array_t **a_desc, int *i_dim,
			      int *lshadow, int *ushadow)
{
  _XMP_array_t *array = *a_desc;
  _XMP_array_info_t *ai = &(array->info[*i_dim]);

  if (*lshadow == 0 && *ushadow == 0){
    ai->shadow_type = _XMP_N_SHADOW_NONE;
  }
  else if (*lshadow > 0 || *ushadow > 0){

    _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK || ai->align_manner == _XMP_N_ALIGN_GBLOCK);

    ai->shadow_type = _XMP_N_SHADOW_NORMAL;
    ai->shadow_size_lo = *lshadow;
    ai->shadow_size_hi = *ushadow;
      
    if (array->is_allocated){
      ai->local_lower += *lshadow;
      ai->local_upper += *lshadow;
      // ai->local_stride is not changed
      ai->alloc_size += *lshadow + *ushadow;
      // ai->temp0 shuld not be used in XMP/F.
      // *(ai->temp0) -= *lshadow;
      ai->temp0_v -= *lshadow;
    }

    if (!ai->reflect_sched){
      _XMP_reflect_sched_t *sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
      sched->is_periodic = -1; /* not used yet */
      sched->datatype_lo = MPI_DATATYPE_NULL;
      sched->datatype_hi = MPI_DATATYPE_NULL;
      for (int j = 0; j < 4; j++) sched->req[j] = MPI_REQUEST_NULL;
      sched->lo_send_buf = NULL;
      sched->lo_recv_buf = NULL;
      sched->hi_send_buf = NULL;
      sched->hi_recv_buf = NULL;
      ai->reflect_sched = sched;
    }

    //_XMP_create_shadow_comm(array, *i_dim);

  }
  else { // *lshadow < 0 && *ushadow < 0
    ai->shadow_type = _XMP_N_SHADOW_FULL;

    if (array->is_allocated){
      ai->shadow_size_lo = ai->par_lower - ai->ser_lower;
      ai->shadow_size_hi = ai->ser_upper - ai->par_upper;
	
      ai->local_lower = ai->par_lower - ai->ser_lower;
      ai->local_upper = ai->par_upper - ai->ser_lower;
      ai->local_stride = ai->par_stride;
      ai->alloc_size = ai->ser_size;
    }
    
    _XMP_create_shadow_comm(array, *i_dim);
  }
}
void _XMP_tca_comm_init()
{
  _ring_bufs = _XMP_alloc(sizeof(tca_ring_buf_t) * _XMP_world_size);
  for(int i = 0; i < _XMP_world_size; i++){
    if(i == _XMP_world_rank) continue;
    _XMP_tca_ring_buf_init(&_ring_bufs[i], i);
  }
}
void _XMP_alloc_tca(_XMP_array_t *adesc)
{
  adesc->set_handle = _XMP_N_INT_FALSE;
  int array_dim = adesc->dim;
  for(int i=0;i<array_dim;i++){
    _XMP_array_info_t *ai = &(adesc->info[i]);
    if(ai->shadow_type == _XMP_N_SHADOW_NONE)
      continue;
    ai->reflect_acc_sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
  }

  adesc->wait_slot = 0;  // No change ?
  adesc->wait_tag  = 0x100;  // No change ?
}
void _XMP_mpi_coarray_attach(_XMP_coarray_t *coarray_desc, void *addr, const size_t coarray_size, const bool is_acc)
{
  MPI_Win win = MPI_WIN_NULL;
  char **each_addr = NULL;  // head address of a local array on each node

  _XMP_nodes_t *nodes = _XMP_get_execution_nodes();
  int comm_size = nodes->comm_size;
  MPI_Comm comm = *(MPI_Comm *)nodes->comm;

  XACC_DEBUG("attach addr=%p, size=%zd, is_acc=%d", addr, coarray_size, is_acc);

  if(_XMP_flag_multi_win){
    _XMP_mpi_onesided_create_win(&win, addr, coarray_size, comm);
    MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
  }else{
    win = _xmp_mpi_distarray_win;
#ifdef _XMP_XACC
    if(is_acc){
      win = _xmp_mpi_distarray_win_acc;
    }
#endif
    MPI_Win_attach(win, addr, coarray_size);

    each_addr = (char**)_XMP_alloc(sizeof(char *) * comm_size);

    MPI_Allgather(&addr, sizeof(char *), MPI_BYTE,
		  each_addr, sizeof(char *), MPI_BYTE,
		  comm); // exchange displacement
  }

  if(is_acc){
#ifdef _XMP_XACC
    coarray_desc->addr_dev = each_addr;
    coarray_desc->real_addr_dev = addr;
    coarray_desc->win_acc = win;
    coarray_desc->nodes = nodes;
#endif
  }else{
    coarray_desc->addr = each_addr;
    coarray_desc->real_addr = addr;
    coarray_desc->win = win;
    coarray_desc->win_acc = MPI_WIN_NULL;
    coarray_desc->nodes = nodes;
  }
}
/**
 * Build table and Initialize for sync images
 */
void _XMP_mpi_build_sync_images_table()
{
  size_t table_size = sizeof(unsigned int) * _XMP_world_size;
#ifdef _SYNCIMAGE_SENDRECV
  _sync_images_table = _XMP_alloc(table_size);
#else
  struct _shift_queue_t *shift_queue = &_shift_queue;
  _sync_images_table = (unsigned int*)(_xmp_mpi_onesided_buf + shift_queue->total_shift);
  _sync_images_table_disp = (unsigned int*)(shift_queue->total_shift);

  size_t shift;
  if(table_size % _XMP_MPI_ALIGNMENT == 0)
    shift = table_size;
  else{
    shift = ((table_size / _XMP_MPI_ALIGNMENT) + 1) * _XMP_MPI_ALIGNMENT;
  }
  _push_shift_queue(shift_queue, shift);
#endif

  for(int i=0;i<_XMP_world_size;i++)
    _sync_images_table[i] = 0;

  MPI_Barrier(MPI_COMM_WORLD);
}
Exemple #11
0
void xmp_transpose_(_XMP_array_t **dst_d, _XMP_array_t **src_d, int *opt){

#if 1
   xmpf_transpose(*dst_d, *src_d, *opt);
   return;
#else
  _XMP_array_t *dst_array = *(_XMP_array_t **)dst_d;
  _XMP_array_t *src_array = *(_XMP_array_t **)src_d;

  int nnodes;

  int dst_block_dim, src_block_dim;

  void *sendbuf=NULL, *recvbuf=NULL;
  unsigned long long count, bufsize;

  int dst_chunk_size, dst_ser_size, type_size;
  int src_chunk_size, src_ser_size;

  nnodes = dst_array->align_template->onto_nodes->comm_size;

  // 2-dimensional Matrix
  if (dst_array->dim != 2) {
    _XMP_fatal("bad dimension for xmp_transpose");
  }

  // No Shadow
  if (dst_array->info[0].shadow_size_lo != 0 ||
      dst_array->info[0].shadow_size_hi != 0 ||
      src_array->info[0].shadow_size_lo != 0 ||
      src_array->info[0].shadow_size_hi != 0) {
   _XMP_fatal("A global array must not have shadows");
  fflush(stdout);
  }

  // Dividable by the number of nodes
  if (dst_array->info[0].ser_size % nnodes != 0) {
   _XMP_fatal("Not dividable by the number of nodes");
  fflush(stdout);
  }

  dst_block_dim = (dst_array->info[0].align_manner == _XMP_N_ALIGN_BLOCK) ? 0 : 1;
  src_block_dim = (src_array->info[0].align_manner == _XMP_N_ALIGN_BLOCK) ? 0 : 1;

  dst_chunk_size = dst_array->info[dst_block_dim].par_size;
  dst_ser_size = dst_array->info[dst_block_dim].ser_size;
  src_chunk_size = src_array->info[src_block_dim].par_size;
  src_ser_size = src_array->info[src_block_dim].ser_size;
  type_size = dst_array->type_size;

  count =  dst_chunk_size * src_chunk_size;
  bufsize = count * nnodes * type_size;

  _XMP_check_reflect_type();

  if (src_block_dim == 1){
    if (*opt ==0){
      sendbuf = _XMP_alloc(bufsize);
    }else if (*opt==1){
      sendbuf = dst_array->array_addr_p;
    }
    // src_array -> sendbuf
    _XMP_pack_vector2((char *)sendbuf, (char *)src_array->array_addr_p ,
		      src_chunk_size, dst_chunk_size, nnodes, type_size,
		      src_block_dim);
  }
  else {
    sendbuf = src_array->array_addr_p;
  }

  if (*opt == 0){
    recvbuf = _XMP_alloc(bufsize);
  }else if (*opt ==1){
    recvbuf = src_array->array_addr_p;
  }
  MPI_Alltoall(sendbuf, count * type_size, MPI_BYTE, recvbuf, count * type_size,
               MPI_BYTE, *((MPI_Comm *)src_array->align_template->onto_nodes->comm));

  if (dst_block_dim == 1){
    _XMPF_unpack_transpose_vector((char *)dst_array->array_addr_p ,
       (char *)recvbuf , src_ser_size, dst_chunk_size, type_size, dst_block_dim);

    if (*opt==0){
      _XMP_free(recvbuf);
    }
  }

  if (src_block_dim == 1){
    if (*opt == 0){
      _XMP_free(sendbuf);
    }
  }


  return;
#endif
}
static void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
					 int lwidth, int uwidth, int is_periodic, void *dev_array_addr, int *lwidths, int *uwidths){
  //printf("desc=%p, tardim=%d, lw=%d, uw=%d, devp=%p\n", adesc, target_dim, lwidth, uwidth, dev_array_addr);
  
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;
  //void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL;
  void *lo_recv_array = NULL;
  void *hi_send_array = NULL;
  void *hi_recv_array = NULL;

  void *lo_send_dev_buf = NULL;
  void *lo_recv_dev_buf = NULL;
  void *hi_send_dev_buf = NULL;
  void *hi_recv_dev_buf = NULL;
  void *lo_send_host_buf = NULL;
  void *lo_recv_host_buf = NULL;
  void *hi_send_host_buf = NULL;
  void *hi_recv_host_buf = NULL;

  void *mpi_lo_send_buf = NULL;
  void *mpi_lo_recv_buf = NULL;
  void *mpi_hi_send_buf = NULL;
  void *mpi_hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  //
  // setup data_type
  //

  int count = 0, blocklength = 0;
  long long stride = 0;
  //  int count_offset = 0;

  if (_XMPF_running && !_XMPC_running){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (!_XMPF_running && _XMPC_running){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    
    /* if(target_dim > 0){ */
    /*   count *= ainfo[0].par_size; */
    /*   count_offset = ainfo[0].shadow_size_lo; */
    /* } */
    /* for (int i = 1; i < target_dim; i++){ */
    /*   count *= ainfo[i].alloc_size; */
    /* } */

    /* for (int i = ndims - 2; i >= target_dim; i--){ */
    /*   blocklength *= ainfo[i+1].alloc_size; */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    if(target_dim == 0){
      count *= 1;
      if(ndims >= 2){
	blocklength *= (ainfo[1].par_size + lwidths[1] + uwidths[1]);
      }
    }else{
      count *= (ainfo[0].par_size + lwidths[0] + uwidths[0]);
      for(int i = 1; i < target_dim; i++){
	count *= ainfo[i].alloc_size;
      }
      blocklength *= ainfo[target_dim+1].alloc_size;
      stride *= ainfo[target_dim].alloc_size;
    }
    for(int i = target_dim+2; i < ndims; i++){
      blocklength *= ainfo[i].alloc_size;
    }
    for(int i = target_dim+1 ; i < ndims - 1; i++){
      stride *= ainfo[i].alloc_size;
    }

    /* mod_4 */
    count = 1;
    blocklength = 1;
    stride = 1;

    for(int i = 0; i < ndims; i++){
      int fact = (i == target_dim)? 1 : (ainfo[i].par_size + lwidths[i] + uwidths[i]);
      int alloc_size = ainfo[i].alloc_size;

      if(blocklength == 1 || fact == alloc_size){
	blocklength *= fact;
	stride *= alloc_size;
      }else if(count == 1 && target_dim != 0){ //to be contiguous if target_dim==0
	count = blocklength;
	blocklength = fact;
	stride = alloc_size;
      }else{
	blocklength *= alloc_size;
	stride *= alloc_size;
      }
      //printf("tar=%d, i=%d, fact=%d, allocsize=%d, (%d,%d,%lld)\n", target_dim, i, fact, alloc_size, count , blocklength, stride);
    }

    blocklength *= type_size;
    stride *= type_size;
    /* mod_4 end */
    
    /* it used at 150717
    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
    */

    /* for (int i = target_dim + 1; i < ndims; i++){ */
    /*   blocklength *= ainfo[i].alloc_size; */
    /* } */
    /* for (int i = target_dim; i < ndims - 1; i++){ */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    //    printf("count =%d, blength=%d, stride=%lld\n", count ,blocklength, stride);
    //    printf("ainfo[0].par_size=%d\n", ainfo[0].par_size);
    //    printf("count_ofset=%d,\n", count_offset);
  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }

  //
  // calculate base address
  //

  // for lower reflect

  if (lwidth){
    lo_send_array = lo_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	//printf("ainfo[%d].local_upper=%d\n",i,ainfo[i].local_upper);
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth; ////ainfo[i].local_lower - lwidth;
      } else {
	// Note: including shadow area
	lb_send = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
	lb_recv = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect

  if (uwidth){
    hi_send_array = hi_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      } else {
	// Note: including shadow area
	lb_send = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
	lb_recv = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for lower reflect
  if (reflect->datatype_lo != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_lo);
  }
  if(packVector || count == 1){
    MPI_Type_contiguous(blocklength * lwidth * count, MPI_BYTE, &reflect->datatype_lo);
    //    MPI_Type_contiguous(blocklength * lwidth * count / type_size, MPI_FLOAT, &reflect->datatype_lo);
    fprintf(stderr, "dim=%d, send elements lo = %d\n", target_dim, blocklength * lwidth * count / type_size);
    //fprintf(stderr, "useHostBuf=%c , packVector=%c\n", useHostBuffer, packVector);
    //    if(useHostBuffer){ fprintf(stderr,"using host buffer\n"); }
    //    if(packVector){ fprintf(stderr, "using pack vector\n"); }
  }else{
    MPI_Type_vector(count, blocklength * lwidth, stride, MPI_BYTE, &reflect->datatype_lo);
  }
  MPI_Type_commit(&reflect->datatype_lo);

  // for upper reflect
  if (reflect->datatype_hi != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_hi);
  }
  if(packVector || count == 1){
    MPI_Type_contiguous(blocklength * uwidth * count, MPI_BYTE, &reflect->datatype_hi);
    //    MPI_Type_contiguous(blocklength * uwidth * count / type_size, MPI_FLOAT, &reflect->datatype_hi);
    fprintf(stderr, "dim=%d, send elements hi = %d\n", target_dim, blocklength * uwidth * count / type_size);    
  }else{
    MPI_Type_vector(count, blocklength * uwidth, stride, MPI_BYTE, &reflect->datatype_hi);
  }
  MPI_Type_commit(&reflect->datatype_hi);


  //
  // Allocate buffers
  //
  if(useHostBuffer){
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_send_host_buf));
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_recv_host_buf));
  }    

  if ((_XMPF_running && target_dim != ndims - 1) ||
      (_XMPC_running && target_dim != 0)){
    if(packVector){
      CUDA_SAFE_CALL(cudaFree(reflect->lo_send_buf));
      CUDA_SAFE_CALL(cudaFree(reflect->lo_recv_buf));
    }
  }
  if ((_XMPF_running && target_dim == ndims - 1) ||
      (_XMPC_running && target_dim == 0)){
    //
  }
  

  // for lower reflect

  if (lwidth){

    lo_buf_size = lwidth * blocklength * count;
    hi_buf_size = uwidth * blocklength * count;

    if ((_XMPF_running && target_dim == ndims - 1) ||
	(_XMPC_running && target_dim == 0)){
      lo_send_dev_buf = lo_send_array;
      lo_recv_dev_buf = lo_recv_array;
      hi_send_dev_buf = hi_send_array;
      hi_recv_dev_buf = hi_recv_array;
    } else {
      _XMP_TSTART(t0);
      if(packVector){
	CUDA_SAFE_CALL(cudaMalloc((void **)&lo_send_dev_buf, lo_buf_size + hi_buf_size));
	hi_send_dev_buf = (char*)lo_send_dev_buf + lo_buf_size;
	CUDA_SAFE_CALL(cudaMalloc((void **)&lo_recv_dev_buf, lo_buf_size + hi_buf_size));
	hi_recv_dev_buf = (char*)lo_recv_dev_buf + lo_buf_size;	
      }else{
	lo_send_dev_buf = lo_send_array;
	lo_recv_dev_buf = lo_recv_array;
	hi_send_dev_buf = hi_send_array;
	hi_recv_dev_buf = hi_recv_array;
      }
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

    if(useHostBuffer){
      CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_send_host_buf, lo_buf_size + hi_buf_size));
      hi_send_host_buf = (char*)lo_send_host_buf + lo_buf_size;
      CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_recv_host_buf, lo_buf_size + hi_buf_size));
      hi_recv_host_buf = (char*)lo_recv_host_buf + lo_buf_size;
      mpi_lo_send_buf = lo_send_host_buf;
      mpi_lo_recv_buf = lo_recv_host_buf;
      mpi_hi_send_buf = hi_send_host_buf;
      mpi_hi_recv_buf = hi_recv_host_buf;
    }else{
      mpi_lo_send_buf = lo_send_dev_buf;
      mpi_lo_recv_buf = lo_recv_dev_buf;
      mpi_hi_send_buf = hi_send_dev_buf;
      mpi_hi_recv_buf = hi_recv_dev_buf;
    }
  }

  // for upper reflect


  //
  // initialize communication
  //

  int src, dst;

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  // for lower shadow

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  } else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }
  //  fprintf(stderr, "dim=%d,  lo_src=%d, lo_dst=%d\n", target_dim, src, dst);

  if (reflect->req[0] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[0]);
  }
	
  if (reflect->req[1] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[1]);
  }

  MPI_Recv_init(mpi_lo_recv_buf, 1, reflect->datatype_lo, src,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
  MPI_Send_init(mpi_lo_send_buf, 1, reflect->datatype_lo, dst,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);

  // for upper shadow

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  } else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }
  //  fprintf(stderr, "dim=%d,  hi_src=%d, hi_dst=%d\n", target_dim, src, dst);

  if (reflect->req[2] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[2]);
  }
	
  if (reflect->req[3] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[3]);
  }

  MPI_Recv_init(mpi_hi_recv_buf, 1, reflect->datatype_hi, src,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
  MPI_Send_init(mpi_hi_send_buf, 1, reflect->datatype_hi, dst,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  if(packVector){
    reflect->lo_send_buf = lo_send_dev_buf;
    reflect->lo_recv_buf = lo_recv_dev_buf;
    reflect->hi_send_buf = hi_send_dev_buf;
    reflect->hi_recv_buf = hi_recv_dev_buf;
  }

  if(useHostBuffer){
    reflect->lo_send_host_buf = lo_send_host_buf;
    reflect->lo_recv_host_buf = lo_recv_host_buf;
    reflect->hi_send_host_buf = hi_send_host_buf;
    reflect->hi_recv_host_buf = hi_recv_host_buf;
  }

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

  // gpu async
  reflect->lo_async_id = _XMP_alloc(sizeof(cudaStream_t));
  CUDA_SAFE_CALL(cudaStreamCreate(reflect->lo_async_id));

  if(target_dim != 0 &&
     (!useHostBuffer || (lo_rank != MPI_PROC_NULL && hi_rank != MPI_PROC_NULL && (lo_buf_size / type_size) <= useSingleStreamLimit)) ){
    reflect->hi_async_id = NULL;
  }else{
    cudaStream_t *hi_stream = (cudaStream_t*)_XMP_alloc(sizeof(cudaStream_t));
    CUDA_SAFE_CALL(cudaStreamCreate(hi_stream));
    reflect->hi_async_id = (void*)hi_stream;
  }

  reflect->event = _XMP_alloc(sizeof(cudaEvent_t));
  CUDA_SAFE_CALL(cudaEventCreateWithFlags(reflect->event, cudaEventDisableTiming));
}
void _XMP_mpi_coarray_malloc(_XMP_coarray_t *coarray_desc, void **addr, const size_t coarray_size, bool is_acc)
{
  char **each_addr = NULL;  // gap_size on each node
  struct _shift_queue_t *shift_queue = is_acc? &_shift_queue_acc : &_shift_queue;
  size_t shift;
  char *real_addr = NULL;
  MPI_Win win = MPI_WIN_NULL;
  _XMP_nodes_t *nodes = _XMP_get_execution_nodes();
  MPI_Comm comm = *(MPI_Comm *)nodes->comm;

  if(coarray_size == 0){
    _XMP_fatal("_XMP_mpi_coarray_malloc: zero size is not allowed");
  }

  if(_XMP_flag_multi_win){
    _XMP_mpi_onesided_alloc_win(&win, (void**)&real_addr, coarray_size, comm, is_acc);
    MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

    XACC_DEBUG("addr=%p, size=%zd, is_acc=%d", real_addr, coarray_size, is_acc);
  }else{
  each_addr = (char**)_XMP_alloc(sizeof(char *) * _XMP_world_size);
  for(int i=0;i<_XMP_world_size;i++){
    each_addr[i] = (char *)(shift_queue->total_shift);
  }
  real_addr = _xmp_mpi_onesided_buf;
#ifdef _XMP_XACC
  if(is_acc){
    real_addr = _xmp_mpi_onesided_buf_acc;
  }
#endif
  real_addr += shift_queue->total_shift;

  XACC_DEBUG("malloc_do: addr=%p, shift=%zd, is_acc=%d", real_addr, shift_queue->total_shift, is_acc);
  
  if(coarray_size % _XMP_MPI_ALIGNMENT == 0)
    shift = coarray_size;
  else{
    shift = ((coarray_size / _XMP_MPI_ALIGNMENT) + 1) * _XMP_MPI_ALIGNMENT;
  }
  
  _push_shift_queue(shift_queue, shift);

  size_t total_shift = shift_queue->total_shift;

  if(total_shift > _xmp_mpi_onesided_heap_size){
    fprintf(stderr, "_xmp_mpi_onesided_heap_size=%zd\n", _xmp_mpi_onesided_heap_size);
    if(_XMP_world_rank == 0){
      fprintf(stderr, "[ERROR] Cannot allocate coarray. Heap memory size of coarray is too small.\n");
      fprintf(stderr, "        Please set the environmental variable \"XMP_ONESIDED_HEAP_SIZE\".\n");
      fprintf(stderr, "        e.g.) export XMP_ONESIDED_HEAP_SIZE=%zuM (or more).\n",
	      (total_shift/1024/1024)+1);
    }
    _XMP_fatal_nomsg();
  }
  }

  if(is_acc){
#ifdef _XMP_XACC
    coarray_desc->addr_dev = each_addr;
    coarray_desc->real_addr_dev = real_addr;
    coarray_desc->win_acc = win;
    coarray_desc->nodes = nodes;
#endif
  }else{
    coarray_desc->addr = each_addr;
    coarray_desc->real_addr = real_addr;
    coarray_desc->win = win;
    coarray_desc->win_acc = MPI_WIN_NULL;
    coarray_desc->nodes = nodes;
  }
  *addr = real_addr;
}
void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
				  int lwidth, int uwidth, int is_periodic, int shadow_comm_type){

  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  if (ni->size == 1 && !is_periodic) return;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int count = 0, blocklength = 0;
  long long stride = 0;

  int type_size = adesc->type_size;
  void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL, *lo_recv_array = NULL;
  void *hi_send_array = NULL, *hi_recv_array = NULL;

  void *lo_send_buf = NULL;
  void *lo_recv_buf = NULL;
  void *hi_send_buf = NULL;
  void *hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  if (reflect->prev_pcopy_sched_type &&
      lwidth == reflect->lo_width &&
      uwidth == reflect->hi_width &&
      is_periodic == reflect->is_periodic){
    if ((adesc->order == MPI_ORDER_FORTRAN && target_dim != ndims - 1) ||
	(adesc->order == MPI_ORDER_C && target_dim != 0)){
      goto init_comm;
    }
    else if (reflect->prev_pcopy_sched_type != shadow_comm_type){
      count = reflect->count;
      blocklength = reflect->blocklength;
      stride = reflect->stride;
      goto alloc_buf;
    }
  }
  
  //
  // setup data_type
  //

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }

  //
  // calculate base address
  //

 alloc_buf:
  
  // for lower reflect

  if (lwidth){

    lo_send_array = array_addr;
    lo_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth;;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);

    }

  }

  // for upper reflect

  if (uwidth){

    hi_send_array = array_addr;
    hi_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);

    }

  }

  //
  // Allocate buffers
  //

  if (reflect->prev_pcopy_sched_type == _XMP_COMM_REFLECT &&
      ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
       (adesc->order == MPI_ORDER_C && target_dim == 0))){
    ;
  }
  else {
    _XMP_free(reflect->lo_send_buf);
    _XMP_free(reflect->lo_recv_buf);
    _XMP_free(reflect->hi_send_buf);
    _XMP_free(reflect->hi_recv_buf);
  }

  // for lower reflect

  if (lwidth){

    lo_buf_size = lwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
	((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
	 (adesc->order == MPI_ORDER_C && target_dim == 0))){
      lo_send_buf = lo_send_array;
      lo_recv_buf = lo_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      lo_send_buf = _XMP_alloc(lo_buf_size);
      lo_recv_buf = _XMP_alloc(lo_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

  }

  // for upper reflect

  if (uwidth){

    hi_buf_size = uwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
	((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
	 (adesc->order == MPI_ORDER_C && target_dim == 0))){
      hi_send_buf = hi_send_array;
      hi_recv_buf = hi_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      hi_send_buf = _XMP_alloc(hi_buf_size);
      hi_recv_buf = _XMP_alloc(hi_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

  }

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  reflect->lo_send_buf = lo_send_buf;
  reflect->lo_recv_buf = lo_recv_buf;
  reflect->hi_send_buf = hi_send_buf;
  reflect->hi_recv_buf = hi_recv_buf;

  //
  // initialize communication
  //

  int src, dst;

 init_comm:
  
  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  lo_buf_size = lwidth * reflect->blocklength * reflect->count;
  hi_buf_size = uwidth * reflect->blocklength * reflect->count;

  // for lower shadow

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[0]);
    }
	
    if (reflect->req_reduce[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[1]);
    }

    MPI_Send_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[0]);
    MPI_Recv_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[1]);
  }
  else {
    if (reflect->req[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[0]);
    }
	
    if (reflect->req[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[1]);
    }

    MPI_Recv_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
    MPI_Send_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);
  }
  
  // for upper shadow

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[2]);
    }
	
    if (reflect->req_reduce[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[3]);
    }

    MPI_Send_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[2]);
    MPI_Recv_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[3]);
  }
  else {
    if (reflect->req[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[2]);
    }
	
    if (reflect->req[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[3]);
    }

    MPI_Recv_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
    MPI_Send_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);
  }
  
  reflect->prev_pcopy_sched_type = shadow_comm_type;
  
  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

}
void _XMP_reflect_async_ordinal(_XMP_array_t *a, int async_id){

  int n = a->dim;
  _XMP_async_reflect_t *async_reflect;

  _Bool reusable_sched = false;

  if (!a->async_reflect){
    int max_nreqs = (pow(3, n) - 1) * 2;
    async_reflect = (_XMP_async_reflect_t *)_XMP_alloc(sizeof(_XMP_async_reflect_t));
    async_reflect->datatype = (MPI_Datatype *)_XMP_alloc(sizeof(MPI_Datatype) * max_nreqs);
    async_reflect->reqs = (MPI_Request *)_XMP_alloc(sizeof(MPI_Request) * max_nreqs);
    for (int i = 0; i < max_nreqs; i++){
      async_reflect->datatype[i] = MPI_DATATYPE_NULL;
      async_reflect->reqs[i] = MPI_REQUEST_NULL;
    }
    async_reflect->nreqs = 0;
    a->async_reflect = async_reflect;
  }
  else {
    reusable_sched = true;
    async_reflect = a->async_reflect;
    for (int i = 0; i < n; i++){
      if (async_reflect->lwidth[i] != _xmp_lwidth[i] ||
	  async_reflect->uwidth[i] != _xmp_uwidth[i] ||
	  async_reflect->is_periodic[i] != _xmp_is_periodic[i]){
	reusable_sched = false;
	break;
      }
    }
  }

  if (!reusable_sched){

    int lb[_XMP_N_MAX_DIM] = { 0 };
    int ub[_XMP_N_MAX_DIM] = { 0 };

    for (int i = 0; i < n; i++){
      async_reflect->lwidth[i] = _xmp_lwidth[i];
      async_reflect->uwidth[i] = _xmp_uwidth[i];
      async_reflect->is_periodic[i] = _xmp_is_periodic[i];

      if (_xmp_lwidth[i] > 0) lb[i] = -1;
      if (_xmp_uwidth[i] > 0) ub[i] = 1;
    }

    for (int i = 0; i < async_reflect->nreqs; i++){
      if (async_reflect->datatype[i] != MPI_DATATYPE_NULL)
	MPI_Type_free(&async_reflect->datatype[i]);
      if (async_reflect->reqs[i] != MPI_REQUEST_NULL)
	MPI_Request_free(&async_reflect->reqs[i]);
    }
    async_reflect->nreqs = 0;

    int ishadow[_XMP_N_MAX_DIM];
    for (ishadow[0] = lb[0]; ishadow[0] <= ub[0]; ishadow[0]++){
    for (ishadow[1] = lb[1]; ishadow[1] <= ub[1]; ishadow[1]++){
    for (ishadow[2] = lb[2]; ishadow[2] <= ub[2]; ishadow[2]++){
    for (ishadow[3] = lb[3]; ishadow[3] <= ub[3]; ishadow[3]++){
    for (ishadow[4] = lb[4]; ishadow[4] <= ub[4]; ishadow[4]++){
    for (ishadow[5] = lb[5]; ishadow[5] <= ub[5]; ishadow[5]++){
    for (ishadow[6] = lb[6]; ishadow[6] <= ub[6]; ishadow[6]++){

      // When ishadow > 0, upper shadow is to be updated, and vice versa.

      int nnzero = 0;
      for (int i = 0; i < n; i++){
	if (ishadow[i] != 0) nnzero++;
      }
      if (nnzero == 0) continue;

      _XMP_reflect_sched_dir(a, ishadow, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);

    }}}}}}}

  }

  _XMP_async_comm_t *async = _XMP_get_current_async();
  MPI_Request *reqs = &async->reqs[async->nreqs];

  // copy to async
  if (async->nreqs + async_reflect->nreqs > _XMP_MAX_ASYNC_REQS){
    _XMP_fatal("too many arrays in an asynchronous reflect");
  }
  memcpy(reqs, async_reflect->reqs, async_reflect->nreqs * sizeof(MPI_Request));

  async->nreqs += async_reflect->nreqs;

  _XMP_TSTART(t0);
  MPI_Startall(async_reflect->nreqs, reqs);
  _XMP_TEND(xmptiming_.t_start, t0);

}
Exemple #16
0
void xmpf_array_alloc__(_XMP_array_t **a_desc, int *n_dim, int *type,
			_XMP_template_t **t_desc)
{
  _XMP_array_t *a = _XMP_alloc(sizeof(_XMP_array_t) + sizeof(_XMP_array_info_t) * (*n_dim - 1));

  // moved to xmpf_align_info
  //a->is_allocated = (*t_desc)->is_owner;

  a->is_align_comm_member = false;
  a->dim = *n_dim;
  a->type = *type;
  a->type_size = _XMP_get_datatype_size(a->type);
  size_t dummy;
  _XMP_setup_reduce_type(&a->mpi_type, &dummy, *type);
  a->order = MPI_ORDER_FORTRAN;
  a->total_elmts = 0;

  a->async_reflect = NULL;

  a->align_comm = NULL;
  a->align_comm_size = 1;
  a->align_comm_rank = _XMP_N_INVALID_RANK;

  a->array_nodes = NULL;

#ifdef _XMP_MPI3_ONESIDED
  a->coarray = NULL;
#endif

  //a->num_reqs = -1;
  //a->mpi_req_shadow = _XMP_alloc(sizeof(MPI_Request) * 4 * (*n_dim));

  a->align_template = *t_desc;

  *a_desc = a;

  for (int i = 0; i < *n_dim; i++) {
    _XMP_array_info_t *ai = &(a->info[i]);

    ai->is_shadow_comm_member = false;

    ai->ser_lower = 0;
    ai->ser_upper = 0;
    ai->ser_size = 0;
    ai->par_lower = 0;
    ai->par_upper = 0;
    ai->par_stride = 0;
    ai->par_size = 0;
    ai->local_lower = 0;
    ai->local_upper = 0;
    ai->local_stride = 0;
    ai->alloc_size = 0;

    ai->shadow_type = _XMP_N_SHADOW_NONE;
    ai->shadow_size_lo  = 0;
    ai->shadow_size_hi  = 0;

    ai->reflect_sched = NULL;

    ai->shadow_comm = NULL;
    ai->shadow_comm_size = 1;
    ai->shadow_comm_rank = _XMP_N_INVALID_RANK;
  }

  //xmpf_dbg_printf("xmpf_array_alloc ends\n");
}
static void _XMP_reflect_sched(_XMP_array_t *a, int *lwidth, int *uwidth,
			       int *is_periodic, int is_async, void *dev_addr)
{
  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

      if(reflect == NULL){
	reflect = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
	reflect->is_periodic = -1; /* not used yet */
	reflect->datatype_lo = MPI_DATATYPE_NULL;
	reflect->datatype_hi = MPI_DATATYPE_NULL;
	for (int j = 0; j < 4; j++) reflect->req[j] = MPI_REQUEST_NULL;
	reflect->lo_send_buf = NULL;
	reflect->lo_recv_buf = NULL;
	reflect->hi_send_buf = NULL;
	reflect->hi_recv_buf = NULL;
	reflect->lo_send_host_buf = NULL;
	reflect->lo_recv_host_buf = NULL;
	reflect->hi_send_host_buf = NULL;
	reflect->hi_recv_host_buf = NULL;
	ai->reflect_acc_sched = reflect;
      }else{
	//
      }

      if (1/*lwidth[i] || uwidth[i]*/){

	_XMP_ASSERT(reflect);

	if (reflect->is_periodic == -1 /* not set yet */ ||
	    lwidth[i] != reflect->lo_width ||
	    uwidth[i] != reflect->hi_width ||
	    is_periodic[i] != reflect->is_periodic){

	  reflect->lo_width = lwidth[i];
	  reflect->hi_width = uwidth[i];
	  reflect->is_periodic = is_periodic[i];

	  if (/*_xmp_reflect_pack_flag && !is_async*/ 1){
	    _XMP_reflect_pcopy_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i], dev_addr, lwidth, uwidth);
	  }
	  else {
	    //_XMP_reflect_normal_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i]);
	  }
	}
      }

    }
    else { /* _XMP_N_SHADOW_FULL */
      ;
    }
    
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

}