Ejemplo n.º 1
0
void _XMP_reflect_async_cardinal(_XMP_array_t *a, int async_id)
{

  _XMP_async_comm_t *async = _XMP_get_current_async();
  MPI_Request *reqs = &async->reqs[async->nreqs];
  int nreqs = 0;

  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

	_XMP_ASSERT(reflect);

	if (reflect->is_periodic == -1 /* not set yet */ ||
	    _xmp_lwidth[i] != reflect->lo_width ||
	    _xmp_uwidth[i] != reflect->hi_width ||
	    _xmp_is_periodic[i] != reflect->is_periodic){

	  reflect->lo_width = _xmp_lwidth[i];
	  reflect->hi_width = _xmp_uwidth[i];
	  reflect->is_periodic = _xmp_is_periodic[i];

	  _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]);

	}

	if (async->nreqs + nreqs + 4 > _XMP_MAX_ASYNC_REQS){
	  _XMP_fatal("too many arrays in an asynchronous reflect");
	}
	memcpy(&reqs[nreqs], reflect->req, 4 * sizeof(MPI_Request));
	nreqs += 4;

	_XMP_TSTART(t0);
	if (reflect->req[0] != MPI_REQUEST_NULL) // if req[0] isn't null, any others shouldn't be null.
	  MPI_Startall(4, reflect->req);
	_XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[i], t0);

      }

    }
    else { /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, i);
    }
    
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

  async->nreqs += nreqs;

}
Ejemplo n.º 2
0
void xmpf_array_init_shadow__(_XMP_array_t **a_desc, int *i_dim,
			      int *lshadow, int *ushadow)
{
  _XMP_array_t *array = *a_desc;
  _XMP_array_info_t *ai = &(array->info[*i_dim]);

  if (*lshadow == 0 && *ushadow == 0){
    ai->shadow_type = _XMP_N_SHADOW_NONE;
  }
  else if (*lshadow > 0 || *ushadow > 0){

    _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK || ai->align_manner == _XMP_N_ALIGN_GBLOCK);

    ai->shadow_type = _XMP_N_SHADOW_NORMAL;
    ai->shadow_size_lo = *lshadow;
    ai->shadow_size_hi = *ushadow;
      
    if (array->is_allocated){
      ai->local_lower += *lshadow;
      ai->local_upper += *lshadow;
      // ai->local_stride is not changed
      ai->alloc_size += *lshadow + *ushadow;
      // ai->temp0 shuld not be used in XMP/F.
      // *(ai->temp0) -= *lshadow;
      ai->temp0_v -= *lshadow;
    }

    if (!ai->reflect_sched){
      _XMP_reflect_sched_t *sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
      sched->is_periodic = -1; /* not used yet */
      sched->datatype_lo = MPI_DATATYPE_NULL;
      sched->datatype_hi = MPI_DATATYPE_NULL;
      for (int j = 0; j < 4; j++) sched->req[j] = MPI_REQUEST_NULL;
      sched->lo_send_buf = NULL;
      sched->lo_recv_buf = NULL;
      sched->hi_send_buf = NULL;
      sched->hi_recv_buf = NULL;
      ai->reflect_sched = sched;
    }

    //_XMP_create_shadow_comm(array, *i_dim);

  }
  else { // *lshadow < 0 && *ushadow < 0
    ai->shadow_type = _XMP_N_SHADOW_FULL;

    if (array->is_allocated){
      ai->shadow_size_lo = ai->par_lower - ai->ser_lower;
      ai->shadow_size_hi = ai->ser_upper - ai->par_upper;
	
      ai->local_lower = ai->par_lower - ai->ser_lower;
      ai->local_upper = ai->par_upper - ai->ser_lower;
      ai->local_stride = ai->par_stride;
      ai->alloc_size = ai->ser_size;
    }
    
    _XMP_create_shadow_comm(array, *i_dim);
  }
}
Ejemplo n.º 3
0
void xmpf_create_task_nodes__(_XMP_nodes_t **n, _XMP_object_ref_t **r_desc)
{
  _XMP_object_ref_t *rp = *r_desc;
  _XMP_ASSERT(rp->ndims <= _XMP_N_ALIGN_BLOCK);
  
  if (rp->ref_kind == XMP_OBJ_REF_NODES){
    int ref_lower[_XMP_N_MAX_DIM];
    int ref_upper[_XMP_N_MAX_DIM];
    int ref_stride[_XMP_N_MAX_DIM];
    int asterisk[_XMP_N_MAX_DIM];
    int dim_size[_XMP_N_MAX_DIM];

    for(int i=0;i<rp->ndims;i++) {
      ref_lower[i] = rp->REF_LBOUND[i];
      ref_upper[i] = rp->REF_UBOUND[i];
      ref_stride[i] = rp->REF_STRIDE[i];
      asterisk[i] = (rp->subscript_type[i] == SUBSCRIPT_ASTERISK);

      if (!asterisk[i]){
	dim_size[i] = _XMP_M_COUNT_TRIPLETi(ref_lower[i], ref_upper[i], ref_stride[i]);
      }
      else {
	dim_size[i] = 1;
      }
    }

    *n = _XMP_init_nodes_struct_NODES_NAMED(rp->ndims, rp->n_desc, asterisk,
					   ref_lower, ref_upper, ref_stride,
					   dim_size, true);
  }
  else {
    long long ref_lower[_XMP_N_MAX_DIM];
    long long ref_upper[_XMP_N_MAX_DIM];
    long long ref_stride[_XMP_N_MAX_DIM];
    int asterisk[_XMP_N_MAX_DIM];
    
    for(int i=0;i<rp->ndims;i++){
      ref_lower[i] = (long long)rp->REF_LBOUND[i];
      ref_upper[i] = (long long)rp->REF_UBOUND[i];
      ref_stride[i] = (long long)rp->REF_STRIDE[i];
      asterisk[i] = (rp->subscript_type[i] == SUBSCRIPT_ASTERISK);
    }
    
    *n = _XMP_create_nodes_by_template_ref(rp->t_desc, asterisk, ref_lower, ref_upper, ref_stride);
  }
}
Ejemplo n.º 4
0
void xmpf_loop_sched__(int *lb, int *ub, int *st, int *r_idx, _XMP_object_ref_t **r_desc)
{
  _XMP_object_ref_t *rp = *r_desc;
  _XMP_ASSERT(rp->ref_kind == XMP_OBJ_REF_TEMPL);

  //if (rp->index[*r_idx] != -1){
  if (rp->REF_INDEX[*r_idx] != -1){

    _XMP_ASSERT(*st != 0);

    _XMP_template_t *t_desc = rp->t_desc;
    int t_idx = rp->REF_INDEX[*r_idx];
    int off = rp->REF_OFFSET[*r_idx];  

    int global_ub_C = (*st > 0) ? (*ub + 1) : (*ub - 1);

    switch (t_desc->chunk[t_idx].dist_manner){

    case _XMP_N_DIST_DUPLICATION:
      _XMP_sched_loop_template_DUPLICATION(*lb + off, global_ub_C + off, *st,
					   lb, ub, st,
					   t_desc, t_idx);
      break;

    case _XMP_N_DIST_BLOCK:
      _XMP_sched_loop_template_BLOCK(*lb + off, global_ub_C + off, *st,
				     lb, ub, st,
				     t_desc, t_idx);
      break;

    case _XMP_N_DIST_CYCLIC:
      _XMP_sched_loop_template_CYCLIC(*lb + off, global_ub_C + off, *st,
				      lb, ub, st,
				      t_desc, t_idx);
      break;

    case _XMP_N_DIST_BLOCK_CYCLIC:
      _XMP_sched_loop_template_BLOCK_CYCLIC(*lb + off, global_ub_C + off, *st,
					    lb, ub, st,
					    t_desc, t_idx);
      break;

    case _XMP_N_DIST_GBLOCK:
      _XMP_sched_loop_template_GBLOCK(*lb + off, global_ub_C + off, *st,
				      lb, ub, st,
				      t_desc, t_idx);
      break;

    default:
      _XMP_fatal("xmpf_sched_loop_template: unknown chunk dist_manner");

    }

    //(*lb) -= off;
    //(*ub) -= off;

    (*ub)--; // because upper bound in Fortran is inclusive
  }
  else {
    ; /* the nest is not aligned with any dimension of the template. */
  }

  //xmpf_dbg_printf("loop = (%d : %d)\n", *lb, *ub);

  return;

}
Ejemplo n.º 5
0
void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
				  int lwidth, int uwidth, int is_periodic, int shadow_comm_type){

  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  if (ni->size == 1 && !is_periodic) return;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int count = 0, blocklength = 0;
  long long stride = 0;

  int type_size = adesc->type_size;
  void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL, *lo_recv_array = NULL;
  void *hi_send_array = NULL, *hi_recv_array = NULL;

  void *lo_send_buf = NULL;
  void *lo_recv_buf = NULL;
  void *hi_send_buf = NULL;
  void *hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  if (reflect->prev_pcopy_sched_type &&
      lwidth == reflect->lo_width &&
      uwidth == reflect->hi_width &&
      is_periodic == reflect->is_periodic){
    if ((adesc->order == MPI_ORDER_FORTRAN && target_dim != ndims - 1) ||
	(adesc->order == MPI_ORDER_C && target_dim != 0)){
      goto init_comm;
    }
    else if (reflect->prev_pcopy_sched_type != shadow_comm_type){
      count = reflect->count;
      blocklength = reflect->blocklength;
      stride = reflect->stride;
      goto alloc_buf;
    }
  }
  
  //
  // setup data_type
  //

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }

  //
  // calculate base address
  //

 alloc_buf:
  
  // for lower reflect

  if (lwidth){

    lo_send_array = array_addr;
    lo_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth;;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);

    }

  }

  // for upper reflect

  if (uwidth){

    hi_send_array = array_addr;
    hi_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);

    }

  }

  //
  // Allocate buffers
  //

  if (reflect->prev_pcopy_sched_type == _XMP_COMM_REFLECT &&
      ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
       (adesc->order == MPI_ORDER_C && target_dim == 0))){
    ;
  }
  else {
    _XMP_free(reflect->lo_send_buf);
    _XMP_free(reflect->lo_recv_buf);
    _XMP_free(reflect->hi_send_buf);
    _XMP_free(reflect->hi_recv_buf);
  }

  // for lower reflect

  if (lwidth){

    lo_buf_size = lwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
	((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
	 (adesc->order == MPI_ORDER_C && target_dim == 0))){
      lo_send_buf = lo_send_array;
      lo_recv_buf = lo_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      lo_send_buf = _XMP_alloc(lo_buf_size);
      lo_recv_buf = _XMP_alloc(lo_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

  }

  // for upper reflect

  if (uwidth){

    hi_buf_size = uwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
	((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
	 (adesc->order == MPI_ORDER_C && target_dim == 0))){
      hi_send_buf = hi_send_array;
      hi_recv_buf = hi_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      hi_send_buf = _XMP_alloc(hi_buf_size);
      hi_recv_buf = _XMP_alloc(hi_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

  }

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  reflect->lo_send_buf = lo_send_buf;
  reflect->lo_recv_buf = lo_recv_buf;
  reflect->hi_send_buf = hi_send_buf;
  reflect->hi_recv_buf = hi_recv_buf;

  //
  // initialize communication
  //

  int src, dst;

 init_comm:
  
  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  lo_buf_size = lwidth * reflect->blocklength * reflect->count;
  hi_buf_size = uwidth * reflect->blocklength * reflect->count;

  // for lower shadow

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[0]);
    }
	
    if (reflect->req_reduce[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[1]);
    }

    MPI_Send_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[0]);
    MPI_Recv_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[1]);
  }
  else {
    if (reflect->req[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[0]);
    }
	
    if (reflect->req[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[1]);
    }

    MPI_Recv_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
    MPI_Send_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);
  }
  
  // for upper shadow

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[2]);
    }
	
    if (reflect->req_reduce[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[3]);
    }

    MPI_Send_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[2]);
    MPI_Recv_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[3]);
  }
  else {
    if (reflect->req[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[2]);
    }
	
    if (reflect->req[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[3]);
    }

    MPI_Recv_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
    MPI_Send_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
		  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);
  }
  
  reflect->prev_pcopy_sched_type = shadow_comm_type;
  
  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

}
Ejemplo n.º 6
0
static void _XMP_reflect_rdma_sched_dim(_XMP_array_t *adesc, int target_dim,
					int lwidth, int uwidth, int is_periodic){

  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;

  void *lo_send_array, *lo_recv_array;
  void *hi_send_array, *hi_recv_array;

  uint64_t rdma_raddr;

  //
  // calculate offset
  //

  int count = 0, blocklength = 0;
  long long stride = 0;

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }
  
  //
  // calculate base address
  //

  // for lower reflect
    
  while ((rdma_raddr = FJMPI_Rdma_get_remote_addr(hi_rank, adesc->rdma_memid)) == FJMPI_RDMA_ERROR);

  if (lwidth){
    
    lo_send_array = (void *)adesc->rdma_addr;
    lo_recv_array = (void *)rdma_raddr;

    for (int i = 0; i < ndims; i++) {
      
      int lb_send, lb_recv, dim_acc;
      
      if (i == target_dim) {
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }
      
      dim_acc = ainfo[i].dim_acc;
      
      lo_send_array = (void *)((uint64_t)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((uint64_t)lo_recv_array + lb_recv * dim_acc * type_size);
      
    }
    
  }
  
  // for upper reflect
  
  while ((rdma_raddr = FJMPI_Rdma_get_remote_addr(lo_rank, adesc->rdma_memid)) == FJMPI_RDMA_ERROR);

  if (uwidth){
    
    hi_send_array = (void *)adesc->rdma_addr;
    hi_recv_array = (void *)rdma_raddr;

    for (int i = 0; i < ndims; i++) {
      
      int lb_send, lb_recv, dim_acc;
      
      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }
      
      dim_acc = ainfo[i].dim_acc;
	
      hi_send_array = (void *)((uint64_t)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((uint64_t)hi_recv_array + lb_recv * dim_acc * type_size);
      
    }
      
  }
  
  //
  // cache schedule
  //

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = -1;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = -1;
  }

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

}
Ejemplo n.º 7
0
static void _XMP_reflect_normal_sched_dim(_XMP_array_t *adesc, int target_dim,
					  int lwidth, int uwidth, int is_periodic){

  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  if (ni->size == 1 && !is_periodic) return;
  
  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;

  void *lo_recv_buf = adesc->array_addr_p;
  void *lo_send_buf = adesc->array_addr_p;
  void *hi_recv_buf = adesc->array_addr_p;
  void *hi_send_buf = adesc->array_addr_p;

  //
  // setup MPI_data_type
  //

  int count = 0, blocklength = 0;
  long long stride = 0;

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }

  // for lower reflect

  if (reflect->datatype_lo != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_lo);
  }

  MPI_Type_vector(count, blocklength * lwidth, stride,
		  MPI_BYTE, &reflect->datatype_lo);
  MPI_Type_commit(&reflect->datatype_lo);

  // for upper reflect

  if (reflect->datatype_hi != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_hi);
  }

  MPI_Type_vector(count, blocklength * uwidth, stride,
		  MPI_BYTE, &reflect->datatype_hi);
  MPI_Type_commit(&reflect->datatype_hi);

  //
  // calculate base address
  //

  // for lower reflect

  if (lwidth){
      
    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_buf = (void *)((char *)lo_send_buf + lb_send * dim_acc * type_size);
      lo_recv_buf = (void *)((char *)lo_recv_buf + lb_recv * dim_acc * type_size);
      
    }

  }

  // for upper reflect

  if (uwidth){

    for (int i = 0; i < ndims; i++) {

      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      }
      else {
	// Note: including shadow area
	lb_send = 0;
	lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_buf = (void *)((char *)hi_send_buf + lb_send * dim_acc * type_size);
      hi_recv_buf = (void *)((char *)hi_recv_buf + lb_recv * dim_acc * type_size);
      
    }

  }

  //
  // initialize communication
  //

  int src, dst;

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  // for lower reflect

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (reflect->req[0] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[0]);
  }
	
  if (reflect->req[1] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[1]);
  }

  MPI_Recv_init(lo_recv_buf, 1, reflect->datatype_lo, src,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
  MPI_Send_init(lo_send_buf, 1, reflect->datatype_lo, dst,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);

  // for upper reflect

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (reflect->req[2] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[2]);
  }
	
  if (reflect->req[3] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[3]);
  }

  MPI_Recv_init(hi_recv_buf, 1, reflect->datatype_hi, src,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
  MPI_Send_init(hi_send_buf, 1, reflect->datatype_hi, dst,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);
}
Ejemplo n.º 8
0
void _XMP_reflect__(_XMP_array_t *a)
{

  int is_ordinal = 1;

  //_XMP_RETURN_IF_SINGLE;
  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  if (!_xmp_set_reflect_flag){
    for (int i = 0; i < a->dim; i++){
      _XMP_array_info_t *ai = &(a->info[i]);
      _xmp_lwidth[i] = ai->shadow_size_lo;
      _xmp_uwidth[i] = ai->shadow_size_hi;
      _xmp_is_periodic[i] = 0;
    }
  }

  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

	_XMP_ASSERT(reflect);

	if (reflect->is_periodic == -1 /* not set yet */ ||
	    _xmp_lwidth[i] != reflect->lo_width ||
	    _xmp_uwidth[i] != reflect->hi_width ||
	    _xmp_is_periodic[i] != reflect->is_periodic){

	  reflect->lo_width = _xmp_lwidth[i];
	  reflect->hi_width = _xmp_uwidth[i];
	  reflect->is_periodic = _xmp_is_periodic[i];

	  _XMP_reflect_rdma_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]);

	}

      }

    }
    else { /* _XMP_N_SHADOW_FULL */
      ;
    }
    
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

  _XMP_reflect_start(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic, 0);

  _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);

  _xmp_set_reflect_flag = 0;
  for (int i = 0; i < a->dim; i++){
    _xmp_lwidth[i] = 0;
    _xmp_uwidth[i] = 0;
    _xmp_is_periodic[i] = 0;
  }

}
Ejemplo n.º 9
0
void _XMP_reflect__(_XMP_array_t *a)
{

  int is_ordinal = 1;

  //_XMP_RETURN_IF_SINGLE;
  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  if (!_xmp_set_reflect_flag){
    for (int i = 0; i < a->dim; i++){
      _XMP_array_info_t *ai = &(a->info[i]);
      _xmp_lwidth[i] = ai->shadow_size_lo;
      _xmp_uwidth[i] = ai->shadow_size_hi;
      _xmp_is_periodic[i] = 0;
    }
  }

  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

	_XMP_ASSERT(reflect);

	/* if (!reflect->reflect_is_initialized || */
	/*     _xmp_lwidth[i] != reflect->lo_width || */
	/*     _xmp_uwidth[i] != reflect->hi_width || */
	/*     _xmp_is_periodic[i] != reflect->is_periodic){ */

	/*   reflect->lo_width = _xmp_lwidth[i]; */
	/*   reflect->hi_width = _xmp_uwidth[i]; */
	/*   reflect->is_periodic = _xmp_is_periodic[i]; */

	/*   if (_xmp_reflect_pack_flag){ */
	/*     _XMP_reflect_pcopy_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i], 0); */
	/*   } */
	/*   else { */
	/*     _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]); */
	/*   } */

	/*   reflect->reflect_is_initialized = 1; */
	/* } */

	if (!reflect->reflect_is_initialized ||
	    _xmp_lwidth[i] != reflect->lo_width ||
	    _xmp_uwidth[i] != reflect->hi_width ||
	    _xmp_is_periodic[i] != reflect->is_periodic){

	  if (_xmp_reflect_pack_flag){
	    _XMP_reflect_pcopy_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i], _XMP_COMM_REFLECT);
	  }
	  else {
	    _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]);
	  }

	  reflect->reflect_is_initialized = 1;
	  reflect->lo_width = _xmp_lwidth[i];
	  reflect->hi_width = _xmp_uwidth[i];
	  reflect->is_periodic = _xmp_is_periodic[i];
	}

	if (_xmp_reflect_pack_flag && reflect->req[0] != MPI_REQUEST_NULL){
	  _XMP_TSTART(t0);
	  _XMP_reflect_pack_dim(a, i, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic, _XMP_COMM_REFLECT);
	  _XMP_TEND(xmptiming_.t_copy, t0);
	}

	_XMP_TSTART(t0);
	if (reflect->req[0] != MPI_REQUEST_NULL) // if req[0] isn't null, any others shouldn't be null.
	  MPI_Startall(4, reflect->req);
	_XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[i], t0);

	if (is_ordinal){
	  _XMP_TSTART(t0);
	  MPI_Waitall(4, reflect->req, MPI_STATUSES_IGNORE);
	  _XMP_TEND2(xmptiming_.t_wait, xmptiming_.tdim_wait[i], t0);
	  if (_xmp_reflect_pack_flag && reflect->req[0] != MPI_REQUEST_NULL){
	    _XMP_TSTART(t0);
	    _XMP_reflect_unpack_dim(a, i, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
	    _XMP_TEND(xmptiming_.t_copy, t0);
	  }
	}

      }

    }
    else { /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, i);
    }
    
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

  //  t0 = MPI_Wtime();
  if (!is_ordinal)
    _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
  //  t_wait = t_wait + (MPI_Wtime() - t0);

  _xmp_set_reflect_flag = 0;
  for (int i = 0; i < a->dim; i++){
    _xmp_lwidth[i] = 0;
    _xmp_uwidth[i] = 0;
    _xmp_is_periodic[i] = 0;
  }

}
Ejemplo n.º 10
0
static void _XMP_reflect_sched_dir(_XMP_array_t *adesc, int ishadow[],
				   int lwidth[], int uwidth[], int is_periodic_dim[]){

  int ndims = adesc->dim;

  _XMP_array_info_t *ainfo = adesc->info;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int src = my_rank;
  int dst = my_rank;

  _XMP_async_reflect_t *async_reflect = adesc->async_reflect;

  MPI_Datatype *send_dtype = &async_reflect->datatype[async_reflect->nreqs];
  MPI_Datatype *recv_dtype = send_dtype + 1;

  MPI_Request *send_req = &async_reflect->reqs[async_reflect->nreqs];
  MPI_Request *recv_req = send_req + 1;

  int width[_XMP_N_MAX_DIM] = { 0 };

  int is_periodic = 1;
  int at_tail = 0, at_head = 0;

  void *recv_buf = adesc->array_addr_p;
  void *send_buf = adesc->array_addr_p;

  //
  // setup neighbor nodes
  //

  for (int i = 0; i < ndims; i++){

    if (ishadow[i] == 0) continue;

    width[i] = ishadow[i] > 0 ? uwidth[i] : lwidth[i];
    is_periodic = is_periodic * is_periodic_dim[i];

    _XMP_array_info_t *ai = &(adesc->info[i]);

    _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
    _XMP_ASSERT(ai->is_shadow_comm_member);

    if (lwidth[i] > ai->shadow_size_lo || uwidth[i] > ai->shadow_size_hi){
      _XMP_fatal("reflect width is larger than shadow width.");
    }

    int tdim = ai->align_template_index;
    _XMP_nodes_info_t *ni = adesc->align_template->chunk[tdim].onto_nodes_info;
    
    // don't skip if no comm. is needed.
    //if (ni->size == 1 && !is_periodic_dim[i]) return;

    // 0-origin
    int my_pos = ni->rank;
    int lb_pos = _XMP_get_owner_pos(adesc, i, ai->ser_lower);
    int ub_pos = _XMP_get_owner_pos(adesc, i, ai->ser_upper);

    int src_pos;
    int dst_pos;

    if (ishadow[i] > 0){
      src_pos = my_pos + 1;
      dst_pos = my_pos - 1;
      if (my_pos == lb_pos){
	at_head = 1;
	dst_pos = ub_pos;
      }
      if (my_pos == ub_pos){
	at_tail = 1;
	src_pos = lb_pos;
      }
    }
    else { //ishadow[i] < 0
      src_pos = my_pos - 1;
      dst_pos = my_pos + 1;
      if (my_pos == lb_pos){
	at_tail = 1;
	src_pos = ub_pos;
      }
      if (my_pos == ub_pos){
	at_head = 1;
	dst_pos = lb_pos;
      }
    }

    src = src + (src_pos - my_pos) * ni->multiplier;
    dst = dst + (dst_pos - my_pos) * ni->multiplier;

  }

  src = (is_periodic || !at_tail) ? src : MPI_PROC_NULL;
  dst = (is_periodic || !at_head) ? dst : MPI_PROC_NULL;

  //
  // setup MPI_data_type
  //

  int sizes[_XMP_N_MAX_DIM];
  int subsizes[_XMP_N_MAX_DIM];
  int send_starts[_XMP_N_MAX_DIM];
  int recv_starts[_XMP_N_MAX_DIM];

  for (int i = 0; i < ndims; i++){

    sizes[i] = ainfo[i].alloc_size;
    subsizes[i] = (ishadow[i] == 0) ? ainfo[i].par_size : width[i];

    if (ishadow[i] == 0){
      // excludes shadow area
      send_starts[i] = ainfo[i].shadow_size_lo;
      recv_starts[i] = ainfo[i].shadow_size_lo;
    }
    else if (ishadow[i] > 0){
      send_starts[i] = ainfo[i].shadow_size_lo;
      recv_starts[i] = ainfo[i].local_upper + 1;
    }
    else {
      send_starts[i] = ainfo[i].local_upper - width[i] + 1;
      recv_starts[i] = ainfo[i].shadow_size_lo - width[i];
    }

  }

  MPI_Type_create_subarray(ndims, sizes, subsizes, send_starts,
			   adesc->order, adesc->mpi_type, send_dtype);
  MPI_Type_create_subarray(ndims, sizes, subsizes, recv_starts,
			   adesc->order, adesc->mpi_type, recv_dtype);

  MPI_Type_commit(send_dtype);
  MPI_Type_commit(recv_dtype);

  //
  // initialize communication
  //

  MPI_Send_init(send_buf, 1, *send_dtype, dst,
  		_XMP_N_MPI_TAG_REFLECT_LO, *comm, send_req);
  MPI_Recv_init(recv_buf, 1, *recv_dtype, src,
  		_XMP_N_MPI_TAG_REFLECT_LO, *comm, recv_req);

  async_reflect->nreqs += 2;

}
Ejemplo n.º 11
0
static void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
					 int lwidth, int uwidth, int is_periodic, void *dev_array_addr, int *lwidths, int *uwidths){
  //printf("desc=%p, tardim=%d, lw=%d, uw=%d, devp=%p\n", adesc, target_dim, lwidth, uwidth, dev_array_addr);
  
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;
  //void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL;
  void *lo_recv_array = NULL;
  void *hi_send_array = NULL;
  void *hi_recv_array = NULL;

  void *lo_send_dev_buf = NULL;
  void *lo_recv_dev_buf = NULL;
  void *hi_send_dev_buf = NULL;
  void *hi_recv_dev_buf = NULL;
  void *lo_send_host_buf = NULL;
  void *lo_recv_host_buf = NULL;
  void *hi_send_host_buf = NULL;
  void *hi_recv_host_buf = NULL;

  void *mpi_lo_send_buf = NULL;
  void *mpi_lo_recv_buf = NULL;
  void *mpi_hi_send_buf = NULL;
  void *mpi_hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  //
  // setup data_type
  //

  int count = 0, blocklength = 0;
  long long stride = 0;
  //  int count_offset = 0;

  if (_XMPF_running && !_XMPC_running){ /* for XMP/F */

    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }

  }
  else if (!_XMPF_running && _XMPC_running){ /* for XMP/C */

    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    
    /* if(target_dim > 0){ */
    /*   count *= ainfo[0].par_size; */
    /*   count_offset = ainfo[0].shadow_size_lo; */
    /* } */
    /* for (int i = 1; i < target_dim; i++){ */
    /*   count *= ainfo[i].alloc_size; */
    /* } */

    /* for (int i = ndims - 2; i >= target_dim; i--){ */
    /*   blocklength *= ainfo[i+1].alloc_size; */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    if(target_dim == 0){
      count *= 1;
      if(ndims >= 2){
	blocklength *= (ainfo[1].par_size + lwidths[1] + uwidths[1]);
      }
    }else{
      count *= (ainfo[0].par_size + lwidths[0] + uwidths[0]);
      for(int i = 1; i < target_dim; i++){
	count *= ainfo[i].alloc_size;
      }
      blocklength *= ainfo[target_dim+1].alloc_size;
      stride *= ainfo[target_dim].alloc_size;
    }
    for(int i = target_dim+2; i < ndims; i++){
      blocklength *= ainfo[i].alloc_size;
    }
    for(int i = target_dim+1 ; i < ndims - 1; i++){
      stride *= ainfo[i].alloc_size;
    }

    /* mod_4 */
    count = 1;
    blocklength = 1;
    stride = 1;

    for(int i = 0; i < ndims; i++){
      int fact = (i == target_dim)? 1 : (ainfo[i].par_size + lwidths[i] + uwidths[i]);
      int alloc_size = ainfo[i].alloc_size;

      if(blocklength == 1 || fact == alloc_size){
	blocklength *= fact;
	stride *= alloc_size;
      }else if(count == 1 && target_dim != 0){ //to be contiguous if target_dim==0
	count = blocklength;
	blocklength = fact;
	stride = alloc_size;
      }else{
	blocklength *= alloc_size;
	stride *= alloc_size;
      }
      //printf("tar=%d, i=%d, fact=%d, allocsize=%d, (%d,%d,%lld)\n", target_dim, i, fact, alloc_size, count , blocklength, stride);
    }

    blocklength *= type_size;
    stride *= type_size;
    /* mod_4 end */
    
    /* it used at 150717
    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
    */

    /* for (int i = target_dim + 1; i < ndims; i++){ */
    /*   blocklength *= ainfo[i].alloc_size; */
    /* } */
    /* for (int i = target_dim; i < ndims - 1; i++){ */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    //    printf("count =%d, blength=%d, stride=%lld\n", count ,blocklength, stride);
    //    printf("ainfo[0].par_size=%d\n", ainfo[0].par_size);
    //    printf("count_ofset=%d,\n", count_offset);
  }
  else {
    _XMP_fatal("cannot determin the base language.");
  }

  //
  // calculate base address
  //

  // for lower reflect

  if (lwidth){
    lo_send_array = lo_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	//printf("ainfo[%d].local_upper=%d\n",i,ainfo[i].local_upper);
	lb_send = ainfo[i].local_upper - lwidth + 1;
	lb_recv = ainfo[i].shadow_size_lo - lwidth; ////ainfo[i].local_lower - lwidth;
      } else {
	// Note: including shadow area
	lb_send = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
	lb_recv = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect

  if (uwidth){
    hi_send_array = hi_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
	lb_send = ainfo[i].local_lower;
	lb_recv = ainfo[i].local_upper + 1;
      } else {
	// Note: including shadow area
	lb_send = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
	lb_recv = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for lower reflect
  if (reflect->datatype_lo != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_lo);
  }
  if(packVector || count == 1){
    MPI_Type_contiguous(blocklength * lwidth * count, MPI_BYTE, &reflect->datatype_lo);
    //    MPI_Type_contiguous(blocklength * lwidth * count / type_size, MPI_FLOAT, &reflect->datatype_lo);
    fprintf(stderr, "dim=%d, send elements lo = %d\n", target_dim, blocklength * lwidth * count / type_size);
    //fprintf(stderr, "useHostBuf=%c , packVector=%c\n", useHostBuffer, packVector);
    //    if(useHostBuffer){ fprintf(stderr,"using host buffer\n"); }
    //    if(packVector){ fprintf(stderr, "using pack vector\n"); }
  }else{
    MPI_Type_vector(count, blocklength * lwidth, stride, MPI_BYTE, &reflect->datatype_lo);
  }
  MPI_Type_commit(&reflect->datatype_lo);

  // for upper reflect
  if (reflect->datatype_hi != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_hi);
  }
  if(packVector || count == 1){
    MPI_Type_contiguous(blocklength * uwidth * count, MPI_BYTE, &reflect->datatype_hi);
    //    MPI_Type_contiguous(blocklength * uwidth * count / type_size, MPI_FLOAT, &reflect->datatype_hi);
    fprintf(stderr, "dim=%d, send elements hi = %d\n", target_dim, blocklength * uwidth * count / type_size);    
  }else{
    MPI_Type_vector(count, blocklength * uwidth, stride, MPI_BYTE, &reflect->datatype_hi);
  }
  MPI_Type_commit(&reflect->datatype_hi);


  //
  // Allocate buffers
  //
  if(useHostBuffer){
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_send_host_buf));
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_recv_host_buf));
  }    

  if ((_XMPF_running && target_dim != ndims - 1) ||
      (_XMPC_running && target_dim != 0)){
    if(packVector){
      CUDA_SAFE_CALL(cudaFree(reflect->lo_send_buf));
      CUDA_SAFE_CALL(cudaFree(reflect->lo_recv_buf));
    }
  }
  if ((_XMPF_running && target_dim == ndims - 1) ||
      (_XMPC_running && target_dim == 0)){
    //
  }
  

  // for lower reflect

  if (lwidth){

    lo_buf_size = lwidth * blocklength * count;
    hi_buf_size = uwidth * blocklength * count;

    if ((_XMPF_running && target_dim == ndims - 1) ||
	(_XMPC_running && target_dim == 0)){
      lo_send_dev_buf = lo_send_array;
      lo_recv_dev_buf = lo_recv_array;
      hi_send_dev_buf = hi_send_array;
      hi_recv_dev_buf = hi_recv_array;
    } else {
      _XMP_TSTART(t0);
      if(packVector){
	CUDA_SAFE_CALL(cudaMalloc((void **)&lo_send_dev_buf, lo_buf_size + hi_buf_size));
	hi_send_dev_buf = (char*)lo_send_dev_buf + lo_buf_size;
	CUDA_SAFE_CALL(cudaMalloc((void **)&lo_recv_dev_buf, lo_buf_size + hi_buf_size));
	hi_recv_dev_buf = (char*)lo_recv_dev_buf + lo_buf_size;	
      }else{
	lo_send_dev_buf = lo_send_array;
	lo_recv_dev_buf = lo_recv_array;
	hi_send_dev_buf = hi_send_array;
	hi_recv_dev_buf = hi_recv_array;
      }
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

    if(useHostBuffer){
      CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_send_host_buf, lo_buf_size + hi_buf_size));
      hi_send_host_buf = (char*)lo_send_host_buf + lo_buf_size;
      CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_recv_host_buf, lo_buf_size + hi_buf_size));
      hi_recv_host_buf = (char*)lo_recv_host_buf + lo_buf_size;
      mpi_lo_send_buf = lo_send_host_buf;
      mpi_lo_recv_buf = lo_recv_host_buf;
      mpi_hi_send_buf = hi_send_host_buf;
      mpi_hi_recv_buf = hi_recv_host_buf;
    }else{
      mpi_lo_send_buf = lo_send_dev_buf;
      mpi_lo_recv_buf = lo_recv_dev_buf;
      mpi_hi_send_buf = hi_send_dev_buf;
      mpi_hi_recv_buf = hi_recv_dev_buf;
    }
  }

  // for upper reflect


  //
  // initialize communication
  //

  int src, dst;

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  // for lower shadow

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  } else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }
  //  fprintf(stderr, "dim=%d,  lo_src=%d, lo_dst=%d\n", target_dim, src, dst);

  if (reflect->req[0] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[0]);
  }
	
  if (reflect->req[1] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[1]);
  }

  MPI_Recv_init(mpi_lo_recv_buf, 1, reflect->datatype_lo, src,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
  MPI_Send_init(mpi_lo_send_buf, 1, reflect->datatype_lo, dst,
		_XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);

  // for upper shadow

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  } else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }
  //  fprintf(stderr, "dim=%d,  hi_src=%d, hi_dst=%d\n", target_dim, src, dst);

  if (reflect->req[2] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[2]);
  }
	
  if (reflect->req[3] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[3]);
  }

  MPI_Recv_init(mpi_hi_recv_buf, 1, reflect->datatype_hi, src,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
  MPI_Send_init(mpi_hi_send_buf, 1, reflect->datatype_hi, dst,
		_XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  if(packVector){
    reflect->lo_send_buf = lo_send_dev_buf;
    reflect->lo_recv_buf = lo_recv_dev_buf;
    reflect->hi_send_buf = hi_send_dev_buf;
    reflect->hi_recv_buf = hi_recv_dev_buf;
  }

  if(useHostBuffer){
    reflect->lo_send_host_buf = lo_send_host_buf;
    reflect->lo_recv_host_buf = lo_recv_host_buf;
    reflect->hi_send_host_buf = hi_send_host_buf;
    reflect->hi_recv_host_buf = hi_recv_host_buf;
  }

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

  // gpu async
  reflect->lo_async_id = _XMP_alloc(sizeof(cudaStream_t));
  CUDA_SAFE_CALL(cudaStreamCreate(reflect->lo_async_id));

  if(target_dim != 0 &&
     (!useHostBuffer || (lo_rank != MPI_PROC_NULL && hi_rank != MPI_PROC_NULL && (lo_buf_size / type_size) <= useSingleStreamLimit)) ){
    reflect->hi_async_id = NULL;
  }else{
    cudaStream_t *hi_stream = (cudaStream_t*)_XMP_alloc(sizeof(cudaStream_t));
    CUDA_SAFE_CALL(cudaStreamCreate(hi_stream));
    reflect->hi_async_id = (void*)hi_stream;
  }

  reflect->event = _XMP_alloc(sizeof(cudaEvent_t));
  CUDA_SAFE_CALL(cudaEventCreateWithFlags(reflect->event, cudaEventDisableTiming));
}
Ejemplo n.º 12
0
static void _XMP_reflect_sched(_XMP_array_t *a, int *lwidth, int *uwidth,
			       int *is_periodic, int is_async, void *dev_addr)
{
  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

      if(reflect == NULL){
	reflect = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
	reflect->is_periodic = -1; /* not used yet */
	reflect->datatype_lo = MPI_DATATYPE_NULL;
	reflect->datatype_hi = MPI_DATATYPE_NULL;
	for (int j = 0; j < 4; j++) reflect->req[j] = MPI_REQUEST_NULL;
	reflect->lo_send_buf = NULL;
	reflect->lo_recv_buf = NULL;
	reflect->hi_send_buf = NULL;
	reflect->hi_recv_buf = NULL;
	reflect->lo_send_host_buf = NULL;
	reflect->lo_recv_host_buf = NULL;
	reflect->hi_send_host_buf = NULL;
	reflect->hi_recv_host_buf = NULL;
	ai->reflect_acc_sched = reflect;
      }else{
	//
      }

      if (1/*lwidth[i] || uwidth[i]*/){

	_XMP_ASSERT(reflect);

	if (reflect->is_periodic == -1 /* not set yet */ ||
	    lwidth[i] != reflect->lo_width ||
	    uwidth[i] != reflect->hi_width ||
	    is_periodic[i] != reflect->is_periodic){

	  reflect->lo_width = lwidth[i];
	  reflect->hi_width = uwidth[i];
	  reflect->is_periodic = is_periodic[i];

	  if (/*_xmp_reflect_pack_flag && !is_async*/ 1){
	    _XMP_reflect_pcopy_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i], dev_addr, lwidth, uwidth);
	  }
	  else {
	    //_XMP_reflect_normal_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i]);
	  }
	}
      }

    }
    else { /* _XMP_N_SHADOW_FULL */
      ;
    }
    
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

}