void _XMP_reflect_async_cardinal(_XMP_array_t *a, int async_id)
{
  _XMP_async_comm_t *async = _XMP_get_current_async();
  MPI_Request *reqs = &async->reqs[async->nreqs];
  int nreqs = 0;

  _XMP_TSTART(t0);

  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

        _XMP_ASSERT(reflect);

        if (reflect->is_periodic == -1 /* not set yet */ ||
            _xmp_lwidth[i] != reflect->lo_width ||
            _xmp_uwidth[i] != reflect->hi_width ||
            _xmp_is_periodic[i] != reflect->is_periodic){
          reflect->lo_width = _xmp_lwidth[i];
          reflect->hi_width = _xmp_uwidth[i];
          reflect->is_periodic = _xmp_is_periodic[i];
          _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]);
        }

        if (async->nreqs + nreqs + 4 > _XMP_MAX_ASYNC_REQS){
          _XMP_fatal("too many arrays in an asynchronous reflect");
        }
        memcpy(&reqs[nreqs], reflect->req, 4 * sizeof(MPI_Request));
        nreqs += 4;

        _XMP_TSTART(t0);
        if (reflect->req[0] != MPI_REQUEST_NULL) // if req[0] isn't null, the others shouldn't be null either.
          MPI_Startall(4, reflect->req);
        _XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[i], t0);
      }
    }
    else { /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, i);
    }
  }

  _XMP_TEND(xmptiming_.t_sched, t0);

  async->nreqs += nreqs;
}
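/*
 * Illustrative sketch (an assumption, not code from this runtime): the
 * persistent requests started above are only accumulated in the async
 * descriptor, so the matching wait phase is expected to complete them
 * roughly as follows before the shadow cells are read:
 *
 *   _XMP_async_comm_t *async = _XMP_get_current_async();
 *   MPI_Waitall(async->nreqs, async->reqs, MPI_STATUSES_IGNORE);
 *   async->nreqs = 0;   // the cached per-dimension schedules remain reusable
 */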
void xmpf_array_init_shadow__(_XMP_array_t **a_desc, int *i_dim,
                              int *lshadow, int *ushadow)
{
  _XMP_array_t *array = *a_desc;
  _XMP_array_info_t *ai = &(array->info[*i_dim]);

  if (*lshadow == 0 && *ushadow == 0){
    ai->shadow_type = _XMP_N_SHADOW_NONE;
  }
  else if (*lshadow > 0 || *ushadow > 0){

    _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK ||
                ai->align_manner == _XMP_N_ALIGN_GBLOCK);

    ai->shadow_type = _XMP_N_SHADOW_NORMAL;
    ai->shadow_size_lo = *lshadow;
    ai->shadow_size_hi = *ushadow;

    if (array->is_allocated){
      ai->local_lower += *lshadow;
      ai->local_upper += *lshadow;
      // ai->local_stride is not changed
      ai->alloc_size += *lshadow + *ushadow;

      // ai->temp0 should not be used in XMP/F.
      // *(ai->temp0) -= *lshadow;
      ai->temp0_v -= *lshadow;
    }

    if (!ai->reflect_sched){
      _XMP_reflect_sched_t *sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
      sched->is_periodic = -1; /* not used yet */
      sched->datatype_lo = MPI_DATATYPE_NULL;
      sched->datatype_hi = MPI_DATATYPE_NULL;
      for (int j = 0; j < 4; j++) sched->req[j] = MPI_REQUEST_NULL;
      sched->lo_send_buf = NULL;
      sched->lo_recv_buf = NULL;
      sched->hi_send_buf = NULL;
      sched->hi_recv_buf = NULL;
      ai->reflect_sched = sched;
    }

    //_XMP_create_shadow_comm(array, *i_dim);
  }
  else { // *lshadow < 0 && *ushadow < 0
    ai->shadow_type = _XMP_N_SHADOW_FULL;

    if (array->is_allocated){
      ai->shadow_size_lo = ai->par_lower - ai->ser_lower;
      ai->shadow_size_hi = ai->ser_upper - ai->par_upper;
      ai->local_lower = ai->par_lower - ai->ser_lower;
      ai->local_upper = ai->par_upper - ai->ser_lower;
      ai->local_stride = ai->par_stride;
      ai->alloc_size = ai->ser_size;
    }

    _XMP_create_shadow_comm(array, *i_dim);
  }
}
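/*
 * Worked example (illustration only): for a dimension declared with
 * lshadow = 1 and ushadow = 2, local_lower and local_upper both shift up
 * by 1 so that they keep pointing at the same owned elements inside the
 * enlarged local buffer, alloc_size grows by 1 + 2 = 3, and temp0_v is
 * decremented by 1, which appears to compensate the index-translation
 * offset for the new lower shadow cells.
 */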
void xmpf_create_task_nodes__(_XMP_nodes_t **n, _XMP_object_ref_t **r_desc)
{
  _XMP_object_ref_t *rp = *r_desc;

  _XMP_ASSERT(rp->ndims <= _XMP_N_ALIGN_BLOCK);

  if (rp->ref_kind == XMP_OBJ_REF_NODES){

    int ref_lower[_XMP_N_MAX_DIM];
    int ref_upper[_XMP_N_MAX_DIM];
    int ref_stride[_XMP_N_MAX_DIM];
    int asterisk[_XMP_N_MAX_DIM];
    int dim_size[_XMP_N_MAX_DIM];

    for (int i = 0; i < rp->ndims; i++){
      ref_lower[i] = rp->REF_LBOUND[i];
      ref_upper[i] = rp->REF_UBOUND[i];
      ref_stride[i] = rp->REF_STRIDE[i];
      asterisk[i] = (rp->subscript_type[i] == SUBSCRIPT_ASTERISK);
      if (!asterisk[i]){
        dim_size[i] = _XMP_M_COUNT_TRIPLETi(ref_lower[i], ref_upper[i], ref_stride[i]);
      }
      else {
        dim_size[i] = 1;
      }
    }

    *n = _XMP_init_nodes_struct_NODES_NAMED(rp->ndims, rp->n_desc, asterisk,
                                            ref_lower, ref_upper, ref_stride,
                                            dim_size, true);
  }
  else {

    long long ref_lower[_XMP_N_MAX_DIM];
    long long ref_upper[_XMP_N_MAX_DIM];
    long long ref_stride[_XMP_N_MAX_DIM];
    int asterisk[_XMP_N_MAX_DIM];

    for (int i = 0; i < rp->ndims; i++){
      ref_lower[i] = (long long)rp->REF_LBOUND[i];
      ref_upper[i] = (long long)rp->REF_UBOUND[i];
      ref_stride[i] = (long long)rp->REF_STRIDE[i];
      asterisk[i] = (rp->subscript_type[i] == SUBSCRIPT_ASTERISK);
    }

    *n = _XMP_create_nodes_by_template_ref(rp->t_desc, asterisk,
                                           ref_lower, ref_upper, ref_stride);
  }
}
void xmpf_loop_sched__(int *lb, int *ub, int *st, int *r_idx, _XMP_object_ref_t **r_desc)
{
  _XMP_object_ref_t *rp = *r_desc;

  _XMP_ASSERT(rp->ref_kind == XMP_OBJ_REF_TEMPL);

  //if (rp->index[*r_idx] != -1){
  if (rp->REF_INDEX[*r_idx] != -1){

    _XMP_ASSERT(*st != 0);

    _XMP_template_t *t_desc = rp->t_desc;
    int t_idx = rp->REF_INDEX[*r_idx];
    int off = rp->REF_OFFSET[*r_idx];

    int global_ub_C = (*st > 0) ? (*ub + 1) : (*ub - 1);

    switch (t_desc->chunk[t_idx].dist_manner){
    case _XMP_N_DIST_DUPLICATION:
      _XMP_sched_loop_template_DUPLICATION(*lb + off, global_ub_C + off, *st,
                                           lb, ub, st, t_desc, t_idx);
      break;
    case _XMP_N_DIST_BLOCK:
      _XMP_sched_loop_template_BLOCK(*lb + off, global_ub_C + off, *st,
                                     lb, ub, st, t_desc, t_idx);
      break;
    case _XMP_N_DIST_CYCLIC:
      _XMP_sched_loop_template_CYCLIC(*lb + off, global_ub_C + off, *st,
                                      lb, ub, st, t_desc, t_idx);
      break;
    case _XMP_N_DIST_BLOCK_CYCLIC:
      _XMP_sched_loop_template_BLOCK_CYCLIC(*lb + off, global_ub_C + off, *st,
                                            lb, ub, st, t_desc, t_idx);
      break;
    case _XMP_N_DIST_GBLOCK:
      _XMP_sched_loop_template_GBLOCK(*lb + off, global_ub_C + off, *st,
                                      lb, ub, st, t_desc, t_idx);
      break;
    default:
      _XMP_fatal("xmpf_sched_loop_template: unknown chunk dist_manner");
    }

    //(*lb) -= off;
    //(*ub) -= off;

    (*ub)--; // because the upper bound in Fortran is inclusive
  }
  else {
    ; /* the nest is not aligned with any dimension of the template. */
  }

  //xmpf_dbg_printf("loop = (%d : %d)\n", *lb, *ub);

  return;
}
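/*
 * Worked example (illustration only): a Fortran loop DO i = 1, 8 aligned to
 * a template dimension arrives here as *lb = 1, *ub = 8, *st = 1.  Because
 * the stride is positive, global_ub_C becomes 9, i.e. an exclusive C-style
 * upper bound for the _XMP_sched_loop_template_* helpers, and the final
 * (*ub)-- turns the scheduled local bound back into the inclusive form that
 * the generated Fortran DO statement expects.
 */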
void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
                                  int lwidth, int uwidth, int is_periodic, int shadow_comm_type)
{
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  if (ni->size == 1 && !is_periodic) return;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);
  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int count = 0, blocklength = 0;
  long long stride = 0;

  int type_size = adesc->type_size;
  void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL, *lo_recv_array = NULL;
  void *hi_send_array = NULL, *hi_recv_array = NULL;

  void *lo_send_buf = NULL;
  void *lo_recv_buf = NULL;
  void *hi_send_buf = NULL;
  void *hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  if (reflect->prev_pcopy_sched_type &&
      lwidth == reflect->lo_width &&
      uwidth == reflect->hi_width &&
      is_periodic == reflect->is_periodic){
    if ((adesc->order == MPI_ORDER_FORTRAN && target_dim != ndims - 1) ||
        (adesc->order == MPI_ORDER_C && target_dim != 0)){
      goto init_comm;
    }
    else if (reflect->prev_pcopy_sched_type != shadow_comm_type){
      count = reflect->count;
      blocklength = reflect->blocklength;
      stride = reflect->stride;
      goto alloc_buf;
    }
  }

  //
  // setup data_type
  //

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */
    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */
    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else {
    _XMP_fatal("cannot determine the base language.");
  }

  //
  // calculate base address
  //

 alloc_buf:

  // for lower reflect
  if (lwidth){
    lo_send_array = array_addr;
    lo_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_upper - lwidth + 1;
        lb_recv = ainfo[i].shadow_size_lo - lwidth;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect
  if (uwidth){
    hi_send_array = array_addr;
    hi_recv_array = array_addr;

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_lower;
        lb_recv = ainfo[i].local_upper + 1;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  //
  // Allocate buffers
  //

  if (reflect->prev_pcopy_sched_type == _XMP_COMM_REFLECT &&
      ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
       (adesc->order == MPI_ORDER_C && target_dim == 0))){
    ;
  }
  else {
    _XMP_free(reflect->lo_send_buf);
    _XMP_free(reflect->lo_recv_buf);
    _XMP_free(reflect->hi_send_buf);
    _XMP_free(reflect->hi_recv_buf);
  }

  // for lower reflect
  if (lwidth){
    lo_buf_size = lwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
        ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
         (adesc->order == MPI_ORDER_C && target_dim == 0))){
      lo_send_buf = lo_send_array;
      lo_recv_buf = lo_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      lo_send_buf = _XMP_alloc(lo_buf_size);
      lo_recv_buf = _XMP_alloc(lo_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }
  }

  // for upper reflect
  if (uwidth){
    hi_buf_size = uwidth * blocklength * count;

    if (shadow_comm_type == _XMP_COMM_REFLECT &&
        ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) ||
         (adesc->order == MPI_ORDER_C && target_dim == 0))){
      hi_send_buf = hi_send_array;
      hi_recv_buf = hi_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      hi_send_buf = _XMP_alloc(hi_buf_size);
      hi_recv_buf = _XMP_alloc(hi_buf_size);
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }
  }

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  reflect->lo_send_buf = lo_send_buf;
  reflect->lo_recv_buf = lo_recv_buf;
  reflect->hi_send_buf = hi_send_buf;
  reflect->hi_recv_buf = hi_recv_buf;

  //
  // initialize communication
  //

  int src, dst;

 init_comm:

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  lo_buf_size = lwidth * reflect->blocklength * reflect->count;
  hi_buf_size = uwidth * reflect->blocklength * reflect->count;

  // for lower shadow

  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[0]);
    }
    if (reflect->req_reduce[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[1]);
    }

    MPI_Send_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
                  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[0]);
    MPI_Recv_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
                  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[1]);
  }
  else {
    if (reflect->req[0] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[0]);
    }
    if (reflect->req[1] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[1]);
    }

    MPI_Recv_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src,
                  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
    MPI_Send_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst,
                  _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);
  }

  // for upper shadow

  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){
    if (reflect->req_reduce[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[2]);
    }
    if (reflect->req_reduce[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req_reduce[3]);
    }

    MPI_Send_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
                  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[2]);
    MPI_Recv_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
                  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[3]);
  }
  else {
    if (reflect->req[2] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[2]);
    }
    if (reflect->req[3] != MPI_REQUEST_NULL){
      MPI_Request_free(&reflect->req[3]);
    }

    MPI_Recv_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src,
                  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
    MPI_Send_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst,
                  _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);
  }

  reflect->prev_pcopy_sched_type = shadow_comm_type;

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;
}
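/*
 * Illustrative sketch (an assumption, not code from this file): the packed
 * buffers created above are contiguous images of a strided region described
 * by (count, blocklength, stride), so a pack step equivalent to the
 * following is presumably performed before the persistent sends are started:
 *
 *   for (int c = 0; c < reflect->count; c++)
 *     memcpy((char *)reflect->lo_send_buf + c * lwidth * reflect->blocklength,
 *            (char *)reflect->lo_send_array + c * reflect->stride,
 *            lwidth * reflect->blocklength);
 */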
static void _XMP_reflect_rdma_sched_dim(_XMP_array_t *adesc, int target_dim,
                                        int lwidth, int uwidth, int is_periodic)
{
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);

  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;

  void *lo_send_array, *lo_recv_array;
  void *hi_send_array, *hi_recv_array;

  uint64_t rdma_raddr;

  //
  // calculate offset
  //

  int count = 0, blocklength = 0;
  long long stride = 0;

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */
    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */
    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else {
    _XMP_fatal("cannot determine the base language.");
  }

  //
  // calculate base address
  //

  // for lower reflect
  while ((rdma_raddr = FJMPI_Rdma_get_remote_addr(hi_rank, adesc->rdma_memid)) == FJMPI_RDMA_ERROR);

  if (lwidth){
    lo_send_array = (void *)adesc->rdma_addr;
    lo_recv_array = (void *)rdma_raddr;

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_upper - lwidth + 1;
        lb_recv = ainfo[i].shadow_size_lo - lwidth;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((uint64_t)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((uint64_t)lo_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect
  while ((rdma_raddr = FJMPI_Rdma_get_remote_addr(lo_rank, adesc->rdma_memid)) == FJMPI_RDMA_ERROR);

  if (uwidth){
    hi_send_array = (void *)adesc->rdma_addr;
    hi_recv_array = (void *)rdma_raddr;

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_lower;
        lb_recv = ainfo[i].local_upper + 1;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((uint64_t)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((uint64_t)hi_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  //
  // cache schedule
  //

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = -1;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = -1;
  }

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;
}
static void _XMP_reflect_normal_sched_dim(_XMP_array_t *adesc, int target_dim,
                                          int lwidth, int uwidth, int is_periodic)
{
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  if (ni->size == 1 && !is_periodic) return;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);
  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;

  void *lo_recv_buf = adesc->array_addr_p;
  void *lo_send_buf = adesc->array_addr_p;
  void *hi_recv_buf = adesc->array_addr_p;
  void *hi_send_buf = adesc->array_addr_p;

  //
  // setup MPI_data_type
  //

  int count = 0, blocklength = 0;
  long long stride = 0;

  if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */
    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */
    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }

    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else {
    _XMP_fatal("cannot determine the base language.");
  }

  // for lower reflect
  if (reflect->datatype_lo != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_lo);
  }

  MPI_Type_vector(count, blocklength * lwidth, stride, MPI_BYTE, &reflect->datatype_lo);
  MPI_Type_commit(&reflect->datatype_lo);

  // for upper reflect
  if (reflect->datatype_hi != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_hi);
  }

  MPI_Type_vector(count, blocklength * uwidth, stride, MPI_BYTE, &reflect->datatype_hi);
  MPI_Type_commit(&reflect->datatype_hi);

  //
  // calculate base address
  //

  // for lower reflect
  if (lwidth){
    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_upper - lwidth + 1;
        lb_recv = ainfo[i].shadow_size_lo - lwidth;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_buf = (void *)((char *)lo_send_buf + lb_send * dim_acc * type_size);
      lo_recv_buf = (void *)((char *)lo_recv_buf + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect
  if (uwidth){
    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv, dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_lower;
        lb_recv = ainfo[i].local_upper + 1;
      }
      else {
        // Note: including shadow area
        lb_send = 0;
        lb_recv = 0;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_buf = (void *)((char *)hi_send_buf + lb_send * dim_acc * type_size);
      hi_recv_buf = (void *)((char *)hi_recv_buf + lb_recv * dim_acc * type_size);
    }
  }

  //
  // initialize communication
  //

  int src, dst;

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  // for lower reflect
  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (reflect->req[0] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[0]);
  }
  if (reflect->req[1] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[1]);
  }

  MPI_Recv_init(lo_recv_buf, 1, reflect->datatype_lo, src,
                _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
  MPI_Send_init(lo_send_buf, 1, reflect->datatype_lo, dst,
                _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);

  // for upper reflect
  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  if (reflect->req[2] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[2]);
  }
  if (reflect->req[3] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[3]);
  }

  MPI_Recv_init(hi_recv_buf, 1, reflect->datatype_hi, src,
                _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
  MPI_Send_init(hi_send_buf, 1, reflect->datatype_hi, dst,
                _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);
}
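/*
 * Worked example (illustration only): for a 2-dimensional XMP/F array whose
 * local buffer holds alloc_size = {nx, ny} elements and which is
 * block-distributed along dimension 0 (target_dim = 0), the loops above give
 * count = ny, blocklength = type_size and stride = nx * type_size, so
 *
 *   MPI_Type_vector(ny, type_size * lwidth, nx * type_size, MPI_BYTE, &dt);
 *
 * selects one contiguous strip of lwidth boundary elements from each of the
 * ny columns of the local buffer.
 */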
void _XMP_reflect__(_XMP_array_t *a)
{
  int is_ordinal = 1;

  //_XMP_RETURN_IF_SINGLE;
  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  if (!_xmp_set_reflect_flag){
    for (int i = 0; i < a->dim; i++){
      _XMP_array_info_t *ai = &(a->info[i]);
      _xmp_lwidth[i] = ai->shadow_size_lo;
      _xmp_uwidth[i] = ai->shadow_size_hi;
      _xmp_is_periodic[i] = 0;
    }
  }

  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

        _XMP_ASSERT(reflect);

        if (reflect->is_periodic == -1 /* not set yet */ ||
            _xmp_lwidth[i] != reflect->lo_width ||
            _xmp_uwidth[i] != reflect->hi_width ||
            _xmp_is_periodic[i] != reflect->is_periodic){
          reflect->lo_width = _xmp_lwidth[i];
          reflect->hi_width = _xmp_uwidth[i];
          reflect->is_periodic = _xmp_is_periodic[i];
          _XMP_reflect_rdma_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]);
        }
      }
    }
    else { /* _XMP_N_SHADOW_FULL */
      ;
    }
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

  _XMP_reflect_start(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic, 0);
  _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);

  _xmp_set_reflect_flag = 0;
  for (int i = 0; i < a->dim; i++){
    _xmp_lwidth[i] = 0;
    _xmp_uwidth[i] = 0;
    _xmp_is_periodic[i] = 0;
  }
}
void _XMP_reflect__(_XMP_array_t *a)
{
  int is_ordinal = 1;

  //_XMP_RETURN_IF_SINGLE;
  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  if (!_xmp_set_reflect_flag){
    for (int i = 0; i < a->dim; i++){
      _XMP_array_info_t *ai = &(a->info[i]);
      _xmp_lwidth[i] = ai->shadow_size_lo;
      _xmp_uwidth[i] = ai->shadow_size_hi;
      _xmp_is_periodic[i] = 0;
    }
  }

  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_sched;

      if (_xmp_lwidth[i] || _xmp_uwidth[i]){

        _XMP_ASSERT(reflect);

        /* if (!reflect->reflect_is_initialized || */
        /*     _xmp_lwidth[i] != reflect->lo_width || */
        /*     _xmp_uwidth[i] != reflect->hi_width || */
        /*     _xmp_is_periodic[i] != reflect->is_periodic){ */
        /*   reflect->lo_width = _xmp_lwidth[i]; */
        /*   reflect->hi_width = _xmp_uwidth[i]; */
        /*   reflect->is_periodic = _xmp_is_periodic[i]; */
        /*   if (_xmp_reflect_pack_flag){ */
        /*     _XMP_reflect_pcopy_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i], 0); */
        /*   } */
        /*   else { */
        /*     _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i], _xmp_is_periodic[i]); */
        /*   } */
        /*   reflect->reflect_is_initialized = 1; */
        /* } */

        if (!reflect->reflect_is_initialized ||
            _xmp_lwidth[i] != reflect->lo_width ||
            _xmp_uwidth[i] != reflect->hi_width ||
            _xmp_is_periodic[i] != reflect->is_periodic){

          if (_xmp_reflect_pack_flag){
            _XMP_reflect_pcopy_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i],
                                         _xmp_is_periodic[i], _XMP_COMM_REFLECT);
          }
          else {
            _XMP_reflect_normal_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i],
                                          _xmp_is_periodic[i]);
          }

          reflect->reflect_is_initialized = 1;
          reflect->lo_width = _xmp_lwidth[i];
          reflect->hi_width = _xmp_uwidth[i];
          reflect->is_periodic = _xmp_is_periodic[i];
        }

        if (_xmp_reflect_pack_flag && reflect->req[0] != MPI_REQUEST_NULL){
          _XMP_TSTART(t0);
          _XMP_reflect_pack_dim(a, i, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic, _XMP_COMM_REFLECT);
          _XMP_TEND(xmptiming_.t_copy, t0);
        }

        _XMP_TSTART(t0);
        if (reflect->req[0] != MPI_REQUEST_NULL) // if req[0] isn't null, the others shouldn't be null either.
          MPI_Startall(4, reflect->req);
        _XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[i], t0);

        if (is_ordinal){
          _XMP_TSTART(t0);
          MPI_Waitall(4, reflect->req, MPI_STATUSES_IGNORE);
          _XMP_TEND2(xmptiming_.t_wait, xmptiming_.tdim_wait[i], t0);

          if (_xmp_reflect_pack_flag && reflect->req[0] != MPI_REQUEST_NULL){
            _XMP_TSTART(t0);
            _XMP_reflect_unpack_dim(a, i, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
            _XMP_TEND(xmptiming_.t_copy, t0);
          }
        }
      }
    }
    else { /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, i);
    }
  }
  _XMP_TEND(xmptiming_.t_sched, t0);

  // t0 = MPI_Wtime();
  if (!is_ordinal)
    _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
  // t_wait = t_wait + (MPI_Wtime() - t0);

  _xmp_set_reflect_flag = 0;
  for (int i = 0; i < a->dim; i++){
    _xmp_lwidth[i] = 0;
    _xmp_uwidth[i] = 0;
    _xmp_is_periodic[i] = 0;
  }
}
static void _XMP_reflect_sched_dir(_XMP_array_t *adesc, int ishadow[],
                                   int lwidth[], int uwidth[], int is_periodic_dim[])
{
  int ndims = adesc->dim;
  _XMP_array_info_t *ainfo = adesc->info;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int src = my_rank;
  int dst = my_rank;

  _XMP_async_reflect_t *async_reflect = adesc->async_reflect;

  MPI_Datatype *send_dtype = &async_reflect->datatype[async_reflect->nreqs];
  MPI_Datatype *recv_dtype = send_dtype + 1;

  MPI_Request *send_req = &async_reflect->reqs[async_reflect->nreqs];
  MPI_Request *recv_req = send_req + 1;

  int width[_XMP_N_MAX_DIM] = { 0 };

  int is_periodic = 1;
  int at_tail = 0, at_head = 0;

  void *recv_buf = adesc->array_addr_p;
  void *send_buf = adesc->array_addr_p;

  //
  // setup neighbor nodes
  //

  for (int i = 0; i < ndims; i++){

    if (ishadow[i] == 0) continue;

    width[i] = ishadow[i] > 0 ? uwidth[i] : lwidth[i];
    is_periodic = is_periodic * is_periodic_dim[i];

    _XMP_array_info_t *ai = &(adesc->info[i]);
    _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
    _XMP_ASSERT(ai->is_shadow_comm_member);

    if (lwidth[i] > ai->shadow_size_lo || uwidth[i] > ai->shadow_size_hi){
      _XMP_fatal("reflect width is larger than shadow width.");
    }

    int tdim = ai->align_template_index;
    _XMP_nodes_info_t *ni = adesc->align_template->chunk[tdim].onto_nodes_info;

    // don't skip even if no comm. is needed.
    //if (ni->size == 1 && !is_periodic_dim[i]) return;

    // 0-origin
    int my_pos = ni->rank;
    int lb_pos = _XMP_get_owner_pos(adesc, i, ai->ser_lower);
    int ub_pos = _XMP_get_owner_pos(adesc, i, ai->ser_upper);

    int src_pos;
    int dst_pos;

    if (ishadow[i] > 0){
      src_pos = my_pos + 1;
      dst_pos = my_pos - 1;
      if (my_pos == lb_pos){
        at_head = 1;
        dst_pos = ub_pos;
      }
      if (my_pos == ub_pos){
        at_tail = 1;
        src_pos = lb_pos;
      }
    }
    else { // ishadow[i] < 0
      src_pos = my_pos - 1;
      dst_pos = my_pos + 1;
      if (my_pos == lb_pos){
        at_tail = 1;
        src_pos = ub_pos;
      }
      if (my_pos == ub_pos){
        at_head = 1;
        dst_pos = lb_pos;
      }
    }

    src = src + (src_pos - my_pos) * ni->multiplier;
    dst = dst + (dst_pos - my_pos) * ni->multiplier;
  }

  src = (is_periodic || !at_tail) ? src : MPI_PROC_NULL;
  dst = (is_periodic || !at_head) ? dst : MPI_PROC_NULL;

  //
  // setup MPI_data_type
  //

  int sizes[_XMP_N_MAX_DIM];
  int subsizes[_XMP_N_MAX_DIM];
  int send_starts[_XMP_N_MAX_DIM];
  int recv_starts[_XMP_N_MAX_DIM];

  for (int i = 0; i < ndims; i++){
    sizes[i] = ainfo[i].alloc_size;
    subsizes[i] = (ishadow[i] == 0) ? ainfo[i].par_size : width[i];

    if (ishadow[i] == 0){ // excludes shadow area
      send_starts[i] = ainfo[i].shadow_size_lo;
      recv_starts[i] = ainfo[i].shadow_size_lo;
    }
    else if (ishadow[i] > 0){
      send_starts[i] = ainfo[i].shadow_size_lo;
      recv_starts[i] = ainfo[i].local_upper + 1;
    }
    else {
      send_starts[i] = ainfo[i].local_upper - width[i] + 1;
      recv_starts[i] = ainfo[i].shadow_size_lo - width[i];
    }
  }

  MPI_Type_create_subarray(ndims, sizes, subsizes, send_starts,
                           adesc->order, adesc->mpi_type, send_dtype);
  MPI_Type_create_subarray(ndims, sizes, subsizes, recv_starts,
                           adesc->order, adesc->mpi_type, recv_dtype);

  MPI_Type_commit(send_dtype);
  MPI_Type_commit(recv_dtype);

  //
  // initialize communication
  //

  MPI_Send_init(send_buf, 1, *send_dtype, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, send_req);
  MPI_Recv_init(recv_buf, 1, *recv_dtype, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, recv_req);

  async_reflect->nreqs += 2;
}
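/*
 * Worked example (illustration only): for a 2-dimensional block-distributed
 * array with one-element shadows and ishadow = {1, 1} (an upper/upper
 * diagonal direction), the loop above yields, in each dimension,
 * subsizes[i] = 1, send_starts[i] = shadow_size_lo (the first owned element)
 * and recv_starts[i] = local_upper + 1 (the upper shadow cell); the node thus
 * sends its lower owned corner cell towards its lower-diagonal neighbour and
 * receives its upper-corner shadow cell from its upper-diagonal neighbour.
 */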
static void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim,
                                         int lwidth, int uwidth, int is_periodic,
                                         void *dev_array_addr, int *lwidths, int *uwidths)
{
  //printf("desc=%p, tardim=%d, lw=%d, uw=%d, devp=%p\n", adesc, target_dim, lwidth, uwidth, dev_array_addr);
  if (lwidth == 0 && uwidth == 0) return;

  _XMP_array_info_t *ai = &(adesc->info[target_dim]);
  _XMP_array_info_t *ainfo = adesc->info;
  _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK);
  _XMP_ASSERT(ai->is_shadow_comm_member);

  if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){
    _XMP_fatal("reflect width is larger than shadow width.");
  }

  _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

  int target_tdim = ai->align_template_index;
  _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info;

  int ndims = adesc->dim;

  // 0-origin
  int my_pos = ni->rank;
  int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower);
  int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper);
  int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1;
  int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1;

  MPI_Comm *comm = adesc->align_template->onto_nodes->comm;
  int my_rank = adesc->align_template->onto_nodes->comm_rank;

  int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier;
  int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier;

  int type_size = adesc->type_size;
  //void *array_addr = adesc->array_addr_p;

  void *lo_send_array = NULL;
  void *lo_recv_array = NULL;
  void *hi_send_array = NULL;
  void *hi_recv_array = NULL;

  void *lo_send_dev_buf = NULL;
  void *lo_recv_dev_buf = NULL;
  void *hi_send_dev_buf = NULL;
  void *hi_recv_dev_buf = NULL;

  void *lo_send_host_buf = NULL;
  void *lo_recv_host_buf = NULL;
  void *hi_send_host_buf = NULL;
  void *hi_recv_host_buf = NULL;

  void *mpi_lo_send_buf = NULL;
  void *mpi_lo_recv_buf = NULL;
  void *mpi_hi_send_buf = NULL;
  void *mpi_hi_recv_buf = NULL;

  int lo_buf_size = 0;
  int hi_buf_size = 0;

  //
  // setup data_type
  //

  int count = 0, blocklength = 0;
  long long stride = 0;
  // int count_offset = 0;

  if (_XMPF_running && !_XMPC_running){ /* for XMP/F */
    count = 1;
    blocklength = type_size;
    stride = ainfo[0].alloc_size * type_size;

    for (int i = ndims - 2; i >= target_dim; i--){
      count *= ainfo[i+1].alloc_size;
    }

    for (int i = 1; i <= target_dim; i++){
      blocklength *= ainfo[i-1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
  }
  else if (!_XMPF_running && _XMPC_running){ /* for XMP/C */
    count = 1;
    blocklength = type_size;
    stride = ainfo[ndims-1].alloc_size * type_size;

    /* if(target_dim > 0){ */
    /*   count *= ainfo[0].par_size; */
    /*   count_offset = ainfo[0].shadow_size_lo; */
    /* } */
    /* for (int i = 1; i < target_dim; i++){ */
    /*   count *= ainfo[i].alloc_size; */
    /* } */
    /* for (int i = ndims - 2; i >= target_dim; i--){ */
    /*   blocklength *= ainfo[i+1].alloc_size; */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    if (target_dim == 0){
      count *= 1;
      if (ndims >= 2){
        blocklength *= (ainfo[1].par_size + lwidths[1] + uwidths[1]);
      }
    }
    else {
      count *= (ainfo[0].par_size + lwidths[0] + uwidths[0]);
      for (int i = 1; i < target_dim; i++){
        count *= ainfo[i].alloc_size;
      }
      blocklength *= ainfo[target_dim+1].alloc_size;
      stride *= ainfo[target_dim].alloc_size;
    }
    for (int i = target_dim + 2; i < ndims; i++){
      blocklength *= ainfo[i].alloc_size;
    }
    for (int i = target_dim + 1; i < ndims - 1; i++){
      stride *= ainfo[i].alloc_size;
    }

    /* mod_4 */
    count = 1;
    blocklength = 1;
    stride = 1;
    for (int i = 0; i < ndims; i++){
      int fact = (i == target_dim) ? 1 : (ainfo[i].par_size + lwidths[i] + uwidths[i]);
      int alloc_size = ainfo[i].alloc_size;
      if (blocklength == 1 || fact == alloc_size){
        blocklength *= fact;
        stride *= alloc_size;
      }
      else if (count == 1 && target_dim != 0){ // to be contiguous if target_dim == 0
        count = blocklength;
        blocklength = fact;
        stride = alloc_size;
      }
      else {
        blocklength *= alloc_size;
        stride *= alloc_size;
      }
      //printf("tar=%d, i=%d, fact=%d, allocsize=%d, (%d,%d,%lld)\n", target_dim, i, fact, alloc_size, count, blocklength, stride);
    }
    blocklength *= type_size;
    stride *= type_size;
    /* mod_4 end */

    /* it used at 150717
    for (int i = 1; i <= target_dim; i++){
      count *= ainfo[i-1].alloc_size;
    }
    for (int i = ndims - 2; i >= target_dim; i--){
      blocklength *= ainfo[i+1].alloc_size;
      stride *= ainfo[i].alloc_size;
    }
    */

    /* for (int i = target_dim + 1; i < ndims; i++){ */
    /*   blocklength *= ainfo[i].alloc_size; */
    /* } */
    /* for (int i = target_dim; i < ndims - 1; i++){ */
    /*   stride *= ainfo[i].alloc_size; */
    /* } */

    // printf("count =%d, blength=%d, stride=%lld\n", count, blocklength, stride);
    // printf("ainfo[0].par_size=%d\n", ainfo[0].par_size);
    // printf("count_ofset=%d,\n", count_offset);
  }
  else {
    _XMP_fatal("cannot determine the base language.");
  }

  //
  // calculate base address
  //

  // for lower reflect
  if (lwidth){
    lo_send_array = lo_recv_array = (void *)((char *)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
        //printf("ainfo[%d].local_upper=%d\n", i, ainfo[i].local_upper);
        lb_send = ainfo[i].local_upper - lwidth + 1;
        lb_recv = ainfo[i].shadow_size_lo - lwidth; ////ainfo[i].local_lower - lwidth;
      }
      else {
        // Note: including shadow area
        lb_send = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
        lb_recv = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size);
      lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for upper reflect
  if (uwidth){
    hi_send_array = hi_recv_array = (void *)((char *)dev_array_addr + /*count_offset*/0 * stride);

    for (int i = 0; i < ndims; i++) {
      int lb_send, lb_recv;
      unsigned long long dim_acc;

      if (i == target_dim) {
        lb_send = ainfo[i].local_lower;
        lb_recv = ainfo[i].local_upper + 1;
      }
      else {
        // Note: including shadow area
        lb_send = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
        lb_recv = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo;
      }

      dim_acc = ainfo[i].dim_acc;

      hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size);
      hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size);
    }
  }

  // for lower reflect
  if (reflect->datatype_lo != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_lo);
  }

  if (packVector || count == 1){
    MPI_Type_contiguous(blocklength * lwidth * count, MPI_BYTE, &reflect->datatype_lo);
    // MPI_Type_contiguous(blocklength * lwidth * count / type_size, MPI_FLOAT, &reflect->datatype_lo);
    fprintf(stderr, "dim=%d, send elements lo = %d\n", target_dim, blocklength * lwidth * count / type_size);
    //fprintf(stderr, "useHostBuf=%c , packVector=%c\n", useHostBuffer, packVector);
    // if(useHostBuffer){ fprintf(stderr,"using host buffer\n"); }
    // if(packVector){ fprintf(stderr, "using pack vector\n"); }
  }
  else {
    MPI_Type_vector(count, blocklength * lwidth, stride, MPI_BYTE, &reflect->datatype_lo);
  }
  MPI_Type_commit(&reflect->datatype_lo);

  // for upper reflect
  if (reflect->datatype_hi != MPI_DATATYPE_NULL){
    MPI_Type_free(&reflect->datatype_hi);
  }

  if (packVector || count == 1){
    MPI_Type_contiguous(blocklength * uwidth * count, MPI_BYTE, &reflect->datatype_hi);
    // MPI_Type_contiguous(blocklength * uwidth * count / type_size, MPI_FLOAT, &reflect->datatype_hi);
    fprintf(stderr, "dim=%d, send elements hi = %d\n", target_dim, blocklength * uwidth * count / type_size);
  }
  else {
    MPI_Type_vector(count, blocklength * uwidth, stride, MPI_BYTE, &reflect->datatype_hi);
  }
  MPI_Type_commit(&reflect->datatype_hi);

  //
  // Allocate buffers
  //

  if (useHostBuffer){
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_send_host_buf));
    CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_recv_host_buf));
  }

  if ((_XMPF_running && target_dim != ndims - 1) ||
      (_XMPC_running && target_dim != 0)){
    if (packVector){
      CUDA_SAFE_CALL(cudaFree(reflect->lo_send_buf));
      CUDA_SAFE_CALL(cudaFree(reflect->lo_recv_buf));
    }
  }

  if ((_XMPF_running && target_dim == ndims - 1) ||
      (_XMPC_running && target_dim == 0)){
    //
  }

  // for lower reflect
  if (lwidth){
    lo_buf_size = lwidth * blocklength * count;
    hi_buf_size = uwidth * blocklength * count;

    if ((_XMPF_running && target_dim == ndims - 1) ||
        (_XMPC_running && target_dim == 0)){
      lo_send_dev_buf = lo_send_array;
      lo_recv_dev_buf = lo_recv_array;
      hi_send_dev_buf = hi_send_array;
      hi_recv_dev_buf = hi_recv_array;
    }
    else {
      _XMP_TSTART(t0);
      if (packVector){
        CUDA_SAFE_CALL(cudaMalloc((void **)&lo_send_dev_buf, lo_buf_size + hi_buf_size));
        hi_send_dev_buf = (char *)lo_send_dev_buf + lo_buf_size;
        CUDA_SAFE_CALL(cudaMalloc((void **)&lo_recv_dev_buf, lo_buf_size + hi_buf_size));
        hi_recv_dev_buf = (char *)lo_recv_dev_buf + lo_buf_size;
      }
      else {
        lo_send_dev_buf = lo_send_array;
        lo_recv_dev_buf = lo_recv_array;
        hi_send_dev_buf = hi_send_array;
        hi_recv_dev_buf = hi_recv_array;
      }
      _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0);
    }

    if (useHostBuffer){
      CUDA_SAFE_CALL(cudaMallocHost((void **)&lo_send_host_buf, lo_buf_size + hi_buf_size));
      hi_send_host_buf = (char *)lo_send_host_buf + lo_buf_size;
      CUDA_SAFE_CALL(cudaMallocHost((void **)&lo_recv_host_buf, lo_buf_size + hi_buf_size));
      hi_recv_host_buf = (char *)lo_recv_host_buf + lo_buf_size;

      mpi_lo_send_buf = lo_send_host_buf;
      mpi_lo_recv_buf = lo_recv_host_buf;
      mpi_hi_send_buf = hi_send_host_buf;
      mpi_hi_recv_buf = hi_recv_host_buf;
    }
    else {
      mpi_lo_send_buf = lo_send_dev_buf;
      mpi_lo_recv_buf = lo_recv_dev_buf;
      mpi_hi_send_buf = hi_send_dev_buf;
      mpi_hi_recv_buf = hi_recv_dev_buf;
    }
  }

  // for upper reflect

  //
  // initialize communication
  //

  int src, dst;

  if (!is_periodic && my_pos == lb_pos){ // no periodic
    lo_rank = MPI_PROC_NULL;
  }

  if (!is_periodic && my_pos == ub_pos){ // no periodic
    hi_rank = MPI_PROC_NULL;
  }

  // for lower shadow
  if (lwidth){
    src = lo_rank;
    dst = hi_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  // fprintf(stderr, "dim=%d, lo_src=%d, lo_dst=%d\n", target_dim, src, dst);

  if (reflect->req[0] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[0]);
  }
  if (reflect->req[1] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[1]);
  }

  MPI_Recv_init(mpi_lo_recv_buf, 1, reflect->datatype_lo, src,
                _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]);
  MPI_Send_init(mpi_lo_send_buf, 1, reflect->datatype_lo, dst,
                _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]);

  // for upper shadow
  if (uwidth){
    src = hi_rank;
    dst = lo_rank;
  }
  else {
    src = MPI_PROC_NULL;
    dst = MPI_PROC_NULL;
  }

  // fprintf(stderr, "dim=%d, hi_src=%d, hi_dst=%d\n", target_dim, src, dst);

  if (reflect->req[2] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[2]);
  }
  if (reflect->req[3] != MPI_REQUEST_NULL){
    MPI_Request_free(&reflect->req[3]);
  }

  MPI_Recv_init(mpi_hi_recv_buf, 1, reflect->datatype_hi, src,
                _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]);
  MPI_Send_init(mpi_hi_send_buf, 1, reflect->datatype_hi, dst,
                _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]);

  //
  // cache schedule
  //

  reflect->count = count;
  reflect->blocklength = blocklength;
  reflect->stride = stride;

  reflect->lo_send_array = lo_send_array;
  reflect->lo_recv_array = lo_recv_array;
  reflect->hi_send_array = hi_send_array;
  reflect->hi_recv_array = hi_recv_array;

  if (packVector){
    reflect->lo_send_buf = lo_send_dev_buf;
    reflect->lo_recv_buf = lo_recv_dev_buf;
    reflect->hi_send_buf = hi_send_dev_buf;
    reflect->hi_recv_buf = hi_recv_dev_buf;
  }

  if (useHostBuffer){
    reflect->lo_send_host_buf = lo_send_host_buf;
    reflect->lo_recv_host_buf = lo_recv_host_buf;
    reflect->hi_send_host_buf = hi_send_host_buf;
    reflect->hi_recv_host_buf = hi_recv_host_buf;
  }

  reflect->lo_rank = lo_rank;
  reflect->hi_rank = hi_rank;

  // gpu async
  reflect->lo_async_id = _XMP_alloc(sizeof(cudaStream_t));
  CUDA_SAFE_CALL(cudaStreamCreate(reflect->lo_async_id));
  if (target_dim != 0 &&
      (!useHostBuffer ||
       (lo_rank != MPI_PROC_NULL && hi_rank != MPI_PROC_NULL &&
        (lo_buf_size / type_size) <= useSingleStreamLimit))){
    reflect->hi_async_id = NULL;
  }
  else {
    cudaStream_t *hi_stream = (cudaStream_t *)_XMP_alloc(sizeof(cudaStream_t));
    CUDA_SAFE_CALL(cudaStreamCreate(hi_stream));
    reflect->hi_async_id = (void *)hi_stream;
  }
  reflect->event = _XMP_alloc(sizeof(cudaEvent_t));
  CUDA_SAFE_CALL(cudaEventCreateWithFlags(reflect->event, cudaEventDisableTiming));
}
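/*
 * Illustrative sketch (an assumption, not code from this file): when
 * useHostBuffer is set, the MPI requests are bound to the pinned host
 * buffers, so a device-to-host staging step along these lines is presumably
 * issued on the lo stream before the sends are started (and the reverse
 * copy after the receives complete):
 *
 *   cudaStream_t st = *(cudaStream_t *)reflect->lo_async_id;
 *   CUDA_SAFE_CALL(cudaMemcpyAsync(reflect->lo_send_host_buf, reflect->lo_send_buf,
 *                                  lo_buf_size, cudaMemcpyDeviceToHost, st));
 *   CUDA_SAFE_CALL(cudaStreamSynchronize(st));
 */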
static void _XMP_reflect_sched(_XMP_array_t *a, int *lwidth, int *uwidth,
                               int *is_periodic, int is_async, void *dev_addr)
{
  _XMP_TSTART(t0);
  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    if (ai->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }
    else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){

      _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched;

      if (reflect == NULL){
        reflect = _XMP_alloc(sizeof(_XMP_reflect_sched_t));
        reflect->is_periodic = -1; /* not used yet */
        reflect->datatype_lo = MPI_DATATYPE_NULL;
        reflect->datatype_hi = MPI_DATATYPE_NULL;
        for (int j = 0; j < 4; j++) reflect->req[j] = MPI_REQUEST_NULL;
        reflect->lo_send_buf = NULL;
        reflect->lo_recv_buf = NULL;
        reflect->hi_send_buf = NULL;
        reflect->hi_recv_buf = NULL;
        reflect->lo_send_host_buf = NULL;
        reflect->lo_recv_host_buf = NULL;
        reflect->hi_send_host_buf = NULL;
        reflect->hi_recv_host_buf = NULL;
        ai->reflect_acc_sched = reflect;
      }
      else {
        //
      }

      if (1/*lwidth[i] || uwidth[i]*/){

        _XMP_ASSERT(reflect);

        if (reflect->is_periodic == -1 /* not set yet */ ||
            lwidth[i] != reflect->lo_width ||
            uwidth[i] != reflect->hi_width ||
            is_periodic[i] != reflect->is_periodic){
          reflect->lo_width = lwidth[i];
          reflect->hi_width = uwidth[i];
          reflect->is_periodic = is_periodic[i];

          if (/*_xmp_reflect_pack_flag && !is_async*/ 1){
            _XMP_reflect_pcopy_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i],
                                         dev_addr, lwidth, uwidth);
          }
          else {
            //_XMP_reflect_normal_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i]);
          }
        }
      }
    }
    else { /* _XMP_N_SHADOW_FULL */
      ;
    }
  }
  _XMP_TEND(xmptiming_.t_sched, t0);
}