/*
 * Start an asynchronous reflect of array `a`, one dimension at a time.
 *
 * For each dimension that has a normal shadow and a non-zero requested
 * width (_xmp_lwidth / _xmp_uwidth), the per-dimension schedule is rebuilt
 * when the requested widths or periodicity differ from the values recorded
 * in the schedule.  The schedule's four requests are then copied behind the
 * requests already held by the current async descriptor and started.
 * Dimensions whose shadow type is neither NONE nor NORMAL are passed to
 * _XMP_reflect_shadow_FULL().
 *
 * a        : array whose shadow regions are updated.
 * async_id : not referenced in this function; the descriptor is obtained
 *            from _XMP_get_current_async().
 */
void _XMP_reflect_async_cardinal(_XMP_array_t *a, int async_id)
{
  _XMP_async_comm_t *async = _XMP_get_current_async();

  // New requests are appended right after the ones already registered.
  MPI_Request *dst = &async->reqs[async->nreqs];
  int added = 0;

  // NOTE(review): t0 is restarted inside the loop below, so the closing
  // _XMP_TEND(t_sched) presumably measures from the latest restart only.
  _XMP_TSTART(t0);

  for (int d = 0; d < a->dim; d++){

    _XMP_array_info_t *info = &(a->info[d]);

    if (info->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }

    if (info->shadow_type != _XMP_N_SHADOW_NORMAL){
      /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, d);
      continue;
    }

    // Nothing was requested for this dimension.
    if (!_xmp_lwidth[d] && !_xmp_uwidth[d]){
      continue;
    }

    _XMP_reflect_sched_t *sched = info->reflect_sched;
    _XMP_ASSERT(sched);

    // Rebuild the schedule if it was never set (is_periodic == -1) or if
    // it was built for different widths / periodicity.
    int stale = sched->is_periodic == -1
             || _xmp_lwidth[d] != sched->lo_width
             || _xmp_uwidth[d] != sched->hi_width
             || _xmp_is_periodic[d] != sched->is_periodic;

    if (stale){
      sched->lo_width = _xmp_lwidth[d];
      sched->hi_width = _xmp_uwidth[d];
      sched->is_periodic = _xmp_is_periodic[d];

      _XMP_reflect_normal_sched_dim(a, d, _xmp_lwidth[d], _xmp_uwidth[d],
                                    _xmp_is_periodic[d]);
    }

    // Each dimension contributes exactly four requests.
    if (async->nreqs + added + 4 > _XMP_MAX_ASYNC_REQS){
      _XMP_fatal("too many arrays in an asynchronous reflect");
    }

    memcpy(&dst[added], sched->req, 4 * sizeof(MPI_Request));
    added += 4;

    _XMP_TSTART(t0);
    // If req[0] isn't null, none of the others should be null either.
    if (sched->req[0] != MPI_REQUEST_NULL){
      MPI_Startall(4, sched->req);
    }
    _XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[d], t0);
  }

  _XMP_TEND(xmptiming_.t_sched, t0);

  async->nreqs += added;
}
/*
 * Wait for the RDMA transfers of a reflect on array `a` to be reported.
 *
 * The number of expected notices is summed per NIC from the per-dimension
 * schedules: reflect->count for every dimension with a non-zero lower width
 * and a hi_rank other than -1 (counted against NIC0), and likewise for a
 * non-zero upper width and a lo_rank other than -1 (counted against NIC1).
 * The completion queues are then polled until both counters reach zero,
 * followed by xmp_barrier().
 *
 * a           : array being reflected.
 * lwidth      : per-dimension lower widths.
 * uwidth      : per-dimension upper widths.
 * is_periodic : not referenced in this function.
 */
static void _XMP_reflect_wait(_XMP_array_t *a, int *lwidth, int *uwidth, int *is_periodic)
{
  int pending_nic0 = 0;
  int pending_nic1 = 0;

  _XMP_TSTART(t0);

  // Count the notices expected on each NIC.
  for (int d = 0; d < a->dim; d++){
    _XMP_reflect_sched_t *sched = a->info[d].reflect_sched;

    if (lwidth[d] && sched->hi_rank != -1){
      pending_nic0 += sched->count;
    }
    if (uwidth[d] && sched->lo_rank != -1){
      pending_nic1 += sched->count;
    }
  }

  // Drain both completion queues until every expected notice was seen.
  while (pending_nic0 != 0 || pending_nic1 != 0){

    while (FJMPI_Rdma_poll_cq(FJMPI_RDMA_NIC0, NULL) == FJMPI_RDMA_NOTICE){
      pending_nic0--;
    }

    while (FJMPI_Rdma_poll_cq(FJMPI_RDMA_NIC1, NULL) == FJMPI_RDMA_NOTICE){
      pending_nic1--;
    }
  }

  xmp_barrier();

  _XMP_TEND(xmptiming_.t_wait, t0);
}
/*
 * Complete the reflect communication previously started on array `a`.
 *
 * For every dimension with a non-zero lower or upper width and a normal
 * shadow, MPI_Waitall() is called on the four requests of that dimension's
 * schedule.  Dimensions with any other shadow type are skipped.  When
 * _xmp_reflect_pack_flag is set, _XMP_reflect_unpack() is called once after
 * all dimensions have been waited on.
 *
 * a           : array being reflected.
 * lwidth      : per-dimension lower widths.
 * uwidth      : per-dimension upper widths.
 * is_periodic : forwarded to _XMP_reflect_unpack().
 */
static void _XMP_reflect_wait(_XMP_array_t *a, int *lwidth, int *uwidth, int *is_periodic)
{
  for (int d = 0; d < a->dim; d++){

    // No reflect was requested for this dimension.
    if (lwidth[d] == 0 && uwidth[d] == 0){
      continue;
    }

    _XMP_array_info_t *info = &(a->info[d]);

    // Only normal shadows have requests to wait for here.
    if (info->shadow_type != _XMP_N_SHADOW_NORMAL){
      continue;
    }

    _XMP_reflect_sched_t *sched = info->reflect_sched;

    _XMP_TSTART(t0);
    MPI_Waitall(4, sched->req, MPI_STATUSES_IGNORE);
    _XMP_TEND2(xmptiming_.t_wait, xmptiming_.tdim_wait[d], t0);
  }

  if (_xmp_reflect_pack_flag){
    _XMP_TSTART(t0);
    _XMP_reflect_unpack(a, lwidth, uwidth, is_periodic);
    _XMP_TEND(xmptiming_.t_copy, t0);
  }
}
/*
 * Issue the RDMA puts of a reflect on array `a`.
 *
 * After an initial xmp_barrier(), each dimension issues reflect->count puts
 * per active direction, one for every block at offset j * reflect->stride:
 *   - lower reflect (lwidth[i] != 0 and hi_rank != -1): put to hi_rank,
 *     lwidth[i] * blocklength bytes, using local NIC0 / remote NIC2;
 *   - upper reflect (uwidth[i] != 0 and lo_rank != -1): put to lo_rank,
 *     uwidth[i] * blocklength bytes, using local NIC1 / remote NIC3.
 *
 * a           : array being reflected.
 * lwidth      : per-dimension lower widths.
 * uwidth      : per-dimension upper widths.
 * is_periodic : not referenced in this function.
 * tag         : tag passed to every FJMPI_Rdma_put() call.
 */
static void _XMP_reflect_start(_XMP_array_t *a, int *lwidth, int *uwidth, int *is_periodic, int tag)
{
  _XMP_TSTART(t1);

  xmp_barrier();

  for (int d = 0; d < a->dim; d++){

    _XMP_reflect_sched_t *sched = a->info[d].reflect_sched;

    _XMP_TSTART(t0);

    // Lower reflect: goes to the upper neighbor.
    if (lwidth[d] && sched->hi_rank != -1){
      for (int blk = 0; blk < sched->count; blk++){
        // The recv-array address is passed first, then the send-array
        // address (presumably remote address followed by local address).
        FJMPI_Rdma_put(sched->hi_rank, tag,
                       (uint64_t)sched->lo_recv_array + blk * sched->stride,
                       (uint64_t)sched->lo_send_array + blk * sched->stride,
                       lwidth[d] * sched->blocklength,
                       FJMPI_RDMA_LOCAL_NIC0 | FJMPI_RDMA_REMOTE_NIC2);
      }
    }

    // Upper reflect: goes to the lower neighbor.
    if (uwidth[d] && sched->lo_rank != -1){
      for (int blk = 0; blk < sched->count; blk++){
        FJMPI_Rdma_put(sched->lo_rank, tag,
                       (uint64_t)sched->hi_recv_array + blk * sched->stride,
                       (uint64_t)sched->hi_send_array + blk * sched->stride,
                       uwidth[d] * sched->blocklength,
                       FJMPI_RDMA_LOCAL_NIC1 | FJMPI_RDMA_REMOTE_NIC3);
      }
    }

    _XMP_TEND(xmptiming_.tdim_comm[d], t0);
  }

  _XMP_TEND(xmptiming_.t_comm, t1);
}
/*
 * Perform a blocking reflect (shadow update) of array `a` through the RDMA
 * path: build or refresh the per-dimension schedules, then call
 * _XMP_reflect_start() followed by _XMP_reflect_wait().
 *
 * The requested widths are taken from the globals _xmp_lwidth, _xmp_uwidth
 * and _xmp_is_periodic.  When _xmp_set_reflect_flag is not set, they are
 * first filled in from the array's own shadow sizes with periodicity off.
 * On return the flag and the three global arrays (for the first a->dim
 * entries) are reset to zero.
 *
 * If the array is not allocated, only _xmp_set_reflect_flag is cleared and
 * the function returns without communicating.
 *
 * a : array whose shadow regions are updated.
 */
void _XMP_reflect__(_XMP_array_t *a)
{
  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  // No explicit widths were supplied: use the declared shadow sizes.
  if (!_xmp_set_reflect_flag){
    for (int i = 0; i < a->dim; i++){
      _XMP_array_info_t *ai = &(a->info[i]);
      _xmp_lwidth[i] = ai->shadow_size_lo;
      _xmp_uwidth[i] = ai->shadow_size_hi;
      _xmp_is_periodic[i] = 0;
    }
  }

  _XMP_TSTART(t0);

  for (int i = 0; i < a->dim; i++){

    _XMP_array_info_t *ai = &(a->info[i]);

    // Only normal shadows are scheduled here; NONE and FULL need no work
    // in this loop.
    if (ai->shadow_type != _XMP_N_SHADOW_NORMAL){
      continue;
    }

    if (!_xmp_lwidth[i] && !_xmp_uwidth[i]){
      continue;
    }

    _XMP_reflect_sched_t *reflect = ai->reflect_sched;
    _XMP_ASSERT(reflect);

    // Rebuild when the schedule was never set (is_periodic == -1) or was
    // built for different widths / periodicity.
    if (reflect->is_periodic == -1 ||
        _xmp_lwidth[i] != reflect->lo_width ||
        _xmp_uwidth[i] != reflect->hi_width ||
        _xmp_is_periodic[i] != reflect->is_periodic){

      reflect->lo_width = _xmp_lwidth[i];
      reflect->hi_width = _xmp_uwidth[i];
      reflect->is_periodic = _xmp_is_periodic[i];

      _XMP_reflect_rdma_sched_dim(a, i, _xmp_lwidth[i], _xmp_uwidth[i],
                                  _xmp_is_periodic[i]);
    }
  }

  _XMP_TEND(xmptiming_.t_sched, t0);

  _XMP_reflect_start(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic, 0);
  _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);

  // Reset the request state so the next reflect starts from defaults.
  _xmp_set_reflect_flag = 0;
  for (int i = 0; i < a->dim; i++){
    _xmp_lwidth[i] = 0;
    _xmp_uwidth[i] = 0;
    _xmp_is_periodic[i] = 0;
  }
}
/*
 * Perform a blocking reflect (shadow update) of array `a` using MPI
 * requests held in the per-dimension schedules.
 *
 * The requested widths are taken from the globals _xmp_lwidth, _xmp_uwidth
 * and _xmp_is_periodic.  When _xmp_set_reflect_flag is not set, they are
 * first filled in from the array's own shadow sizes with periodicity off.
 * On return the flag and the three global arrays (for the first a->dim
 * entries) are reset to zero.
 *
 * If the array is not allocated, only _xmp_set_reflect_flag is cleared and
 * the function returns without communicating.
 *
 * a : array whose shadow regions are updated.
 */
void _XMP_reflect__(_XMP_array_t *a)
{
  // Hard-wired to 1: every dimension is waited on (and unpacked) right
  // after it is started, so the deferred _XMP_reflect_wait() call at the
  // end of this function is not taken.
  int is_ordinal = 1;

  if (!a->is_allocated){
    _xmp_set_reflect_flag = 0;
    return;
  }

  // No explicit widths were supplied: use the declared shadow sizes.
  if (!_xmp_set_reflect_flag){
    for (int d = 0; d < a->dim; d++){
      _XMP_array_info_t *info = &(a->info[d]);
      _xmp_lwidth[d] = info->shadow_size_lo;
      _xmp_uwidth[d] = info->shadow_size_hi;
      _xmp_is_periodic[d] = 0;
    }
  }

  // NOTE(review): t0 is restarted several times inside the loop, so the
  // closing _XMP_TEND(t_sched) presumably measures from the latest restart.
  _XMP_TSTART(t0);

  for (int d = 0; d < a->dim; d++){

    _XMP_array_info_t *info = &(a->info[d]);

    if (info->shadow_type == _XMP_N_SHADOW_NONE){
      continue;
    }

    if (info->shadow_type != _XMP_N_SHADOW_NORMAL){
      /* _XMP_N_SHADOW_FULL */
      _XMP_reflect_shadow_FULL(a->array_addr_p, a, d);
      continue;
    }

    if (!_xmp_lwidth[d] && !_xmp_uwidth[d]){
      continue;
    }

    _XMP_reflect_sched_t *sched = info->reflect_sched;
    _XMP_ASSERT(sched);

    // (Re)build the schedule on first use or when the requested widths /
    // periodicity changed since it was built.
    int stale = !sched->reflect_is_initialized
             || _xmp_lwidth[d] != sched->lo_width
             || _xmp_uwidth[d] != sched->hi_width
             || _xmp_is_periodic[d] != sched->is_periodic;

    if (stale){
      if (_xmp_reflect_pack_flag){
        _XMP_reflect_pcopy_sched_dim(a, d, _xmp_lwidth[d], _xmp_uwidth[d],
                                     _xmp_is_periodic[d], _XMP_COMM_REFLECT);
      }
      else {
        _XMP_reflect_normal_sched_dim(a, d, _xmp_lwidth[d], _xmp_uwidth[d],
                                      _xmp_is_periodic[d]);
      }

      sched->reflect_is_initialized = 1;
      sched->lo_width = _xmp_lwidth[d];
      sched->hi_width = _xmp_uwidth[d];
      sched->is_periodic = _xmp_is_periodic[d];
    }

    // Pack the send data first when the pack/copy path is in use.
    if (_xmp_reflect_pack_flag && sched->req[0] != MPI_REQUEST_NULL){
      _XMP_TSTART(t0);
      _XMP_reflect_pack_dim(a, d, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic,
                            _XMP_COMM_REFLECT);
      _XMP_TEND(xmptiming_.t_copy, t0);
    }

    _XMP_TSTART(t0);
    // If req[0] isn't null, none of the others should be null either.
    if (sched->req[0] != MPI_REQUEST_NULL){
      MPI_Startall(4, sched->req);
    }
    _XMP_TEND2(xmptiming_.t_comm, xmptiming_.tdim_comm[d], t0);

    if (is_ordinal){
      _XMP_TSTART(t0);
      MPI_Waitall(4, sched->req, MPI_STATUSES_IGNORE);
      _XMP_TEND2(xmptiming_.t_wait, xmptiming_.tdim_wait[d], t0);

      if (_xmp_reflect_pack_flag && sched->req[0] != MPI_REQUEST_NULL){
        _XMP_TSTART(t0);
        _XMP_reflect_unpack_dim(a, d, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
        _XMP_TEND(xmptiming_.t_copy, t0);
      }
    }
  }

  _XMP_TEND(xmptiming_.t_sched, t0);

  if (!is_ordinal){
    _XMP_reflect_wait(a, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);
  }

  // Reset the request state so the next reflect starts from defaults.
  _xmp_set_reflect_flag = 0;
  for (int d = 0; d < a->dim; d++){
    _xmp_lwidth[d] = 0;
    _xmp_uwidth[d] = 0;
    _xmp_is_periodic[d] = 0;
  }
}
/*
 * Start an asynchronous reflect of array `a` that covers every shadow
 * direction, i.e. each non-zero offset vector with components in
 * {-1, 0, +1} that is enabled by the requested widths.
 *
 * The schedule is cached in a->async_reflect.  It is created on first use
 * and rebuilt whenever _xmp_lwidth, _xmp_uwidth or _xmp_is_periodic differ
 * from the values stored with the cached schedule; on a rebuild the
 * datatypes and requests of the previous schedule are freed first.  The
 * schedule's requests are then copied behind the requests already held by
 * the current async descriptor and started with MPI_Startall().
 *
 * a        : array whose shadow regions are updated.
 * async_id : not referenced in this function; the descriptor is obtained
 *            from _XMP_get_current_async().
 */
void _XMP_reflect_async_ordinal(_XMP_array_t *a, int async_id)
{
  int n = a->dim;
  _XMP_async_reflect_t *async_reflect;

  _Bool reusable_sched = false;

  if (!a->async_reflect){

    // Capacity is (3^n - 1) * 2: one entry pair per non-zero direction
    // (presumably one send and one receive).  Computed with integers so the
    // result cannot be truncated by an inexact floating-point pow().
    int ndirs = 1;
    for (int i = 0; i < n; i++){
      ndirs *= 3;
    }
    int max_nreqs = (ndirs - 1) * 2;

    async_reflect = (_XMP_async_reflect_t *)_XMP_alloc(sizeof(_XMP_async_reflect_t));
    async_reflect->datatype = (MPI_Datatype *)_XMP_alloc(sizeof(MPI_Datatype) * max_nreqs);
    async_reflect->reqs = (MPI_Request *)_XMP_alloc(sizeof(MPI_Request) * max_nreqs);

    for (int i = 0; i < max_nreqs; i++){
      async_reflect->datatype[i] = MPI_DATATYPE_NULL;
      async_reflect->reqs[i] = MPI_REQUEST_NULL;
    }

    async_reflect->nreqs = 0;

    a->async_reflect = async_reflect;
  }
  else {
    // A cached schedule is reusable only if it was built for exactly the
    // widths and periodicity requested now.
    reusable_sched = true;
    async_reflect = a->async_reflect;

    for (int i = 0; i < n; i++){
      if (async_reflect->lwidth[i] != _xmp_lwidth[i] ||
          async_reflect->uwidth[i] != _xmp_uwidth[i] ||
          async_reflect->is_periodic[i] != _xmp_is_periodic[i]){
        reusable_sched = false;
        break;
      }
    }
  }

  if (!reusable_sched){

    // Per-dimension range of the direction component: -1 is included when
    // a lower width is requested, +1 when an upper width is requested.
    // Dimensions at or beyond n keep the range [0, 0].
    int lb[_XMP_N_MAX_DIM] = { 0 };
    int ub[_XMP_N_MAX_DIM] = { 0 };

    for (int i = 0; i < n; i++){
      async_reflect->lwidth[i] = _xmp_lwidth[i];
      async_reflect->uwidth[i] = _xmp_uwidth[i];
      async_reflect->is_periodic[i] = _xmp_is_periodic[i];

      if (_xmp_lwidth[i] > 0) lb[i] = -1;
      if (_xmp_uwidth[i] > 0) ub[i] = 1;
    }

    // Release what the previous schedule created.
    for (int i = 0; i < async_reflect->nreqs; i++){
      if (async_reflect->datatype[i] != MPI_DATATYPE_NULL){
        MPI_Type_free(&async_reflect->datatype[i]);
      }
      if (async_reflect->reqs[i] != MPI_REQUEST_NULL){
        MPI_Request_free(&async_reflect->reqs[i]);
      }
    }
    async_reflect->nreqs = 0;

    // The seven nested loops index ishadow[0..6] directly.
    int ishadow[_XMP_N_MAX_DIM];

    for (ishadow[0] = lb[0]; ishadow[0] <= ub[0]; ishadow[0]++){
    for (ishadow[1] = lb[1]; ishadow[1] <= ub[1]; ishadow[1]++){
    for (ishadow[2] = lb[2]; ishadow[2] <= ub[2]; ishadow[2]++){
    for (ishadow[3] = lb[3]; ishadow[3] <= ub[3]; ishadow[3]++){
    for (ishadow[4] = lb[4]; ishadow[4] <= ub[4]; ishadow[4]++){
    for (ishadow[5] = lb[5]; ishadow[5] <= ub[5]; ishadow[5]++){
    for (ishadow[6] = lb[6]; ishadow[6] <= ub[6]; ishadow[6]++){

      // When ishadow > 0, upper shadow is to be updated, and vice versa.

      // Skip the all-zero vector: it names no neighbor direction.
      int nnzero = 0;
      for (int i = 0; i < n; i++){
        if (ishadow[i] != 0) nnzero++;
      }
      if (nnzero == 0) continue;

      _XMP_reflect_sched_dir(a, ishadow, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic);

    }}}}}}}
  }

  _XMP_async_comm_t *async = _XMP_get_current_async();
  MPI_Request *reqs = &async->reqs[async->nreqs];

  // copy to async
  if (async->nreqs + async_reflect->nreqs > _XMP_MAX_ASYNC_REQS){
    _XMP_fatal("too many arrays in an asynchronous reflect");
  }
  memcpy(reqs, async_reflect->reqs, async_reflect->nreqs * sizeof(MPI_Request));
  async->nreqs += async_reflect->nreqs;

  _XMP_TSTART(t0);
  MPI_Startall(async_reflect->nreqs, reqs);
  _XMP_TEND(xmptiming_.t_start, t0);
}
/*
 * Prepare the per-dimension reflect schedules stored in
 * info[i].reflect_acc_sched for array `a`.
 *
 * For each dimension with a normal shadow, a schedule object is allocated
 * and cleared on first use, and _XMP_reflect_pcopy_sched_dim() is called
 * whenever the schedule was never set (is_periodic == -1) or the requested
 * widths / periodicity differ from the recorded ones.  This happens even
 * when both widths of the dimension are zero.  Dimensions with any other
 * shadow type are left untouched.
 *
 * a           : array to schedule.
 * lwidth      : per-dimension lower widths.
 * uwidth      : per-dimension upper widths.
 * is_periodic : per-dimension periodicity flags.
 * is_async    : not referenced in this function.
 * dev_addr    : forwarded unchanged to _XMP_reflect_pcopy_sched_dim().
 */
static void _XMP_reflect_sched(_XMP_array_t *a, int *lwidth, int *uwidth, int *is_periodic, int is_async, void *dev_addr)
{
  _XMP_TSTART(t0);

  for (int d = 0; d < a->dim; d++){

    _XMP_array_info_t *info = &(a->info[d]);

    // NONE and FULL shadows need no schedule here.
    if (info->shadow_type != _XMP_N_SHADOW_NORMAL){
      continue;
    }

    _XMP_reflect_sched_t *sched = info->reflect_acc_sched;

    if (sched == NULL){
      // First use: create an empty schedule and attach it to the dimension.
      sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t));

      sched->is_periodic = -1; /* not used yet */

      sched->datatype_lo = MPI_DATATYPE_NULL;
      sched->datatype_hi = MPI_DATATYPE_NULL;

      for (int k = 0; k < 4; k++){
        sched->req[k] = MPI_REQUEST_NULL;
      }

      sched->lo_send_buf = NULL;
      sched->lo_recv_buf = NULL;
      sched->hi_send_buf = NULL;
      sched->hi_recv_buf = NULL;

      sched->lo_send_host_buf = NULL;
      sched->lo_recv_host_buf = NULL;
      sched->hi_send_host_buf = NULL;
      sched->hi_recv_host_buf = NULL;

      info->reflect_acc_sched = sched;
    }

    _XMP_ASSERT(sched);

    int stale = sched->is_periodic == -1 /* not set yet */
             || lwidth[d] != sched->lo_width
             || uwidth[d] != sched->hi_width
             || is_periodic[d] != sched->is_periodic;

    if (!stale){
      continue;
    }

    sched->lo_width = lwidth[d];
    sched->hi_width = uwidth[d];
    sched->is_periodic = is_periodic[d];

    // The pack/copy schedule is always used, whatever is_async says.
    _XMP_reflect_pcopy_sched_dim(a, d, lwidth[d], uwidth[d], is_periodic[d],
                                 dev_addr, lwidth, uwidth);
  }

  _XMP_TEND(xmptiming_.t_sched, t0);
}