void _XMP_reduce_gpu_NODES_ENTIRE(_XMP_nodes_t *nodes, void *dev_addr, int count, int datatype, int op) { if (count == 0) { return; // FIXME not good implementation } if (!nodes->is_member) { return; } // setup information MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL; size_t datatype_size = 0; MPI_Op mpi_op; _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype); _XMP_setup_reduce_op(&mpi_op, op); size_t size = datatype_size * count; void *host_buf = _XMP_alloc(size); cudaError_t e; // copy dev to host e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost); cudaErrorCheck(e); MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)nodes->comm)); // copy host to dev e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice); cudaErrorCheck(e); _XMP_free(host_buf); }
void xmp_gather_(_XMP_array_t **x_d, _XMP_array_t **a_d, ... ) { int i; va_list valst; _XMP_array_t *idx_p; _XMP_array_t **idx_pp; _XMP_array_t **idx_array; // _XMP_array_t *x_p = *(_XMP_array_t **)x_d; _XMP_array_t *a_p = *(_XMP_array_t **)a_d; idx_array = (_XMP_array_t **)_XMP_alloc(sizeof(_XMP_array_t *)*a_p->dim); va_start( valst, a_d ); for(i=0;i<a_p->dim;i++){ idx_pp = va_arg( valst , _XMP_array_t** ); idx_p = *(_XMP_array_t **)idx_pp; idx_array[i] = idx_p; } va_end(valst); xmpf_gather(*x_d, *a_d, idx_array); _XMP_free(idx_array); }
void _XMP_reduce_gpu_CLAUSE(void *dev_addr, int count, int datatype, int op) { // setup information MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL; size_t datatype_size = 0; MPI_Op mpi_op; _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype); _XMP_setup_reduce_op(&mpi_op, op); size_t size = datatype_size * count; void *host_buf = _XMP_alloc(size); cudaError_t e; // copy dev to host e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost); cudaErrorCheck(e); // reduce MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)(_XMP_get_execution_nodes())->comm)); // copy host to dev e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice); cudaErrorCheck(e); _XMP_free(host_buf); }
static void _mpi_scalar_mput(const int target_rank, const _XMP_coarray_t *dst_desc, const void *src, const size_t dst_offset, const size_t src_offset, const int dst_dims, const _XMP_array_section_t *dst_info, const bool is_dst_on_acc) { int allelmt_dim = _XMP_get_dim_of_allelmts(dst_dims, dst_info); size_t element_size = dst_desc->elmt_size; size_t allelmt_size = (allelmt_dim == dst_dims)? element_size : dst_info[allelmt_dim].distance * dst_info[allelmt_dim].elmts; char *laddr = (allelmt_dim == dst_dims)? ((char*)src + src_offset) : _XMP_alloc(allelmt_size); char *raddr = get_remote_addr(dst_desc, target_rank, is_dst_on_acc) + dst_offset; MPI_Win win = get_window(dst_desc, is_dst_on_acc); XACC_DEBUG("scalar_mput(src_p=%p, size=%zd, target=%d, dst_p=%p, is_acc=%d)", laddr, element_size, target_rank, raddr, is_dst_on_acc); XACC_DEBUG("allelmt_dim=%d, dst_dims=%d", allelmt_dim, dst_dims); if(allelmt_dim != dst_dims){ //mcopy _XMP_array_section_t info; info.start = 0; info.length = allelmt_size/element_size; info.stride = 1; info.elmts = info.length; info.distance = element_size; _XMP_stride_memcpy_1dim(laddr, (char*)src+src_offset, &info, element_size, _XMP_SCALAR_MCOPY); XACC_DEBUG("mcopy(%lld, %lld, %lld), %lld",info.start, info.length, info.stride, info.elmts); } long long idxs[allelmt_dim+1]; for(int i = 0; i < allelmt_dim+1; i++) idxs[i]=0; while(1){ size_t offset = 0; for(int i = 0; i < allelmt_dim; i++){ offset += dst_info[i].distance * idxs[i+1] * dst_info[i].stride; } MPI_Put((void*)laddr, allelmt_size, MPI_BYTE, target_rank, (MPI_Aint)(raddr+offset), allelmt_size, MPI_BYTE, win); ++idxs[allelmt_dim]; for(int i = allelmt_dim-1; i >= 0; i--){ long long length = dst_info[i].length; if(idxs[i+1] >= length){ idxs[i+1] -= length; ++idxs[i]; }else{ break; } } if(idxs[0] > 0){ break; } } _wait_puts(target_rank, win); if(allelmt_dim != dst_dims){ _XMP_free(laddr); } }
void _XMP_create_TCA_handle(void *acc_addr, _XMP_array_t *adesc) { if(adesc->set_handle) return; // 64KB align ? long tmp = ((long)acc_addr/65536)*65536; if(tmp != (long)acc_addr){ _XMP_fatal("An array is not aligned at 64KB."); return; } size_t size = (size_t)(adesc->type_size * adesc->total_elmts); #if 0 printf("[%d] tcaCreateHandle size = %d addr=%p\n", _XMP_world_rank, size, acc_addr); #endif tcaHandle tmp_handle; TCA_CHECK(tcaCreateHandle(&tmp_handle, acc_addr, size, tcaMemoryGPU)); adesc->tca_handle = _XMP_alloc(sizeof(tcaHandle) * _XMP_world_size); MPI_Allgather(&tmp_handle, sizeof(tcaHandle), MPI_BYTE, adesc->tca_handle, sizeof(tcaHandle), MPI_BYTE, MPI_COMM_WORLD); adesc->set_handle = _XMP_N_INT_TRUE; }
void xmpf_array_init_shadow__(_XMP_array_t **a_desc, int *i_dim, int *lshadow, int *ushadow) { _XMP_array_t *array = *a_desc; _XMP_array_info_t *ai = &(array->info[*i_dim]); if (*lshadow == 0 && *ushadow == 0){ ai->shadow_type = _XMP_N_SHADOW_NONE; } else if (*lshadow > 0 || *ushadow > 0){ _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK || ai->align_manner == _XMP_N_ALIGN_GBLOCK); ai->shadow_type = _XMP_N_SHADOW_NORMAL; ai->shadow_size_lo = *lshadow; ai->shadow_size_hi = *ushadow; if (array->is_allocated){ ai->local_lower += *lshadow; ai->local_upper += *lshadow; // ai->local_stride is not changed ai->alloc_size += *lshadow + *ushadow; // ai->temp0 shuld not be used in XMP/F. // *(ai->temp0) -= *lshadow; ai->temp0_v -= *lshadow; } if (!ai->reflect_sched){ _XMP_reflect_sched_t *sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t)); sched->is_periodic = -1; /* not used yet */ sched->datatype_lo = MPI_DATATYPE_NULL; sched->datatype_hi = MPI_DATATYPE_NULL; for (int j = 0; j < 4; j++) sched->req[j] = MPI_REQUEST_NULL; sched->lo_send_buf = NULL; sched->lo_recv_buf = NULL; sched->hi_send_buf = NULL; sched->hi_recv_buf = NULL; ai->reflect_sched = sched; } //_XMP_create_shadow_comm(array, *i_dim); } else { // *lshadow < 0 && *ushadow < 0 ai->shadow_type = _XMP_N_SHADOW_FULL; if (array->is_allocated){ ai->shadow_size_lo = ai->par_lower - ai->ser_lower; ai->shadow_size_hi = ai->ser_upper - ai->par_upper; ai->local_lower = ai->par_lower - ai->ser_lower; ai->local_upper = ai->par_upper - ai->ser_lower; ai->local_stride = ai->par_stride; ai->alloc_size = ai->ser_size; } _XMP_create_shadow_comm(array, *i_dim); } }
void _XMP_tca_comm_init() { _ring_bufs = _XMP_alloc(sizeof(tca_ring_buf_t) * _XMP_world_size); for(int i = 0; i < _XMP_world_size; i++){ if(i == _XMP_world_rank) continue; _XMP_tca_ring_buf_init(&_ring_bufs[i], i); } }
void _XMP_alloc_tca(_XMP_array_t *adesc) { adesc->set_handle = _XMP_N_INT_FALSE; int array_dim = adesc->dim; for(int i=0;i<array_dim;i++){ _XMP_array_info_t *ai = &(adesc->info[i]); if(ai->shadow_type == _XMP_N_SHADOW_NONE) continue; ai->reflect_acc_sched = _XMP_alloc(sizeof(_XMP_reflect_sched_t)); } adesc->wait_slot = 0; // No change ? adesc->wait_tag = 0x100; // No change ? }
void _XMP_mpi_coarray_attach(_XMP_coarray_t *coarray_desc, void *addr, const size_t coarray_size, const bool is_acc) { MPI_Win win = MPI_WIN_NULL; char **each_addr = NULL; // head address of a local array on each node _XMP_nodes_t *nodes = _XMP_get_execution_nodes(); int comm_size = nodes->comm_size; MPI_Comm comm = *(MPI_Comm *)nodes->comm; XACC_DEBUG("attach addr=%p, size=%zd, is_acc=%d", addr, coarray_size, is_acc); if(_XMP_flag_multi_win){ _XMP_mpi_onesided_create_win(&win, addr, coarray_size, comm); MPI_Win_lock_all(MPI_MODE_NOCHECK, win); }else{ win = _xmp_mpi_distarray_win; #ifdef _XMP_XACC if(is_acc){ win = _xmp_mpi_distarray_win_acc; } #endif MPI_Win_attach(win, addr, coarray_size); each_addr = (char**)_XMP_alloc(sizeof(char *) * comm_size); MPI_Allgather(&addr, sizeof(char *), MPI_BYTE, each_addr, sizeof(char *), MPI_BYTE, comm); // exchange displacement } if(is_acc){ #ifdef _XMP_XACC coarray_desc->addr_dev = each_addr; coarray_desc->real_addr_dev = addr; coarray_desc->win_acc = win; coarray_desc->nodes = nodes; #endif }else{ coarray_desc->addr = each_addr; coarray_desc->real_addr = addr; coarray_desc->win = win; coarray_desc->win_acc = MPI_WIN_NULL; coarray_desc->nodes = nodes; } }
/** * Build table and Initialize for sync images */ void _XMP_mpi_build_sync_images_table() { size_t table_size = sizeof(unsigned int) * _XMP_world_size; #ifdef _SYNCIMAGE_SENDRECV _sync_images_table = _XMP_alloc(table_size); #else struct _shift_queue_t *shift_queue = &_shift_queue; _sync_images_table = (unsigned int*)(_xmp_mpi_onesided_buf + shift_queue->total_shift); _sync_images_table_disp = (unsigned int*)(shift_queue->total_shift); size_t shift; if(table_size % _XMP_MPI_ALIGNMENT == 0) shift = table_size; else{ shift = ((table_size / _XMP_MPI_ALIGNMENT) + 1) * _XMP_MPI_ALIGNMENT; } _push_shift_queue(shift_queue, shift); #endif for(int i=0;i<_XMP_world_size;i++) _sync_images_table[i] = 0; MPI_Barrier(MPI_COMM_WORLD); }
void xmp_transpose_(_XMP_array_t **dst_d, _XMP_array_t **src_d, int *opt){ #if 1 xmpf_transpose(*dst_d, *src_d, *opt); return; #else _XMP_array_t *dst_array = *(_XMP_array_t **)dst_d; _XMP_array_t *src_array = *(_XMP_array_t **)src_d; int nnodes; int dst_block_dim, src_block_dim; void *sendbuf=NULL, *recvbuf=NULL; unsigned long long count, bufsize; int dst_chunk_size, dst_ser_size, type_size; int src_chunk_size, src_ser_size; nnodes = dst_array->align_template->onto_nodes->comm_size; // 2-dimensional Matrix if (dst_array->dim != 2) { _XMP_fatal("bad dimension for xmp_transpose"); } // No Shadow if (dst_array->info[0].shadow_size_lo != 0 || dst_array->info[0].shadow_size_hi != 0 || src_array->info[0].shadow_size_lo != 0 || src_array->info[0].shadow_size_hi != 0) { _XMP_fatal("A global array must not have shadows"); fflush(stdout); } // Dividable by the number of nodes if (dst_array->info[0].ser_size % nnodes != 0) { _XMP_fatal("Not dividable by the number of nodes"); fflush(stdout); } dst_block_dim = (dst_array->info[0].align_manner == _XMP_N_ALIGN_BLOCK) ? 0 : 1; src_block_dim = (src_array->info[0].align_manner == _XMP_N_ALIGN_BLOCK) ? 0 : 1; dst_chunk_size = dst_array->info[dst_block_dim].par_size; dst_ser_size = dst_array->info[dst_block_dim].ser_size; src_chunk_size = src_array->info[src_block_dim].par_size; src_ser_size = src_array->info[src_block_dim].ser_size; type_size = dst_array->type_size; count = dst_chunk_size * src_chunk_size; bufsize = count * nnodes * type_size; _XMP_check_reflect_type(); if (src_block_dim == 1){ if (*opt ==0){ sendbuf = _XMP_alloc(bufsize); }else if (*opt==1){ sendbuf = dst_array->array_addr_p; } // src_array -> sendbuf _XMP_pack_vector2((char *)sendbuf, (char *)src_array->array_addr_p , src_chunk_size, dst_chunk_size, nnodes, type_size, src_block_dim); } else { sendbuf = src_array->array_addr_p; } if (*opt == 0){ recvbuf = _XMP_alloc(bufsize); }else if (*opt ==1){ recvbuf = src_array->array_addr_p; } MPI_Alltoall(sendbuf, count * type_size, MPI_BYTE, recvbuf, count * type_size, MPI_BYTE, *((MPI_Comm *)src_array->align_template->onto_nodes->comm)); if (dst_block_dim == 1){ _XMPF_unpack_transpose_vector((char *)dst_array->array_addr_p , (char *)recvbuf , src_ser_size, dst_chunk_size, type_size, dst_block_dim); if (*opt==0){ _XMP_free(recvbuf); } } if (src_block_dim == 1){ if (*opt == 0){ _XMP_free(sendbuf); } } return; #endif }
static void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim, int lwidth, int uwidth, int is_periodic, void *dev_array_addr, int *lwidths, int *uwidths){ //printf("desc=%p, tardim=%d, lw=%d, uw=%d, devp=%p\n", adesc, target_dim, lwidth, uwidth, dev_array_addr); if (lwidth == 0 && uwidth == 0) return; _XMP_array_info_t *ai = &(adesc->info[target_dim]); _XMP_array_info_t *ainfo = adesc->info; _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK); _XMP_ASSERT(ai->is_shadow_comm_member); if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){ _XMP_fatal("reflect width is larger than shadow width."); } _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched; int target_tdim = ai->align_template_index; _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info; int ndims = adesc->dim; // 0-origin int my_pos = ni->rank; int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower); int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper); int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1; int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1; MPI_Comm *comm = adesc->align_template->onto_nodes->comm; int my_rank = adesc->align_template->onto_nodes->comm_rank; int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier; int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier; int type_size = adesc->type_size; //void *array_addr = adesc->array_addr_p; void *lo_send_array = NULL; void *lo_recv_array = NULL; void *hi_send_array = NULL; void *hi_recv_array = NULL; void *lo_send_dev_buf = NULL; void *lo_recv_dev_buf = NULL; void *hi_send_dev_buf = NULL; void *hi_recv_dev_buf = NULL; void *lo_send_host_buf = NULL; void *lo_recv_host_buf = NULL; void *hi_send_host_buf = NULL; void *hi_recv_host_buf = NULL; void *mpi_lo_send_buf = NULL; void *mpi_lo_recv_buf = NULL; void *mpi_hi_send_buf = NULL; void *mpi_hi_recv_buf = NULL; int lo_buf_size = 0; int hi_buf_size = 0; // // setup data_type // int count = 0, blocklength = 0; long long stride = 0; // int count_offset = 0; if (_XMPF_running && !_XMPC_running){ /* for XMP/F */ count = 1; blocklength = type_size; stride = ainfo[0].alloc_size * type_size; for (int i = ndims - 2; i >= target_dim; i--){ count *= ainfo[i+1].alloc_size; } for (int i = 1; i <= target_dim; i++){ blocklength *= ainfo[i-1].alloc_size; stride *= ainfo[i].alloc_size; } } else if (!_XMPF_running && _XMPC_running){ /* for XMP/C */ count = 1; blocklength = type_size; stride = ainfo[ndims-1].alloc_size * type_size; /* if(target_dim > 0){ */ /* count *= ainfo[0].par_size; */ /* count_offset = ainfo[0].shadow_size_lo; */ /* } */ /* for (int i = 1; i < target_dim; i++){ */ /* count *= ainfo[i].alloc_size; */ /* } */ /* for (int i = ndims - 2; i >= target_dim; i--){ */ /* blocklength *= ainfo[i+1].alloc_size; */ /* stride *= ainfo[i].alloc_size; */ /* } */ if(target_dim == 0){ count *= 1; if(ndims >= 2){ blocklength *= (ainfo[1].par_size + lwidths[1] + uwidths[1]); } }else{ count *= (ainfo[0].par_size + lwidths[0] + uwidths[0]); for(int i = 1; i < target_dim; i++){ count *= ainfo[i].alloc_size; } blocklength *= ainfo[target_dim+1].alloc_size; stride *= ainfo[target_dim].alloc_size; } for(int i = target_dim+2; i < ndims; i++){ blocklength *= ainfo[i].alloc_size; } for(int i = target_dim+1 ; i < ndims - 1; i++){ stride *= ainfo[i].alloc_size; } /* mod_4 */ count = 1; blocklength = 1; stride = 1; for(int i = 0; i < ndims; i++){ int fact = (i == target_dim)? 1 : (ainfo[i].par_size + lwidths[i] + uwidths[i]); int alloc_size = ainfo[i].alloc_size; if(blocklength == 1 || fact == alloc_size){ blocklength *= fact; stride *= alloc_size; }else if(count == 1 && target_dim != 0){ //to be contiguous if target_dim==0 count = blocklength; blocklength = fact; stride = alloc_size; }else{ blocklength *= alloc_size; stride *= alloc_size; } //printf("tar=%d, i=%d, fact=%d, allocsize=%d, (%d,%d,%lld)\n", target_dim, i, fact, alloc_size, count , blocklength, stride); } blocklength *= type_size; stride *= type_size; /* mod_4 end */ /* it used at 150717 for (int i = 1; i <= target_dim; i++){ count *= ainfo[i-1].alloc_size; } for (int i = ndims - 2; i >= target_dim; i--){ blocklength *= ainfo[i+1].alloc_size; stride *= ainfo[i].alloc_size; } */ /* for (int i = target_dim + 1; i < ndims; i++){ */ /* blocklength *= ainfo[i].alloc_size; */ /* } */ /* for (int i = target_dim; i < ndims - 1; i++){ */ /* stride *= ainfo[i].alloc_size; */ /* } */ // printf("count =%d, blength=%d, stride=%lld\n", count ,blocklength, stride); // printf("ainfo[0].par_size=%d\n", ainfo[0].par_size); // printf("count_ofset=%d,\n", count_offset); } else { _XMP_fatal("cannot determin the base language."); } // // calculate base address // // for lower reflect if (lwidth){ lo_send_array = lo_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride); for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; unsigned long long dim_acc; if (i == target_dim) { //printf("ainfo[%d].local_upper=%d\n",i,ainfo[i].local_upper); lb_send = ainfo[i].local_upper - lwidth + 1; lb_recv = ainfo[i].shadow_size_lo - lwidth; ////ainfo[i].local_lower - lwidth; } else { // Note: including shadow area lb_send = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo; lb_recv = 0; //// ainfo[i].local_lower - ainfo[i].shadow_size_lo; } dim_acc = ainfo[i].dim_acc; lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size); lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size); } } // for upper reflect if (uwidth){ hi_send_array = hi_recv_array = (void *)((char*)dev_array_addr + /*count_offset*/0 * stride); for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; unsigned long long dim_acc; if (i == target_dim) { lb_send = ainfo[i].local_lower; lb_recv = ainfo[i].local_upper + 1; } else { // Note: including shadow area lb_send = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo; lb_recv = 0; //ainfo[i].local_lower - ainfo[i].shadow_size_lo; } dim_acc = ainfo[i].dim_acc; hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size); hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size); } } // for lower reflect if (reflect->datatype_lo != MPI_DATATYPE_NULL){ MPI_Type_free(&reflect->datatype_lo); } if(packVector || count == 1){ MPI_Type_contiguous(blocklength * lwidth * count, MPI_BYTE, &reflect->datatype_lo); // MPI_Type_contiguous(blocklength * lwidth * count / type_size, MPI_FLOAT, &reflect->datatype_lo); fprintf(stderr, "dim=%d, send elements lo = %d\n", target_dim, blocklength * lwidth * count / type_size); //fprintf(stderr, "useHostBuf=%c , packVector=%c\n", useHostBuffer, packVector); // if(useHostBuffer){ fprintf(stderr,"using host buffer\n"); } // if(packVector){ fprintf(stderr, "using pack vector\n"); } }else{ MPI_Type_vector(count, blocklength * lwidth, stride, MPI_BYTE, &reflect->datatype_lo); } MPI_Type_commit(&reflect->datatype_lo); // for upper reflect if (reflect->datatype_hi != MPI_DATATYPE_NULL){ MPI_Type_free(&reflect->datatype_hi); } if(packVector || count == 1){ MPI_Type_contiguous(blocklength * uwidth * count, MPI_BYTE, &reflect->datatype_hi); // MPI_Type_contiguous(blocklength * uwidth * count / type_size, MPI_FLOAT, &reflect->datatype_hi); fprintf(stderr, "dim=%d, send elements hi = %d\n", target_dim, blocklength * uwidth * count / type_size); }else{ MPI_Type_vector(count, blocklength * uwidth, stride, MPI_BYTE, &reflect->datatype_hi); } MPI_Type_commit(&reflect->datatype_hi); // // Allocate buffers // if(useHostBuffer){ CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_send_host_buf)); CUDA_SAFE_CALL(cudaFreeHost(reflect->lo_recv_host_buf)); } if ((_XMPF_running && target_dim != ndims - 1) || (_XMPC_running && target_dim != 0)){ if(packVector){ CUDA_SAFE_CALL(cudaFree(reflect->lo_send_buf)); CUDA_SAFE_CALL(cudaFree(reflect->lo_recv_buf)); } } if ((_XMPF_running && target_dim == ndims - 1) || (_XMPC_running && target_dim == 0)){ // } // for lower reflect if (lwidth){ lo_buf_size = lwidth * blocklength * count; hi_buf_size = uwidth * blocklength * count; if ((_XMPF_running && target_dim == ndims - 1) || (_XMPC_running && target_dim == 0)){ lo_send_dev_buf = lo_send_array; lo_recv_dev_buf = lo_recv_array; hi_send_dev_buf = hi_send_array; hi_recv_dev_buf = hi_recv_array; } else { _XMP_TSTART(t0); if(packVector){ CUDA_SAFE_CALL(cudaMalloc((void **)&lo_send_dev_buf, lo_buf_size + hi_buf_size)); hi_send_dev_buf = (char*)lo_send_dev_buf + lo_buf_size; CUDA_SAFE_CALL(cudaMalloc((void **)&lo_recv_dev_buf, lo_buf_size + hi_buf_size)); hi_recv_dev_buf = (char*)lo_recv_dev_buf + lo_buf_size; }else{ lo_send_dev_buf = lo_send_array; lo_recv_dev_buf = lo_recv_array; hi_send_dev_buf = hi_send_array; hi_recv_dev_buf = hi_recv_array; } _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0); } if(useHostBuffer){ CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_send_host_buf, lo_buf_size + hi_buf_size)); hi_send_host_buf = (char*)lo_send_host_buf + lo_buf_size; CUDA_SAFE_CALL(cudaMallocHost((void**)&lo_recv_host_buf, lo_buf_size + hi_buf_size)); hi_recv_host_buf = (char*)lo_recv_host_buf + lo_buf_size; mpi_lo_send_buf = lo_send_host_buf; mpi_lo_recv_buf = lo_recv_host_buf; mpi_hi_send_buf = hi_send_host_buf; mpi_hi_recv_buf = hi_recv_host_buf; }else{ mpi_lo_send_buf = lo_send_dev_buf; mpi_lo_recv_buf = lo_recv_dev_buf; mpi_hi_send_buf = hi_send_dev_buf; mpi_hi_recv_buf = hi_recv_dev_buf; } } // for upper reflect // // initialize communication // int src, dst; if (!is_periodic && my_pos == lb_pos){ // no periodic lo_rank = MPI_PROC_NULL; } if (!is_periodic && my_pos == ub_pos){ // no periodic hi_rank = MPI_PROC_NULL; } // for lower shadow if (lwidth){ src = lo_rank; dst = hi_rank; } else { src = MPI_PROC_NULL; dst = MPI_PROC_NULL; } // fprintf(stderr, "dim=%d, lo_src=%d, lo_dst=%d\n", target_dim, src, dst); if (reflect->req[0] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[0]); } if (reflect->req[1] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[1]); } MPI_Recv_init(mpi_lo_recv_buf, 1, reflect->datatype_lo, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]); MPI_Send_init(mpi_lo_send_buf, 1, reflect->datatype_lo, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]); // for upper shadow if (uwidth){ src = hi_rank; dst = lo_rank; } else { src = MPI_PROC_NULL; dst = MPI_PROC_NULL; } // fprintf(stderr, "dim=%d, hi_src=%d, hi_dst=%d\n", target_dim, src, dst); if (reflect->req[2] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[2]); } if (reflect->req[3] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[3]); } MPI_Recv_init(mpi_hi_recv_buf, 1, reflect->datatype_hi, src, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]); MPI_Send_init(mpi_hi_send_buf, 1, reflect->datatype_hi, dst, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]); // // cache schedule // reflect->count = count; reflect->blocklength = blocklength; reflect->stride = stride; reflect->lo_send_array = lo_send_array; reflect->lo_recv_array = lo_recv_array; reflect->hi_send_array = hi_send_array; reflect->hi_recv_array = hi_recv_array; if(packVector){ reflect->lo_send_buf = lo_send_dev_buf; reflect->lo_recv_buf = lo_recv_dev_buf; reflect->hi_send_buf = hi_send_dev_buf; reflect->hi_recv_buf = hi_recv_dev_buf; } if(useHostBuffer){ reflect->lo_send_host_buf = lo_send_host_buf; reflect->lo_recv_host_buf = lo_recv_host_buf; reflect->hi_send_host_buf = hi_send_host_buf; reflect->hi_recv_host_buf = hi_recv_host_buf; } reflect->lo_rank = lo_rank; reflect->hi_rank = hi_rank; // gpu async reflect->lo_async_id = _XMP_alloc(sizeof(cudaStream_t)); CUDA_SAFE_CALL(cudaStreamCreate(reflect->lo_async_id)); if(target_dim != 0 && (!useHostBuffer || (lo_rank != MPI_PROC_NULL && hi_rank != MPI_PROC_NULL && (lo_buf_size / type_size) <= useSingleStreamLimit)) ){ reflect->hi_async_id = NULL; }else{ cudaStream_t *hi_stream = (cudaStream_t*)_XMP_alloc(sizeof(cudaStream_t)); CUDA_SAFE_CALL(cudaStreamCreate(hi_stream)); reflect->hi_async_id = (void*)hi_stream; } reflect->event = _XMP_alloc(sizeof(cudaEvent_t)); CUDA_SAFE_CALL(cudaEventCreateWithFlags(reflect->event, cudaEventDisableTiming)); }
void _XMP_mpi_coarray_malloc(_XMP_coarray_t *coarray_desc, void **addr, const size_t coarray_size, bool is_acc) { char **each_addr = NULL; // gap_size on each node struct _shift_queue_t *shift_queue = is_acc? &_shift_queue_acc : &_shift_queue; size_t shift; char *real_addr = NULL; MPI_Win win = MPI_WIN_NULL; _XMP_nodes_t *nodes = _XMP_get_execution_nodes(); MPI_Comm comm = *(MPI_Comm *)nodes->comm; if(coarray_size == 0){ _XMP_fatal("_XMP_mpi_coarray_malloc: zero size is not allowed"); } if(_XMP_flag_multi_win){ _XMP_mpi_onesided_alloc_win(&win, (void**)&real_addr, coarray_size, comm, is_acc); MPI_Win_lock_all(MPI_MODE_NOCHECK, win); XACC_DEBUG("addr=%p, size=%zd, is_acc=%d", real_addr, coarray_size, is_acc); }else{ each_addr = (char**)_XMP_alloc(sizeof(char *) * _XMP_world_size); for(int i=0;i<_XMP_world_size;i++){ each_addr[i] = (char *)(shift_queue->total_shift); } real_addr = _xmp_mpi_onesided_buf; #ifdef _XMP_XACC if(is_acc){ real_addr = _xmp_mpi_onesided_buf_acc; } #endif real_addr += shift_queue->total_shift; XACC_DEBUG("malloc_do: addr=%p, shift=%zd, is_acc=%d", real_addr, shift_queue->total_shift, is_acc); if(coarray_size % _XMP_MPI_ALIGNMENT == 0) shift = coarray_size; else{ shift = ((coarray_size / _XMP_MPI_ALIGNMENT) + 1) * _XMP_MPI_ALIGNMENT; } _push_shift_queue(shift_queue, shift); size_t total_shift = shift_queue->total_shift; if(total_shift > _xmp_mpi_onesided_heap_size){ fprintf(stderr, "_xmp_mpi_onesided_heap_size=%zd\n", _xmp_mpi_onesided_heap_size); if(_XMP_world_rank == 0){ fprintf(stderr, "[ERROR] Cannot allocate coarray. Heap memory size of coarray is too small.\n"); fprintf(stderr, " Please set the environmental variable \"XMP_ONESIDED_HEAP_SIZE\".\n"); fprintf(stderr, " e.g.) export XMP_ONESIDED_HEAP_SIZE=%zuM (or more).\n", (total_shift/1024/1024)+1); } _XMP_fatal_nomsg(); } } if(is_acc){ #ifdef _XMP_XACC coarray_desc->addr_dev = each_addr; coarray_desc->real_addr_dev = real_addr; coarray_desc->win_acc = win; coarray_desc->nodes = nodes; #endif }else{ coarray_desc->addr = each_addr; coarray_desc->real_addr = real_addr; coarray_desc->win = win; coarray_desc->win_acc = MPI_WIN_NULL; coarray_desc->nodes = nodes; } *addr = real_addr; }
void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim, int lwidth, int uwidth, int is_periodic, int shadow_comm_type){ if (lwidth == 0 && uwidth == 0) return; _XMP_array_info_t *ai = &(adesc->info[target_dim]); _XMP_array_info_t *ainfo = adesc->info; _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK); _XMP_ASSERT(ai->is_shadow_comm_member); if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){ _XMP_fatal("reflect width is larger than shadow width."); } _XMP_reflect_sched_t *reflect = ai->reflect_sched; int target_tdim = ai->align_template_index; _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info; if (ni->size == 1 && !is_periodic) return; int ndims = adesc->dim; // 0-origin int my_pos = ni->rank; int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower); int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper); int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1; int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1; MPI_Comm *comm = adesc->align_template->onto_nodes->comm; int my_rank = adesc->align_template->onto_nodes->comm_rank; int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier; int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier; int count = 0, blocklength = 0; long long stride = 0; int type_size = adesc->type_size; void *array_addr = adesc->array_addr_p; void *lo_send_array = NULL, *lo_recv_array = NULL; void *hi_send_array = NULL, *hi_recv_array = NULL; void *lo_send_buf = NULL; void *lo_recv_buf = NULL; void *hi_send_buf = NULL; void *hi_recv_buf = NULL; int lo_buf_size = 0; int hi_buf_size = 0; if (reflect->prev_pcopy_sched_type && lwidth == reflect->lo_width && uwidth == reflect->hi_width && is_periodic == reflect->is_periodic){ if ((adesc->order == MPI_ORDER_FORTRAN && target_dim != ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim != 0)){ goto init_comm; } else if (reflect->prev_pcopy_sched_type != shadow_comm_type){ count = reflect->count; blocklength = reflect->blocklength; stride = reflect->stride; goto alloc_buf; } } // // setup data_type // if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */ count = 1; blocklength = type_size; stride = ainfo[0].alloc_size * type_size; for (int i = ndims - 2; i >= target_dim; i--){ count *= ainfo[i+1].alloc_size; } for (int i = 1; i <= target_dim; i++){ blocklength *= ainfo[i-1].alloc_size; stride *= ainfo[i].alloc_size; } } else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */ count = 1; blocklength = type_size; stride = ainfo[ndims-1].alloc_size * type_size; for (int i = 1; i <= target_dim; i++){ count *= ainfo[i-1].alloc_size; } for (int i = ndims - 2; i >= target_dim; i--){ blocklength *= ainfo[i+1].alloc_size; stride *= ainfo[i].alloc_size; } } else { _XMP_fatal("cannot determin the base language."); } // // calculate base address // alloc_buf: // for lower reflect if (lwidth){ lo_send_array = array_addr; lo_recv_array = array_addr; for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; unsigned long long dim_acc; if (i == target_dim) { lb_send = ainfo[i].local_upper - lwidth + 1; lb_recv = ainfo[i].shadow_size_lo - lwidth;; } else { // Note: including shadow area lb_send = 0; lb_recv = 0; } dim_acc = ainfo[i].dim_acc; lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size); lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size); } } // for upper reflect if (uwidth){ hi_send_array = array_addr; hi_recv_array = array_addr; for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; unsigned long long dim_acc; if (i == target_dim) { lb_send = ainfo[i].local_lower; lb_recv = ainfo[i].local_upper + 1; } else { // Note: including shadow area lb_send = 0; lb_recv = 0; } dim_acc = ainfo[i].dim_acc; hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size); hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size); } } // // Allocate buffers // if (reflect->prev_pcopy_sched_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ ; } else { _XMP_free(reflect->lo_send_buf); _XMP_free(reflect->lo_recv_buf); _XMP_free(reflect->hi_send_buf); _XMP_free(reflect->hi_recv_buf); } // for lower reflect if (lwidth){ lo_buf_size = lwidth * blocklength * count; if (shadow_comm_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ lo_send_buf = lo_send_array; lo_recv_buf = lo_recv_array; } else { _XMP_TSTART(t0); lo_send_buf = _XMP_alloc(lo_buf_size); lo_recv_buf = _XMP_alloc(lo_buf_size); _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0); } } // for upper reflect if (uwidth){ hi_buf_size = uwidth * blocklength * count; if (shadow_comm_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ hi_send_buf = hi_send_array; hi_recv_buf = hi_recv_array; } else { _XMP_TSTART(t0); hi_send_buf = _XMP_alloc(hi_buf_size); hi_recv_buf = _XMP_alloc(hi_buf_size); _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0); } } // // cache schedule // reflect->count = count; reflect->blocklength = blocklength; reflect->stride = stride; reflect->lo_send_array = lo_send_array; reflect->lo_recv_array = lo_recv_array; reflect->hi_send_array = hi_send_array; reflect->hi_recv_array = hi_recv_array; reflect->lo_send_buf = lo_send_buf; reflect->lo_recv_buf = lo_recv_buf; reflect->hi_send_buf = hi_send_buf; reflect->hi_recv_buf = hi_recv_buf; // // initialize communication // int src, dst; init_comm: if (!is_periodic && my_pos == lb_pos){ // no periodic lo_rank = MPI_PROC_NULL; } if (!is_periodic && my_pos == ub_pos){ // no periodic hi_rank = MPI_PROC_NULL; } lo_buf_size = lwidth * reflect->blocklength * reflect->count; hi_buf_size = uwidth * reflect->blocklength * reflect->count; // for lower shadow if (lwidth){ src = lo_rank; dst = hi_rank; } else { src = MPI_PROC_NULL; dst = MPI_PROC_NULL; } if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){ if (reflect->req_reduce[0] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[0]); } if (reflect->req_reduce[1] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[1]); } MPI_Send_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[0]); MPI_Recv_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[1]); } else { if (reflect->req[0] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[0]); } if (reflect->req[1] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[1]); } MPI_Recv_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]); MPI_Send_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]); } // for upper shadow if (uwidth){ src = hi_rank; dst = lo_rank; } else { src = MPI_PROC_NULL; dst = MPI_PROC_NULL; } if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){ if (reflect->req_reduce[2] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[2]); } if (reflect->req_reduce[3] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[3]); } MPI_Send_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[2]); MPI_Recv_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[3]); } else { if (reflect->req[2] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[2]); } if (reflect->req[3] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[3]); } MPI_Recv_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]); MPI_Send_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]); } reflect->prev_pcopy_sched_type = shadow_comm_type; reflect->lo_rank = lo_rank; reflect->hi_rank = hi_rank; }
void _XMP_reflect_async_ordinal(_XMP_array_t *a, int async_id){ int n = a->dim; _XMP_async_reflect_t *async_reflect; _Bool reusable_sched = false; if (!a->async_reflect){ int max_nreqs = (pow(3, n) - 1) * 2; async_reflect = (_XMP_async_reflect_t *)_XMP_alloc(sizeof(_XMP_async_reflect_t)); async_reflect->datatype = (MPI_Datatype *)_XMP_alloc(sizeof(MPI_Datatype) * max_nreqs); async_reflect->reqs = (MPI_Request *)_XMP_alloc(sizeof(MPI_Request) * max_nreqs); for (int i = 0; i < max_nreqs; i++){ async_reflect->datatype[i] = MPI_DATATYPE_NULL; async_reflect->reqs[i] = MPI_REQUEST_NULL; } async_reflect->nreqs = 0; a->async_reflect = async_reflect; } else { reusable_sched = true; async_reflect = a->async_reflect; for (int i = 0; i < n; i++){ if (async_reflect->lwidth[i] != _xmp_lwidth[i] || async_reflect->uwidth[i] != _xmp_uwidth[i] || async_reflect->is_periodic[i] != _xmp_is_periodic[i]){ reusable_sched = false; break; } } } if (!reusable_sched){ int lb[_XMP_N_MAX_DIM] = { 0 }; int ub[_XMP_N_MAX_DIM] = { 0 }; for (int i = 0; i < n; i++){ async_reflect->lwidth[i] = _xmp_lwidth[i]; async_reflect->uwidth[i] = _xmp_uwidth[i]; async_reflect->is_periodic[i] = _xmp_is_periodic[i]; if (_xmp_lwidth[i] > 0) lb[i] = -1; if (_xmp_uwidth[i] > 0) ub[i] = 1; } for (int i = 0; i < async_reflect->nreqs; i++){ if (async_reflect->datatype[i] != MPI_DATATYPE_NULL) MPI_Type_free(&async_reflect->datatype[i]); if (async_reflect->reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&async_reflect->reqs[i]); } async_reflect->nreqs = 0; int ishadow[_XMP_N_MAX_DIM]; for (ishadow[0] = lb[0]; ishadow[0] <= ub[0]; ishadow[0]++){ for (ishadow[1] = lb[1]; ishadow[1] <= ub[1]; ishadow[1]++){ for (ishadow[2] = lb[2]; ishadow[2] <= ub[2]; ishadow[2]++){ for (ishadow[3] = lb[3]; ishadow[3] <= ub[3]; ishadow[3]++){ for (ishadow[4] = lb[4]; ishadow[4] <= ub[4]; ishadow[4]++){ for (ishadow[5] = lb[5]; ishadow[5] <= ub[5]; ishadow[5]++){ for (ishadow[6] = lb[6]; ishadow[6] <= ub[6]; ishadow[6]++){ // When ishadow > 0, upper shadow is to be updated, and vice versa. int nnzero = 0; for (int i = 0; i < n; i++){ if (ishadow[i] != 0) nnzero++; } if (nnzero == 0) continue; _XMP_reflect_sched_dir(a, ishadow, _xmp_lwidth, _xmp_uwidth, _xmp_is_periodic); }}}}}}} } _XMP_async_comm_t *async = _XMP_get_current_async(); MPI_Request *reqs = &async->reqs[async->nreqs]; // copy to async if (async->nreqs + async_reflect->nreqs > _XMP_MAX_ASYNC_REQS){ _XMP_fatal("too many arrays in an asynchronous reflect"); } memcpy(reqs, async_reflect->reqs, async_reflect->nreqs * sizeof(MPI_Request)); async->nreqs += async_reflect->nreqs; _XMP_TSTART(t0); MPI_Startall(async_reflect->nreqs, reqs); _XMP_TEND(xmptiming_.t_start, t0); }
void xmpf_array_alloc__(_XMP_array_t **a_desc, int *n_dim, int *type, _XMP_template_t **t_desc) { _XMP_array_t *a = _XMP_alloc(sizeof(_XMP_array_t) + sizeof(_XMP_array_info_t) * (*n_dim - 1)); // moved to xmpf_align_info //a->is_allocated = (*t_desc)->is_owner; a->is_align_comm_member = false; a->dim = *n_dim; a->type = *type; a->type_size = _XMP_get_datatype_size(a->type); size_t dummy; _XMP_setup_reduce_type(&a->mpi_type, &dummy, *type); a->order = MPI_ORDER_FORTRAN; a->total_elmts = 0; a->async_reflect = NULL; a->align_comm = NULL; a->align_comm_size = 1; a->align_comm_rank = _XMP_N_INVALID_RANK; a->array_nodes = NULL; #ifdef _XMP_MPI3_ONESIDED a->coarray = NULL; #endif //a->num_reqs = -1; //a->mpi_req_shadow = _XMP_alloc(sizeof(MPI_Request) * 4 * (*n_dim)); a->align_template = *t_desc; *a_desc = a; for (int i = 0; i < *n_dim; i++) { _XMP_array_info_t *ai = &(a->info[i]); ai->is_shadow_comm_member = false; ai->ser_lower = 0; ai->ser_upper = 0; ai->ser_size = 0; ai->par_lower = 0; ai->par_upper = 0; ai->par_stride = 0; ai->par_size = 0; ai->local_lower = 0; ai->local_upper = 0; ai->local_stride = 0; ai->alloc_size = 0; ai->shadow_type = _XMP_N_SHADOW_NONE; ai->shadow_size_lo = 0; ai->shadow_size_hi = 0; ai->reflect_sched = NULL; ai->shadow_comm = NULL; ai->shadow_comm_size = 1; ai->shadow_comm_rank = _XMP_N_INVALID_RANK; } //xmpf_dbg_printf("xmpf_array_alloc ends\n"); }
static void _XMP_reflect_sched(_XMP_array_t *a, int *lwidth, int *uwidth, int *is_periodic, int is_async, void *dev_addr) { _XMP_TSTART(t0); for (int i = 0; i < a->dim; i++){ _XMP_array_info_t *ai = &(a->info[i]); if (ai->shadow_type == _XMP_N_SHADOW_NONE){ continue; } else if (ai->shadow_type == _XMP_N_SHADOW_NORMAL){ _XMP_reflect_sched_t *reflect = ai->reflect_acc_sched; if(reflect == NULL){ reflect = _XMP_alloc(sizeof(_XMP_reflect_sched_t)); reflect->is_periodic = -1; /* not used yet */ reflect->datatype_lo = MPI_DATATYPE_NULL; reflect->datatype_hi = MPI_DATATYPE_NULL; for (int j = 0; j < 4; j++) reflect->req[j] = MPI_REQUEST_NULL; reflect->lo_send_buf = NULL; reflect->lo_recv_buf = NULL; reflect->hi_send_buf = NULL; reflect->hi_recv_buf = NULL; reflect->lo_send_host_buf = NULL; reflect->lo_recv_host_buf = NULL; reflect->hi_send_host_buf = NULL; reflect->hi_recv_host_buf = NULL; ai->reflect_acc_sched = reflect; }else{ // } if (1/*lwidth[i] || uwidth[i]*/){ _XMP_ASSERT(reflect); if (reflect->is_periodic == -1 /* not set yet */ || lwidth[i] != reflect->lo_width || uwidth[i] != reflect->hi_width || is_periodic[i] != reflect->is_periodic){ reflect->lo_width = lwidth[i]; reflect->hi_width = uwidth[i]; reflect->is_periodic = is_periodic[i]; if (/*_xmp_reflect_pack_flag && !is_async*/ 1){ _XMP_reflect_pcopy_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i], dev_addr, lwidth, uwidth); } else { //_XMP_reflect_normal_sched_dim(a, i, lwidth[i], uwidth[i], is_periodic[i]); } } } } else { /* _XMP_N_SHADOW_FULL */ ; } } _XMP_TEND(xmptiming_.t_sched, t0); }