void AllreduceEngine::Allgather(char* input, int send_size, char* output) { //assign blocks block_start_[0] = 0; block_len_[0] = send_size; for (int i = 1; i < num_machines_; ++i) { block_start_[i] = block_start_[i - 1] + block_len_[i - 1]; block_len_[i] = send_size; } Allgather(input, send_size * num_machines_, block_start_, block_len_, output); }
// REVIEW(feiga): the third argument type_size never used void AllreduceEngine::AllreduceByAllGather(char* input, int input_size, int, char* output, ReduceFunction reducer) { //assign blocks int all_size = input_size * num_machines_; block_start_[0] = 0; block_len_[0] = input_size; for (int i = 1; i < num_machines_; ++i) { block_start_[i] = block_start_[i - 1] + block_len_[i - 1]; block_len_[i] = input_size; } if (input_size*num_machines_ > buffer_size_) { delete[] buffer_; buffer_size_ = input_size*num_machines_; buffer_ = new char[buffer_size_]; } Allgather(input, all_size, block_start_, block_len_, buffer_); for (int i = 1; i < num_machines_; ++i) { reducer(buffer_ + block_start_[i], buffer_ + block_start_[0], input_size); } std::memcpy(output, buffer_, input_size); }
void AllreduceEngine::Allreduce(char* input, int input_size, int type_size, char* output, ReduceFunction reducer) { int count = input_size / type_size; //if small package or small count , do it by all gather.(reduce the communication times.) if (count < num_machines_ || input_size < 4096) { AllreduceByAllGather(input, input_size, type_size, output, reducer); return; } //assign the blocks to every rank_s. int step = (count + num_machines_ - 1) / num_machines_; if (step < 1) { step = 1; } block_start_[0] = 0; for (int i = 0; i < num_machines_ - 1; ++i) { block_len_[i] = step * type_size < input_size - block_start_[i] ? step * type_size : input_size - block_start_[i]; block_start_[i + 1] = block_start_[i] + block_len_[i]; } block_len_[num_machines_ - 1] = input_size - block_start_[num_machines_ - 1]; //do reduce scatter ReduceScatter(input, input_size, type_size, block_start_, block_len_, output, reducer); //do all gather Allgather(output, input_size, block_start_, block_len_, output); }
/*==========================================================================*/
/* build_cp_comm_pkg_dvr:                                                   */
/*   Fills in the per-processor decomposition of states and grid            */
/*   coefficients, for the up and then the down states, in                  */
/*   cp->cpcoeffs_info and in the communication packages                    */
/*   cp->cp_comm_state_pkg_dvr_up / cp->cp_comm_state_pkg_dvr_dn.           */
/*                                                                          */
/*   cp    : structure that is read (cpcoeffs_info, communicate) and        */
/*           updated in place                                               */
/*   world : communicator stored in both packages and passed to the         */
/*           Allreduce/Allgather calls below                                */
/*                                                                          */
/*   NOTE(review): Comm_dup, Allreduce and Allgather are defined outside    */
/*   this view; presumably wrappers around the MPI calls of the same        */
/*   name -- confirm.                                                       */
/*   NOTE(review): cp->communicate.np_states is used as a divisor, so it    */
/*   is assumed to be >= 1 -- confirm against the caller.                   */
/*==========================================================================*/
void build_cp_comm_pkg_dvr(CP *cp,MPI_Comm world)
/*==========================================================================*/
  {/* begin routine */
/*==========================================================================*/
/*        Local variable declarations                                       */
#include "../typ_defs/typ_mask.h"

  int irem,idiv,iii;
  /* NOTE(review): nstate_ncoef_proc_min is only referenced from            */
  /* commented-out code below; it is otherwise unused.                      */
  int nstate_ncoef_proc_max,nstate_ncoef_proc_min;
  int num_coef,*num_coef_v,ncoef_proc,ncoef_proc_yz;

/*==========================================================================*/
/* I) Up states                                                             */

 /*------------------------------------*/
 /* i) states per processor stuff      */

  /* Every processor gets idiv states; the first irem processors            */
  /* (myid_state < irem) get one extra.                                     */
  idiv = cp->cpcoeffs_info.nstate_up/cp->communicate.np_states;
  irem = (cp->cpcoeffs_info.nstate_up % cp->communicate.np_states);
  cp->cpcoeffs_info.nstate_up_proc = idiv;
  if(cp->communicate.myid_state < irem) {
     cp->cpcoeffs_info.nstate_up_proc = idiv+1;
  }/*endif*/
  /* Index of this processor's first state; the +1 makes it 1 for           */
  /* myid_state == 0. Both branches give the same value when                */
  /* myid_state == irem.                                                    */
  if(cp->communicate.myid_state <= irem) {
    cp->cpcoeffs_info.istate_up_st = cp->communicate.myid_state*(idiv+1)+1;
  } else {
    cp->cpcoeffs_info.istate_up_st = irem*(idiv+1) + (cp->communicate.myid_state-irem)*idiv+1;
  }/*endif*/
  /* Index of this processor's last state (inclusive).                      */
  cp->cpcoeffs_info.istate_up_end = cp->cpcoeffs_info.istate_up_st + cp->cpcoeffs_info.nstate_up_proc-1;

 /*------------------------------------*/
 /* ii) coefs per processor stuff      */

  /* Copy the basic layout into the up-state communication package.         */
  cp->cp_comm_state_pkg_dvr_up.num_proc = cp->communicate.np_states;
  cp->cp_comm_state_pkg_dvr_up.myid = cp->communicate.myid_state;
  cp->cp_comm_state_pkg_dvr_up.nstate = cp->cpcoeffs_info.nstate_up;
  cp->cp_comm_state_pkg_dvr_up.ncoef = cp->cpcoeffs_info.ncoef;
  cp->cp_comm_state_pkg_dvr_up.nstate_proc= cp->cpcoeffs_info.nstate_up_proc;
  cp->cp_comm_state_pkg_dvr_up.world = world;

  /* With more than one state processor the package gets its own            */
  /* communicator through Comm_dup; otherwise it reuses comm_states.        */
  if(cp->communicate.np_states > 1){
    Comm_dup(cp->communicate.comm_states,&(cp->cp_comm_state_pkg_dvr_up.comm));
  } else {
    cp->cp_comm_state_pkg_dvr_up.comm = cp->communicate.comm_states;
  }/* endif */

  /* idiv still holds nstate_up/np_states at this point.                    */
  irem = (cp->cp_comm_state_pkg_dvr_up.nstate % 
          cp->cp_comm_state_pkg_dvr_up.num_proc);
  cp->cp_comm_state_pkg_dvr_up.nstate_proc_max = (irem > 0 ? idiv+1 : idiv);
  /* nstate_max = nstate_proc_max * np_states                               */
  cp->cp_comm_state_pkg_dvr_up.nstate_max = (irem > 0 ? ((idiv+1)*cp->communicate.np_states) : (idiv*cp->communicate.np_states)) ;

  /* different from PW code*/

  cp->cp_comm_state_pkg_dvr_up.nstate_proc_min = idiv;

  /* Split the grid_ny*grid_nz points over the processors (the first irem   */
  /* processors get one extra); each such point carries grid_nx             */
  /* coefficients. idiv and irem are reused here and no longer describe     */
  /* the state split.                                                       */
  idiv = (cp->cpcoeffs_info.grid_ny)*(cp->cpcoeffs_info.grid_nz)/ (cp->cp_comm_state_pkg_dvr_up.num_proc);
  irem = (cp->cpcoeffs_info.grid_ny * cp->cpcoeffs_info.grid_nz) % cp->cp_comm_state_pkg_dvr_up.num_proc;
  ncoef_proc_yz = (cp->communicate.myid_state < irem ? idiv+1 : idiv);
  ncoef_proc = ncoef_proc_yz * (cp->cpcoeffs_info.grid_nx);

  cp->cpcoeffs_info.nstate_ncoef_proc_up = ncoef_proc;
  cp->cp_comm_state_pkg_dvr_up.nstate_ncoef_proc = cp->cpcoeffs_info.nstate_ncoef_proc_up;

  /* Obtain nstate_ncoef_proc_max: via Allreduce with MPI_MAX over world    */
  /* when running on several state processors, else the local value.        */
  if(cp->communicate.np_states > 1){
    Allreduce(&(cp->cpcoeffs_info.nstate_ncoef_proc_up), &nstate_ncoef_proc_max, 1,MPI_INT,MPI_MAX,0,world);
    /* Not defined anymore
    Allreduce(&(cp->cpcoeffs_info.nstate_ncoef_proc_up), &nstate_ncoef_proc_min, 1,MPI_INT,MPI_MIN,0,world);
    */
  }else{
    nstate_ncoef_proc_max = cp->cpcoeffs_info.nstate_ncoef_proc_up;
    /* nstate_ncoef_proc_min = cp->cpcoeffs_info.nstate_ncoef_proc_up; */
  }
  cp->cpcoeffs_info.nstate_ncoef_proc_max_up = nstate_ncoef_proc_max;
  cp->cp_comm_state_pkg_dvr_up.nstate_ncoef_proc_max = nstate_ncoef_proc_max;
  /*cp->cp_comm_state_pkg_dvr_up.nstate_ncoef_proc_min = nstate_ncoef_proc_min; */

  /* icoef_start_up = 1 + sum of the coefficient counts gathered into       */
  /* num_coef_v[1..myid].                                                   */
  if(cp->communicate.np_states > 1){
    num_coef = cp->cp_comm_state_pkg_dvr_up.nstate_ncoef_proc;
    /* The pointer is shifted by -1 so that num_coef_v[1..np_states]        */
    /* addresses the allocated storage; it is released at the end of the    */
    /* routine through &num_coef_v[1].                                      */
    /* NOTE(review): the malloc result is not checked.                      */
    num_coef_v = (int *) malloc((cp->communicate.np_states)*sizeof(int))-1;
    /* NOTE(review): the gather runs over world into a buffer sized for     */
    /* np_states entries -- presumably world has np_states ranks; confirm.  */
    Allgather(&num_coef,1,MPI_INT,&num_coef_v[1],1,MPI_INT,0,world);
    cp->cpcoeffs_info.icoef_start_up = 1;
    /* NOTE(review): the bound is communicate.myid, whereas the rest of     */
    /* the routine uses communicate.myid_state -- confirm this is intended. */
    for(iii=1; iii <= cp->communicate.myid; iii++){
      cp->cpcoeffs_info.icoef_start_up += num_coef_v[iii];
    }
    cp->cp_comm_state_pkg_dvr_up.icoef_start = cp->cpcoeffs_info.icoef_start_up;
  }else{
    cp->cpcoeffs_info.icoef_start_up = 1;
    cp->cp_comm_state_pkg_dvr_up.icoef_start = 1;
  }

/*==========================================================================*/
/* II) Down states                                                          */

 /*------------------------------------*/
 /* i) states per processor stuff      */

  /* Same split as for the up states, applied to nstate_dn.                 */
  idiv = cp->cpcoeffs_info.nstate_dn/cp->communicate.np_states;
  irem = (cp->cpcoeffs_info.nstate_dn % cp->communicate.np_states);
  cp->cpcoeffs_info.nstate_dn_proc = idiv;
  if(cp->communicate.myid_state < irem) {
     cp->cpcoeffs_info.nstate_dn_proc = idiv+1;
  }/*endif*/
  if(cp->communicate.myid_state <= irem) {
    cp->cpcoeffs_info.istate_dn_st = cp->communicate.myid_state*(idiv+1)+1;
  } else {
    cp->cpcoeffs_info.istate_dn_st = irem*(idiv+1) + (cp->communicate.myid_state-irem)*idiv+1;
  }/*endif*/
  cp->cpcoeffs_info.istate_dn_end = cp->cpcoeffs_info.istate_dn_st + cp->cpcoeffs_info.nstate_dn_proc-1;

 /*------------------------------------*/
 /* ii) coefs per processor stuff      */

  cp->cp_comm_state_pkg_dvr_dn.num_proc = cp->communicate.np_states;
  cp->cp_comm_state_pkg_dvr_dn.myid = cp->communicate.myid_state;
  cp->cp_comm_state_pkg_dvr_dn.nstate = cp->cpcoeffs_info.nstate_dn;
  cp->cp_comm_state_pkg_dvr_dn.ncoef = cp->cpcoeffs_info.ncoef;
  cp->cp_comm_state_pkg_dvr_dn.nstate_proc= cp->cpcoeffs_info.nstate_dn_proc;
  cp->cp_comm_state_pkg_dvr_dn.world = world;

  if(cp->communicate.np_states > 1){
    Comm_dup(cp->communicate.comm_states,&(cp->cp_comm_state_pkg_dvr_dn.comm));
  } else {
    cp->cp_comm_state_pkg_dvr_dn.comm = cp->communicate.comm_states;
  }/* endif */

  irem = (cp->cp_comm_state_pkg_dvr_dn.nstate % cp->cp_comm_state_pkg_dvr_dn.num_proc);
  cp->cp_comm_state_pkg_dvr_dn.nstate_proc_max = (irem > 0 ? idiv+1 : idiv);
  cp->cp_comm_state_pkg_dvr_dn.nstate_max = (irem > 0 ? 
                          ((idiv+1)*cp->communicate.np_states) : (idiv*cp->communicate.np_states)) ;
  cp->cp_comm_state_pkg_dvr_dn.nstate_proc_min = idiv;

  /* The grid split is not recomputed: ncoef_proc from the up-state         */
  /* section above is reused for the down states.                           */
  cp->cpcoeffs_info.nstate_ncoef_proc_dn = ncoef_proc;
  cp->cp_comm_state_pkg_dvr_dn.nstate_ncoef_proc = cp->cpcoeffs_info.nstate_ncoef_proc_dn;

  if(cp->communicate.np_states > 1){
    Allreduce(&(cp->cpcoeffs_info.nstate_ncoef_proc_dn), &nstate_ncoef_proc_max, 1,MPI_INT,MPI_MAX,0,world);
    /*
    Allreduce(&(cp->cpcoeffs_info.nstate_ncoef_proc_dn), &nstate_ncoef_proc_min, 1,MPI_INT,MPI_MIN,0,world);
    */
  }else{
    nstate_ncoef_proc_max = cp->cpcoeffs_info.nstate_ncoef_proc_dn;
    /* nstate_ncoef_proc_min = cp->cpcoeffs_info.nstate_ncoef_proc_dn; */
  }
  cp->cpcoeffs_info.nstate_ncoef_proc_max_dn = nstate_ncoef_proc_max;
  cp->cp_comm_state_pkg_dvr_dn.nstate_ncoef_proc_max = cp->cpcoeffs_info.nstate_ncoef_proc_max_dn;
  /* cp->cp_comm_state_pkg_dvr_dn.nstate_ncoef_proc_min = nstate_ncoef_proc_min; */

  if(cp->communicate.np_states > 1){
    /* Reuse the buffer allocated in the up-state section; clear it first.  */
    for(iii=1; iii <= cp->communicate.np_states; iii++){
      num_coef_v[iii] = 0;
    }
    num_coef = cp->cp_comm_state_pkg_dvr_dn.nstate_ncoef_proc;
    Allgather(&num_coef,1,MPI_INT,&num_coef_v[1],1,MPI_INT,0,world);
    cp->cpcoeffs_info.icoef_start_dn = 1;
    /* NOTE(review): same communicate.myid bound as in the up-state loop.   */
    for(iii=1; iii <= cp->communicate.myid; iii++){
      cp->cpcoeffs_info.icoef_start_dn += num_coef_v[iii];
    }
    cp->cp_comm_state_pkg_dvr_dn.icoef_start = cp->cpcoeffs_info.icoef_start_dn;
  }else{
    cp->cpcoeffs_info.icoef_start_dn = 1;
    cp->cp_comm_state_pkg_dvr_dn.icoef_start = 1;
  }

  /* num_coef_v was only allocated when np_states > 1; &num_coef_v[1] is    */
  /* the address originally returned by malloc.                             */
  if(cp->communicate.np_states > 1){
    free(&num_coef_v[1]);
  }

/*==========================================================================*/
   }/* end routine */