void sg_route(struct Domain *theDomain, struct poisson *thePoisson){
   // struct Tridiagsys* thetridiag=alloc_tridiagsys(thePoisson);
   tridiag_setup(thePoisson);
   int N_r = thePoisson->N_r;
   int N_z = thePoisson->N_z;
   int N_p = thePoisson->N_p;
   int N_k = thePoisson->N_k;
   int i, j, k;
   int size = thePoisson->size;
   int rank = theDomain->rank;
   cylinder_interp(theDomain, thePoisson);
   set_bndry(theDomain, thePoisson);
   density_fft(thePoisson);
   mpi_arrange(thePoisson->density, thePoisson->buffer, thePoisson);
   double *buffersend  = thePoisson->buffer;
   double *bufferstore = thePoisson->density;
   int *sendcnts = thePoisson->sendcnts;
   int *sdispls  = thePoisson->sdispls;
   int *recvcnts = thePoisson->recvcnts;
   int *rdispls  = thePoisson->rdispls;
   MPI_Alltoallv(buffersend, sendcnts, sdispls, MPI_DOUBLE,
                 bufferstore, recvcnts, rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
   sinefft(thePoisson);
   solveVp(rank, thePoisson);
   sinefft(thePoisson);
   /*
   if(rank==0){
      int i,j,k;
      i=0; j=0; k=0;
      FILE *f;
      f=fopen("poten.dat","w");
      for(i=0;i<thePoisson->N_r_glob;i++)
         for(k=0;k<thePoisson->N_z_glob;k++)
            fprintf(f,"%f %d %f\n",i+0.5,k,thePoisson->density[in_lookup(thePoisson,k,i,0)]);
      fclose(f);
   }
   */
   buffersend  = thePoisson->density;
   bufferstore = thePoisson->buffer;
   // This is the inverse MPI communication, so sendcnts and recvcnts are swapped.
   MPI_Alltoallv(buffersend, recvcnts, rdispls, MPI_DOUBLE,
                 bufferstore, sendcnts, sdispls, MPI_DOUBLE, MPI_COMM_WORLD);
   inverse_mpi_arrange(thePoisson->buffer, thePoisson->density, thePoisson);
   inverse_fft(thePoisson);
   int direction = 0;
   for(direction = 0; direction < 3; direction++){
      cal_force(thePoisson, direction);
      disco_force_interp(theDomain, thePoisson, direction);
   }
   // disco_interp(theSim,theCells,thePoisson);
   // destroy_tridiagsys(thetridiag);
}
static void apply(const plan *ego_, R *I, R *O) { const P *ego = (const P *) ego_; plan_rdft *cld1, *cld2, *cld2rest, *cld3; /* transpose locally to get contiguous chunks */ cld1 = (plan_rdft *) ego->cld1; if (cld1) { cld1->apply(ego->cld1, I, O); /* transpose chunks globally */ if (ego->equal_blocks) MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE, I, ego->recv_block_sizes[0], FFTW_MPI_TYPE, ego->comm); else MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets, FFTW_MPI_TYPE, I, ego->recv_block_sizes, ego->recv_block_offsets, FFTW_MPI_TYPE, ego->comm); } else { /* TRANSPOSED_IN, no need to destroy input */ /* transpose chunks globally */ if (ego->equal_blocks) MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE, O, ego->recv_block_sizes[0], FFTW_MPI_TYPE, ego->comm); else MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets, FFTW_MPI_TYPE, O, ego->recv_block_sizes, ego->recv_block_offsets, FFTW_MPI_TYPE, ego->comm); I = O; /* final transpose (if any) is in-place */ } /* transpose locally, again, to get ordinary row-major */ cld2 = (plan_rdft *) ego->cld2; if (cld2) { cld2->apply(ego->cld2, I, O); cld2rest = (plan_rdft *) ego->cld2rest; if (cld2rest) { /* leftover from unequal block sizes */ cld2rest->apply(ego->cld2rest, I + ego->rest_Ioff, O + ego->rest_Ooff); cld3 = (plan_rdft *) ego->cld3; if (cld3) cld3->apply(ego->cld3, O, O); /* else TRANSPOSED_OUT is true and user wants O transposed */ } } }
double time_alltoallv(struct collParams* p) { int i, j, size2; int disp = 0; for ( i = 0; i < p->nranks; i++) { int size2 = (i+p->myrank) % (p->size+1); sendcounts[i] = size2; recvcounts[i] = size2; sdispls[i] = disp; rdispls[i] = disp; disp += size2; } MPI_Barrier(MPI_COMM_WORLD); size2 = p->myrank % (p->size+1); __TIME_START__; for (i = 0; i < p->iter; i++) { MPI_Alltoallv(sbuffer, sendcounts, sdispls, p->type, rbuffer, recvcounts, rdispls, p->type, p->comm); __BAR__(p->comm); } __TIME_END__; if (check_buffers) { check_sbuffer(p->myrank); for (i = 0; i < p->nranks; i++) { disp = 0; for (j = 0; j < p->myrank; j++) { disp += (j+i) % (p->size+1); } check_rbuffer(rbuffer, rdispls[i], i, disp, recvcounts[i]); } } return __TIME_USECS__ / (double)p->iter; }
void remap(){ MPI_Allgather(num,mask,MPI_INT,cnt,mask,MPI_INT,MPI_COMM_WORLD); int arrStart = 0; int arrEnd = 0; int allStart = 0; for (int i=0;i<mask;++i){ spf[0] = allStart; for (int j=0;j<size;++j){ spf[j+1] = spf[j]+cnt[j*mask+i]; } for (int j=0;j<size;++j){ if (spf[rank]>j*len+len-1 || spf[rank+1]-1<j*len){ sdispls[j] = arrStart; sendcounts[j] = 0; } else { sdispls[j] = arrStart; sendcounts[j] = std::min(spf[rank+1],j*len+len)-std::max(j*len,spf[rank]); arrStart += sendcounts[j]; } if (spf[j]>rank*len+len-1 || spf[j+1]-1<rank*len){ rdispls[j] = arrEnd; recvcounts[j] = 0; } else { rdispls[j] = arrEnd; recvcounts[j] = std::min(spf[j+1],rank*len+len)-std::max(rank*len,spf[j]); arrEnd += recvcounts[j]; } } MPI_Alltoallv(tmpData,sendcounts,sdispls,MPI_DT_,data,recvcounts,rdispls,MPI_DT_,MPI_COMM_WORLD); allStart = spf[size]; } }
int ZMPI_Alltoall_int_proclists_alltoallv(int *sendbuf, int nsprocs, int *sprocs, int *recvbuf, int nrprocs, int *rprocs, MPI_Comm comm) /* zmpi_func ZMPI_Alltoall_int_proclists_alltoallv */ { int i, size; int *scounts2, *sdispls2, *rcounts2, *rdispls2; MPI_Comm_size(comm, &size); scounts2 = z_alloc(4 * size, sizeof(int)); sdispls2 = scounts2 + 1 * size; rcounts2 = scounts2 + 2 * size; rdispls2 = scounts2 + 3 * size; for (i = 0; i < size; ++i) { scounts2[i] = rcounts2[i] = DEFAULT_INT; sdispls2[i] = rdispls2[i] = i; recvbuf[i] = 0; } for (i = 0; i < nsprocs; ++i) scounts2[sprocs[i]] = 1; for (i = 0; i < nrprocs; ++i) rcounts2[rprocs[i]] = 1; MPI_Alltoallv(sendbuf, scounts2, sdispls2, MPI_INT, recvbuf, rcounts2, rdispls2, MPI_INT, comm); z_free(scounts2); return MPI_SUCCESS; }
/* Out-of-place version of transpose_mpi (or rather, in place using a scratch array): */ static void transpose_mpi_out_of_place(transpose_mpi_plan p, int el_size, TRANSPOSE_EL_TYPE *local_data, TRANSPOSE_EL_TYPE *work) { local_transpose_copy(local_data, work, el_size, p->local_nx, p->ny); if (p->all_blocks_equal) MPI_Alltoall(work, p->send_block_size * el_size, p->el_type, local_data, p->recv_block_size * el_size, p->el_type, p->comm); else { int i, n_pes = p->n_pes; for (i = 0; i < n_pes; ++i) { p->send_block_sizes[i] *= el_size; p->recv_block_sizes[i] *= el_size; p->send_block_offsets[i] *= el_size; p->recv_block_offsets[i] *= el_size; } MPI_Alltoallv(work, p->send_block_sizes, p->send_block_offsets, p->el_type, local_data, p->recv_block_sizes, p->recv_block_offsets, p->el_type, p->comm); for (i = 0; i < n_pes; ++i) { p->send_block_sizes[i] /= el_size; p->recv_block_sizes[i] /= el_size; p->send_block_offsets[i] /= el_size; p->recv_block_offsets[i] /= el_size; } } do_permutation(local_data, p->perm_block_dest, p->num_perm_blocks, p->perm_block_size * el_size); }
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
void Image_Exchanger::exchange_fragment_images(unsigned int* databuf,
                                               int nviewer,
                                               ImageFragment_Tile* ift)
{
//  fprintf(stderr, "**** %s:%s() ****\n", __FILE__, __func__);
#ifdef _DEBUG7
    fprintf(stderr, "**** %s:%s() ****\n", __FILE__, __func__);
#endif

    unsigned int* sendbuf = databuf + m_sbuf_offset;
    unsigned int* recvbuf = databuf + m_rbuf_offset;

    if(nviewer == 1)
    {
        MPI_Gatherv((int*)sendbuf, m_scounts[0], MPI_INT,
                    (int*)recvbuf, m_rcounts, m_rdispls, MPI_INT,
                    0, MPI_COMM_WORLD);
    }
    else
    {
        MPI_Alltoallv((int*)sendbuf, m_scounts, m_sdispls, MPI_INT,
                      (int*)recvbuf, m_rcounts, m_rdispls, MPI_INT,
                      MPI_COMM_WORLD);
    }

    ift->address_fragments(m_rbuf_offset, m_rdispls);
}
void mpi_alltoallv(void *sendbuf, int *sendcount, int *sdispls, int *sendtype, void *recvbuf, int *recvcount, int *rdispls, int *recvtype, int *comm, int *ierr) { *ierr = MPI_Alltoallv(sendbuf, sendcount, sdispls, *sendtype, recvbuf, recvcount, rdispls, *recvtype, *comm); return; }
FORT_DLL_SPEC void FORT_CALL mpi_alltoallv_ ( void*v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4,
                                              void*v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8,
                                              MPI_Fint *v9, MPI_Fint *ierr ){
#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif
    if (v1 == MPIR_F_MPI_IN_PLACE) v1 = MPI_IN_PLACE;
    *ierr = MPI_Alltoallv( v1, v2, v3, (MPI_Datatype)(*v4),
                           v5, v6, v7, (MPI_Datatype)(*v8), (MPI_Comm)(*v9) );
}
void mpiColumnMatrixTranspose(ColumnMatrix recvData, ColumnMatrix recvBuffer,
                              ColumnMatrix sendData, ColumnMatrix sendBuffer)
{
  packTansposeData(sendData, sendBuffer);
  // All processors send and receive the same amount of doubles with the same processor.
  MPI_Alltoallv(sendBuffer->data, sendBuffer->blockSize, sendBuffer->displacement, MPI_DOUBLE,
                recvBuffer->data, sendBuffer->blockSize, sendBuffer->displacement, MPI_DOUBLE,
                *sendData->comm);
  unpackTansposeData(recvData, recvBuffer);
}
int binGraph::exchange_edges(uint64_t m_read, uint64_t* read_edges, int32_t* ranks,etype t) { int32_t* scounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* rcounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* sdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* sdispls_cpy = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* rdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); for (int i = 0; i < PCU_Comm_Peers(); ++i) { scounts[i] = 0; rcounts[i] = 0; sdispls[i] = 0; sdispls_cpy[i] = 0; rdispls[i] = 0; } uint64_t n_per_rank = num_global_verts / PCU_Comm_Peers() + 1; for (uint64_t i = 0; i < m_read*2; i+=2) { uint64_t vert = read_edges[i]; int vert_task = ranks[vert]; scounts[vert_task] += 2; } MPI_Alltoall(scounts, 1, MPI_INT32_T, rcounts, 1, MPI_INT32_T, PCU_Get_Comm()); for (uint64_t i = 1; i < PCU_Comm_Peers(); ++i) { sdispls[i] = sdispls[i-1] + scounts[i-1]; sdispls_cpy[i] = sdispls[i]; rdispls[i] = rdispls[i-1] + rcounts[i-1]; } int32_t total_send = sdispls[PCU_Comm_Peers()-1] + scounts[PCU_Comm_Peers()-1]; int32_t total_recv = rdispls[PCU_Comm_Peers()-1] + rcounts[PCU_Comm_Peers()-1]; uint64_t* sendbuf = (uint64_t*)malloc(total_send*sizeof(uint64_t)); edge_list[t] = (uint64_t*)malloc(total_recv*sizeof(uint64_t)); num_local_edges[t] = total_recv / 2; for (uint64_t i = 0; i < m_read*2; i+=2) { uint64_t vert1 = read_edges[i]; uint64_t vert2 = read_edges[i+1]; int vert_task = ranks[vert1]; sendbuf[sdispls_cpy[vert_task]++] = vert1; sendbuf[sdispls_cpy[vert_task]++] = vert2; } MPI_Alltoallv(sendbuf, scounts, sdispls, MPI_UINT64_T, edge_list[t], rcounts, rdispls, MPI_UINT64_T, PCU_Get_Comm()); free(sendbuf); return 0; }
void mpi_alltoallv (void *sendbuf, MPI_Fint *sendcnts, MPI_Fint *sdispls, MPI_Fint *sendtype, void *recvbuf, MPI_Fint *recvcnts, MPI_Fint *rdispls, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *__ierr) { *__ierr = MPI_Alltoallv (sendbuf, sendcnts, sdispls, MPI_Type_f2c (*sendtype), recvbuf, recvcnts, rdispls, MPI_Type_f2c (*recvtype), MPI_Comm_f2c (*comm)); }
void dummy_operations::run_collective_dummy_operations() {
    int rank, size;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank);
    MPI_Comm_size( MPI_COMM_WORLD, &size);

    // Run Broadcast
    {
        int x;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        MPI_Bcast(&x, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    // Run Allgather.
    {
        int x, size;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        MPI_Comm_size( MPI_COMM_WORLD, &size);
        std::vector<int> rcv(size);
        MPI_Allgather(&x, 1, MPI_INT, &rcv[0], 1, MPI_INT, MPI_COMM_WORLD);
    }

    // Run Allreduce.
    {
        int x;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        int y = 0;
        MPI_Allreduce(&x, &y, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }

    // Dummy Prefix Sum
    {
        int x = 1;
        int y = 0;
        MPI_Scan(&x, &y, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }

    // Run Alltoallv.
    {
        std::vector<int> snd(size);
        std::vector<int> rcv(size);
        std::vector<int> scounts(size, 1);
        std::vector<int> rcounts(size, 1);
        std::vector<int> sdispls(size);
        std::vector<int> rdispls(size);
        for (int i = 0, iend = sdispls.size(); i < iend; ++i) {
            sdispls[i] = rdispls[i] = i;
        }
        MPI_Alltoallv(&snd[0], &scounts[0], &sdispls[0], MPI_INT,
                      &rcv[0], &rcounts[0], &rdispls[0], MPI_INT, MPI_COMM_WORLD);
    }
}
FC_FUNC( mpi_alltoallv , MPI_ALLTOALLV ) ( void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *comm, int *ierror ) { *ierror=MPI_Alltoallv(sendbuf, sendcounts, sdispls, *sendtype, recvbuf, recvcounts, rdispls, *recvtype, *comm); }
void mpiMatrix_transpose(struct mpiMatrix *matrix, struct mpi_com *uplink) { mpiMatrix_serialiseForSending(matrix, uplink); int *SRcounts = mpiMatrix_genCounts(matrix, uplink); int *SRdispl = mpiMatrix_genDispl(uplink, SRcounts); MPI_Alltoallv(matrix->data, SRcounts , SRdispl, MPI_DOUBLE, matrix->aux, SRcounts, SRdispl, MPI_DOUBLE, uplink->comm); mpiMatrix_swapDataAux(matrix); mpiMatrix_deserialiseAfterReception(matrix); free(SRcounts); free(SRdispl); }
char *
avtSamplePointCommunicator::CommunicateMessages(char **sendmessages,
                                                int   *sendcount,
                                                char **recvmessages,
                                                int   *recvcount)
{
#ifdef PARALLEL
    //
    // Figure out how much each processor needs to send/receive.
    //
    MPI_Alltoall(sendcount, 1, MPI_INT, recvcount, 1, MPI_INT, VISIT_MPI_COMM);

    //
    // Create a buffer we can receive into.
    //
    char *recvConcatList = CreateMessageStrings(recvmessages, recvcount, numProcs);

    //
    // Calculate the displacement lists.
    //
    int *senddisp = new int[numProcs];
    int *recvdisp = new int[numProcs];
    senddisp[0] = 0;
    recvdisp[0] = 0;
    for (int i = 1 ; i < numProcs ; i++)
    {
        senddisp[i] = senddisp[i-1] + sendcount[i-1];
        recvdisp[i] = recvdisp[i-1] + recvcount[i-1];
    }

    //
    // Do the actual transfer of sample points.  The messages arrays are
    // actually indexes into one big array.  Since MPI expects that big
    // array, give that (which is at location 0).
    //
    MPI_Alltoallv(sendmessages[0], sendcount, senddisp, MPI_CHAR,
                  recvmessages[0], recvcount, recvdisp, MPI_CHAR,
                  VISIT_MPI_COMM);

    delete [] senddisp;
    delete [] recvdisp;

    //
    // We need to return this buffer so the calling function can delete it.
    //
    return recvConcatList;
#else
    return 0;
#endif
}
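Several of the snippets above repeat the same three-step recipe: each rank first tells every other rank how much it will send (an MPI_Alltoall on the count arrays), both sides turn their counts into displacements with an exclusive prefix sum, and only then is MPI_Alltoallv issued. The following sketch is illustrative only and not taken from any of the projects quoted here; the function name, the byte-oriented MPI_CHAR payload, and the malloc'd output buffers are assumptions.

/* Illustrative sketch (not from the projects above): exchange per-rank counts,
 * build displacements by exclusive prefix sum, then call MPI_Alltoallv. */
#include <mpi.h>
#include <stdlib.h>

void exchange_variable_blocks(const char *sendbuf, const int *sendcnt, MPI_Comm comm,
                              char **recvbuf_out, int **recvcnt_out)
{
    int nprocs;
    MPI_Comm_size(comm, &nprocs);

    int *recvcnt = malloc(nprocs * sizeof(int));
    int *sdispl  = malloc(nprocs * sizeof(int));
    int *rdispl  = malloc(nprocs * sizeof(int));

    /* Step 1: every rank learns how much it will receive from every other rank. */
    MPI_Alltoall((void *)sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, comm);

    /* Step 2: exclusive prefix sums turn counts into displacements. */
    sdispl[0] = rdispl[0] = 0;
    for (int i = 1; i < nprocs; ++i) {
        sdispl[i] = sdispl[i-1] + sendcnt[i-1];
        rdispl[i] = rdispl[i-1] + recvcnt[i-1];
    }

    /* Total receive volume is the last displacement plus the last count. */
    char *recvbuf = malloc(rdispl[nprocs-1] + recvcnt[nprocs-1]);

    /* Step 3: the variable-size exchange itself. */
    MPI_Alltoallv((void *)sendbuf, (int *)sendcnt, sdispl, MPI_CHAR,
                  recvbuf, recvcnt, rdispl, MPI_CHAR, comm);

    *recvbuf_out = recvbuf;
    *recvcnt_out = recvcnt;
    free(sdispl);
    free(rdispl);
}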
/* communicate integers and doubles according
 * to the pattern computed by COMALL_Pattern */
int COMALL_Repeat (void *pattern)
{
  COMALLPATTERN *pp = pattern;
  COMDATA *cd;
  int i;

  for (i = 0; i < pp->ncpu; i ++) pp->send_position [i] = pp->recv_position [i] = 0;

  /* pack ints */
  for (i = 0, cd = pp->send; i < pp->nsend; i ++, cd ++)
  {
    if (cd->ints)
    {
      MPI_Pack (cd->i, cd->ints, MPI_INT, &pp->send_data [pp->send_disps [cd->rank]],
                pp->send_counts [cd->rank], &pp->send_position [cd->rank], pp->comm);
    }
  }

  /* pack doubles */
  for (i = 0, cd = pp->send; i < pp->nsend; i ++, cd ++)
  {
    if (cd->doubles)
    {
      MPI_Pack (cd->d, cd->doubles, MPI_DOUBLE, &pp->send_data [pp->send_disps [cd->rank]],
                pp->send_counts [cd->rank], &pp->send_position [cd->rank], pp->comm);
    }
  }

#if DEBUG
  for (i = 0; i < pp->ncpu; i ++)
  {
    ASSERT_DEBUG (pp->send_position [i] <= pp->send_counts [i], "Incorrect packing");
  }
#endif

  /* all to all send and receive */
  MPI_Alltoallv (pp->send_data, pp->send_counts, pp->send_disps, MPI_PACKED,
                 pp->recv_data, pp->recv_counts, pp->recv_disps, MPI_PACKED, pp->comm);

  if (pp->recv_size)
  {
    /* unpack data */
    for (i = 0; i < pp->ncpu; i ++)
    {
      MPI_Unpack (&pp->recv_data [pp->recv_disps [i]], pp->recv_counts [i], &pp->recv_position [i],
                  pp->recv [i].i, pp->recv [i].ints, MPI_INT, pp->comm);
      MPI_Unpack (&pp->recv_data [pp->recv_disps [i]], pp->recv_counts [i], &pp->recv_position [i],
                  pp->recv [i].d, pp->recv [i].doubles, MPI_DOUBLE, pp->comm);
    }
  }

  return pp->send_size;
}
int main( int argc, char* argv[] ) { int i, j; int myrank, nprocs; char *sbuf, *rbuf; int *scnt, *rcnt; int *sdpl, *rdpl; int dsize; int ssize, rsize; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &myrank ); MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); MPI_Type_size(DATATYPE, &dsize); scnt = malloc( sizeof(int)*nprocs ); sdpl = malloc( sizeof(int)*nprocs ); rcnt = malloc( sizeof(int)*nprocs ); rdpl = malloc( sizeof(int)*nprocs ); for( i=0; i<nprocs; i++ ) { scnt[i]=SIZE*(i+1)*(myrank+1); rcnt[i]=SIZE*(i+1)*(myrank+1); sdpl[i]=SIZE*((i*(i+1))/2)*(myrank+1); rdpl[i]=SIZE*((i*(i+1))/2)*(myrank+1); } ssize=0; for(i=0; i<nprocs; i++) ssize+=scnt[i]; rsize=0; for(i=0; i<nprocs; i++) rsize+=rcnt[i]; sbuf = (char*) malloc( SIZE*dsize*ssize ); rbuf = (char*) malloc( SIZE*dsize*rsize ); for( i=0; i<REPEAT; i++ ) { MPI_Alltoallv( sbuf, scnt, sdpl, DATATYPE, rbuf, rcnt, rdpl, DATATYPE, MPI_COMM_WORLD ); } fprintf(stdout, "DONE (rank %d)!\n", myrank); MPI_Finalize(); }
/* * Class: mpi_Intracomm * Method: Alltoallv * Signature: (Ljava/lang/Object;II[ILmpi/Datatype;Ljava/lang/Object;I[I[ILmpi/Datatype;)V */ JNIEXPORT void JNICALL Java_mpi_Intracomm_alltoallv(JNIEnv *env, jobject jthis, jobject sendbuf, jint sendoffset, jintArray sendcount, jintArray sdispls, jobject sendtype, jobject recvbuf, jint recvoffset, jintArray recvcount, jintArray rdispls, jobject recvtype) { jint *rcount, *scount, *sdps, *rdps ; jboolean isCopy ; MPI_Comm mpi_comm = (MPI_Comm)((*env)->GetLongField(env,jthis,ompi_java.CommhandleID)) ; MPI_Datatype mpi_stype = (MPI_Datatype) ((*env)->GetLongField(env,sendtype,ompi_java.DatatypehandleID)) ; MPI_Datatype mpi_rtype = (MPI_Datatype) ((*env)->GetLongField(env, recvtype, ompi_java.DatatypehandleID)) ; int sbaseType = (*env)->GetIntField(env, sendtype, ompi_java.DatatypebaseTypeID) ; int rbaseType = (*env)->GetIntField(env, recvtype, ompi_java.DatatypebaseTypeID) ; void *sendptr, *recvptr ; void *sbufbase, *rbufbase ; ompi_java_clearFreeList(env) ; scount=(*env)->GetIntArrayElements(env,sendcount,&isCopy); rcount=(*env)->GetIntArrayElements(env,recvcount,&isCopy); sdps=(*env)->GetIntArrayElements(env,sdispls,&isCopy); rdps=(*env)->GetIntArrayElements(env,rdispls,&isCopy); recvptr = ompi_java_getBufPtr(&rbufbase, env, recvbuf, rbaseType, recvoffset) ; sendptr = ompi_java_getBufPtr(&sbufbase, env, sendbuf, sbaseType, sendoffset) ; MPI_Alltoallv(sendptr, (int*) scount, (int*) sdps, mpi_stype, recvptr, (int*) rcount, (int*) rdps, mpi_rtype, mpi_comm) ; ompi_java_releaseBufPtr(env, sendbuf, sbufbase, sbaseType) ; ompi_java_releaseBufPtr(env, recvbuf, rbufbase, rbaseType) ; (*env)->ReleaseIntArrayElements(env,recvcount,rcount,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,sendcount,scount,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,sdispls,sdps,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,rdispls,rdps,JNI_ABORT); }
int ZMPI_Reduce_scatter_block_intsum_proclists_alltoallv(const int *sendbuf, int nsendprocs, int *sendprocs, int *recvbuf, int recvcount, int nrecvprocs, int *recvprocs, MPI_Comm comm) /* zmpi_func ZMPI_Reduce_scatter_block_intsum_proclists_alltoallv */ { int i, j, size, rank; int *recvbuf_full; int *scounts, *sdispls, *rcounts, *rdispls; MPI_Comm_size(comm, &size); MPI_Comm_rank(comm, &rank); recvbuf_full = z_alloc(nrecvprocs * recvcount, sizeof(int)); scounts = z_alloc(4 * size, sizeof(int)); sdispls = scounts + 1 * size; rcounts = scounts + 2 * size; rdispls = scounts + 3 * size; memset(scounts, 0, 4 * size * sizeof(int)); for (j = 0; j < nrecvprocs; ++j) { rcounts[recvprocs[j]] = recvcount; rdispls[recvprocs[j]] = j * recvcount; } for (j = 0; j < nsendprocs; ++j) { scounts[sendprocs[j]] = recvcount; sdispls[sendprocs[j]] = sendprocs[j] * recvcount; } MPI_Alltoallv((void *) sendbuf, scounts, sdispls, MPI_INT, recvbuf_full, rcounts, rdispls, MPI_INT, comm); for (i = 0; i < recvcount; ++i) recvbuf[i] = DEFAULT_INT; for (j = 0; j < nrecvprocs; ++j) for (i = 0; i < recvcount; ++i) recvbuf[i] += recvbuf_full[j * recvcount + i]; z_free(scounts); z_free(recvbuf_full); return MPI_SUCCESS; }
void transpose (Real **b, int size, int *len, int *disp, int rank, int m){ int i, *sendcounts, *rdispls; Real *sendbuf, *recvbuf; sendbuf = createRealArray (m * len[rank]); recvbuf = createRealArray (m * len[rank]); sendcounts = calloc(size,sizeof(int)); rdispls = calloc(size,sizeof(int)); matrixToVector(b,sendbuf,len,disp, size, rank); int index = 0; for (int i = 0; i < size; ++i) { sendcounts[i]= len[rank]*len[i]; rdispls[i]=index; index=index+sendcounts[i]; } MPI_Alltoallv(sendbuf, sendcounts, rdispls, MPI_DOUBLE, recvbuf, sendcounts, rdispls, MPI_DOUBLE, MPI_COMM_WORLD); vectorToMatrix(b,recvbuf,len,disp, size, rank); }
void ompi_alltoallv_f(char *sendbuf, MPI_Fint *sendcounts, MPI_Fint *sdispls, MPI_Fint *sendtype, char *recvbuf, MPI_Fint *recvcounts, MPI_Fint *rdispls, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *ierr) { MPI_Comm c_comm; MPI_Datatype c_sendtype, c_recvtype; int size, c_ierr; OMPI_ARRAY_NAME_DECL(sendcounts); OMPI_ARRAY_NAME_DECL(sdispls); OMPI_ARRAY_NAME_DECL(recvcounts); OMPI_ARRAY_NAME_DECL(rdispls); c_comm = MPI_Comm_f2c(*comm); c_sendtype = MPI_Type_f2c(*sendtype); c_recvtype = MPI_Type_f2c(*recvtype); MPI_Comm_size(c_comm, &size); OMPI_ARRAY_FINT_2_INT(sendcounts, size); OMPI_ARRAY_FINT_2_INT(sdispls, size); OMPI_ARRAY_FINT_2_INT(recvcounts, size); OMPI_ARRAY_FINT_2_INT(rdispls, size); sendbuf = (char *) OMPI_F2C_IN_PLACE(sendbuf); sendbuf = (char *) OMPI_F2C_BOTTOM(sendbuf); recvbuf = (char *) OMPI_F2C_BOTTOM(recvbuf); c_ierr = MPI_Alltoallv(sendbuf, OMPI_ARRAY_NAME_CONVERT(sendcounts), OMPI_ARRAY_NAME_CONVERT(sdispls), c_sendtype, recvbuf, OMPI_ARRAY_NAME_CONVERT(recvcounts), OMPI_ARRAY_NAME_CONVERT(rdispls), c_recvtype, c_comm); if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr); OMPI_ARRAY_FINT_2_INT_CLEANUP(sendcounts); OMPI_ARRAY_FINT_2_INT_CLEANUP(sdispls); OMPI_ARRAY_FINT_2_INT_CLEANUP(recvcounts); OMPI_ARRAY_FINT_2_INT_CLEANUP(rdispls); }
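The Fortran shims above go out of their way to translate the Fortran MPI_IN_PLACE sentinel before calling the C routine. On the C side, newer MPI standards let the caller pass MPI_IN_PLACE as sendbuf directly; the send counts, displacements, and type are then ignored, and the blocks described by the receive arguments are both the data sent and the place where received data lands (so the exchange must be pairwise symmetric). A minimal hedged sketch, with an assumed function name and MPI_DOUBLE payload:

/* Illustrative sketch: in-place MPI_Alltoallv. The send arguments are ignored
 * when sendbuf is MPI_IN_PLACE; counts/displs describe recvbuf, which supplies
 * the outgoing blocks and receives the incoming ones. */
#include <mpi.h>

void inplace_exchange(double *buf, const int *counts, const int *displs, MPI_Comm comm)
{
    MPI_Alltoallv(MPI_IN_PLACE, NULL, NULL, MPI_DATATYPE_NULL,
                  buf, (int *)counts, (int *)displs, MPI_DOUBLE, comm);
}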
void transpose(real **B, size_t block_col, size_t m, size_t nprocs,
               size_t block_uk, size_t rem_uk, size_t rank)
{
    // Create send and recv Vectors:
    real *sendV = mk_1D_array(block_col*m, false);
    real *recvV = mk_1D_array(block_col*m, false);

    // Create send and recv Count and Displacement for MPI:
    int scount[nprocs], sdisp[nprocs], rcount[nprocs], rdisp[nprocs];
    for (size_t i = 0; i < nprocs; i++){
        scount[i] = block_uk;
        sdisp[i] = block_uk*i;
        rcount[i] = block_uk;
        rdisp[i] = block_uk*i;
    }
    scount[nprocs-1] = rem_uk;
    rcount[nprocs-1] = rem_uk;

    // Wrap Data into the 1D Array sendV:
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++){
        for (size_t j = 0; j < block_col; j++){
            sendV[j + i*block_col] = B[j][i];
        }
    }

    // Communicate with all processes:
    MPI_Alltoallv(sendV, scount, sdisp, MPI_DOUBLE,
                  recvV, rcount, rdisp, MPI_DOUBLE, MPI_COMM_WORLD);

    // Unwrap the Data into the 2D array B, from recvV:
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nprocs; i++){
        int offset = rdisp[i], count = (rcount[i])/block_col;
        // #pragma omp parallel for schedule(static)
        for (size_t k = 0; k < block_col; k++){
            for (size_t j = 0; j < count; j++){
                B[k][j+(m/nprocs)*i] = recvV[offset + k*count + j];
            }
        }
    }
}
static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph) { dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp, minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm); DEALLOC(*ph); ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal); for (int task=0; task<minfo->ntasks; ++task) for (int th=0; th<minfo->npair[minfo->mytask]; ++th) for (int mi=0; mi<minfo->nm[task]; ++mi) { int m = minfo->mval[mi+minfo->ofs_m[task]]; int o1 = minfo->nph*(th*(minfo->mmax+1) + m); int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); for (int i=0; i<minfo->nph; ++i) (*ph)[o1+i] = phas_tmp[o2+i]; } DEALLOC(phas_tmp); }
void MPItranspose(double **b, double **bt, int nrColon, int m, double *sendbuf, double *recbuf,
                  int *sendcnt, int *sdispls, int size, int rank, int *displs)
{
    int tt = 0;                                   // counter
    for (int o = 0; o < size; o++) {              // loop over each process
        for (int i = 0; i < nrColon; i++) {
            // walk over what is to be sent to the process with rank o
            for (int j = displs[o]; j < displs[o+1]; j++) {
                sendbuf[tt] = b[i][j];            // fill sendbuf
                tt++;
            }
        }
    }

    // Exchange with all processes.
    MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPI_DOUBLE,
                  recbuf, sendcnt, sdispls, MPI_DOUBLE, MPI_COMM_WORLD);

    tt = 0;
    for (int o = 0; o < size; o++) {              // loop over each process
        // take the displacement first so the contents themselves get transposed as well
        for (int j = displs[o]; j < displs[o+1]; j++) {
            for (int i = 0; i < nrColon; i++) {
                bt[i][j] = recbuf[tt];            // write into bt
                tt++;
            }
        }
    }
}
static int kmr_alltoallv_mpi(KMR *mr, void *sbuf, long *scnts, long *sdsps, void *rbuf, long *rcnts, long *rdsps) { MPI_Comm comm = mr->comm; int nprocs = mr->nprocs; int *ssz = kmr_malloc(sizeof(int) * (size_t)nprocs); int *sdp = kmr_malloc(sizeof(int) * (size_t)nprocs); int *rsz = kmr_malloc(sizeof(int) * (size_t)nprocs); int *rdp = kmr_malloc(sizeof(int) * (size_t)nprocs); for (int r = 0; r < nprocs; r++) { assert(INT_MIN * 8L <= scnts[r] && scnts[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= rcnts[r] && rcnts[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= sdsps[r] && sdsps[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= rdsps[r] && rdsps[r] <= INT_MAX * 8L); assert(((scnts[r] & 7) == 0) && ((rcnts[r] & 7) == 0) && ((sdsps[r] & 7) == 0) && ((rdsps[r] & 7) == 0)); ssz[r] = (int)(scnts[r] / 8L); rsz[r] = (int)(rcnts[r] / 8L); sdp[r] = (int)(sdsps[r] / 8L); rdp[r] = (int)(rdsps[r] / 8L); } int cc; cc = MPI_Alltoallv(sbuf, ssz, sdp, MPI_LONG, rbuf, rsz, rdp, MPI_LONG, comm); assert(cc == MPI_SUCCESS); kmr_free(ssz, (sizeof(int) * (size_t)nprocs)); kmr_free(rsz, (sizeof(int) * (size_t)nprocs)); kmr_free(sdp, (sizeof(int) * (size_t)nprocs)); kmr_free(rdp, (sizeof(int) * (size_t)nprocs)); return MPI_SUCCESS; }
void transpose (Real **A, int m, int n, int size, int bb, int bre) { int se[size], sd[size], re[size], rd[size]; Real *V = createRealArray (n*m); Real *Vt = createRealArray (n*m); for (int i = 0; i < size; ++i) { se[i] = bb; sd[i] = bb*i; re[i] = bb; rd[i] = bb*i; } se[size-1] = bre; re[size-1] = bre; for(int i = 0; i < n; i++) { for(int j = 0; j < m; j++) { V[j + i*m] = A[j][i]; } } MPI_Alltoallv(V, se, sd, MPI_DOUBLE, Vt, re, rd, MPI_DOUBLE, MPI_COMM_WORLD); fillA(A, Vt, re, rd, m, n, size); }
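In the transpose kernels above (and in several of the Poisson solvers earlier in this list), every pair of ranks trades blocks of identical size, so a single count array and a single displacement array can serve both the send and the receive side of MPI_Alltoallv. Below is a minimal sketch of that symmetric pattern; the function name and contiguous block layout are assumptions, not code from the quoted solvers.

/* Illustrative sketch (assumed layout, not from the solvers above): when every
 * pair of ranks exchanges equally sized blocks, the same counts/displs arrays
 * can be passed for both sides of MPI_Alltoallv. */
#include <mpi.h>
#include <stdlib.h>

void symmetric_block_exchange(const double *sendbuf, double *recvbuf,
                              int block, MPI_Comm comm)
{
    int nprocs;
    MPI_Comm_size(comm, &nprocs);

    int *cnt  = malloc(nprocs * sizeof(int));
    int *disp = malloc(nprocs * sizeof(int));
    for (int i = 0; i < nprocs; ++i) {
        cnt[i]  = block;          /* every pair exchanges `block` doubles */
        disp[i] = i * block;      /* blocks are stored back to back       */
    }

    /* One array pair describes both directions of the exchange. */
    MPI_Alltoallv((void *)sendbuf, cnt, disp, MPI_DOUBLE,
                  recvbuf, cnt, disp, MPI_DOUBLE, comm);

    free(cnt);
    free(disp);
}

With fully equal blocks this degenerates to plain MPI_Alltoall, which is exactly the fast path the FFTW-style transposes above take when their blocks are equal.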
/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. */ int pzPermute_Dense_Matrix ( int_t fst_row, int_t m_loc, int_t row_to_proc[], int_t perm[], doublecomplex X[], int ldx, doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid ) { int_t i, j, k, l; int p, procs; int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()"); #endif procs = grid->nprow * grid->npcol; if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) ABORT("Malloc fails for sendcnts[]."); sendcnts_nrhs = sendcnts + procs; recvcnts = sendcnts_nrhs + procs; recvcnts_nrhs = recvcnts + procs; sdispls = recvcnts_nrhs + procs; sdispls_nrhs = sdispls + procs; rdispls = sdispls_nrhs + procs; rdispls_nrhs = rdispls + procs; ptr_to_ibuf = rdispls_nrhs + procs; ptr_to_dbuf = ptr_to_ibuf + procs; for (i = 0; i < procs; ++i) sendcnts[i] = 0; /* Count the number of X entries to be sent to each process.*/ for (i = fst_row; i < fst_row + m_loc; ++i) { p = row_to_proc[perm[i]]; ++sendcnts[p]; } MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; sendcnts_nrhs[0] = sendcnts[0] * nrhs; recvcnts_nrhs[0] = recvcnts[0] * nrhs; for (i = 1; i < procs; ++i) { sdispls[i] = sdispls[i-1] + sendcnts[i-1]; sdispls_nrhs[i] = sdispls[i] * nrhs; rdispls[i] = rdispls[i-1] + recvcnts[i-1]; rdispls_nrhs[i] = rdispls[i] * nrhs; sendcnts_nrhs[i] = sendcnts[i] * nrhs; recvcnts_nrhs[i] = recvcnts[i] * nrhs; } k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ /*assert(k == m_loc);*/ /*assert(l == m_loc);*/ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (i = 0; i < procs; ++i) { ptr_to_ibuf[i] = sdispls[i]; ptr_to_dbuf[i] = sdispls_nrhs[i]; } /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ for (i = fst_row; i < fst_row + m_loc; ++i) { j = perm[i]; p = row_to_proc[j]; send_ibuf[ptr_to_ibuf[p]] = j; j = ptr_to_dbuf[p]; RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ send_dbuf[j++] = X[i-fst_row + k*ldx]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Transfer the (permuted) row indices and numerical values. */ MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* Copy the buffer into b. */ for (i = 0, l = 0; i < m_loc; ++i) { j = recv_ibuf[i] - fst_row; /* Relative row number */ RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ B[j + k*ldb] = recv_dbuf[l++]; } } SUPERLU_FREE(sendcnts); SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()"); #endif return 0; } /* pzPermute_Dense_Matrix */
/*! \brief Gather A from the distributed compressed row format to global A in compressed column format. */ int pzCompRow_loc_to_CompCol_global ( int_t need_value, /* Input. Whether need to gather numerical values */ SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ gridinfo_t *grid, /* Input */ SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; NCformat *GAstore; doublecomplex *a, *a_loc; int_t *colind, *rowptr; int_t *colptr_loc, *rowind_loc; int_t m_loc, n, i, j, k, l; int_t colnnz, fst_row, nnz_loc, nnz; doublecomplex *a_recv; /* Buffer to receive the blocks of values. */ doublecomplex *a_buf; /* Buffer to merge blocks into block columns. */ int_t *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the local block rows. Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after redistribution to the local block columns. Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ int_t *fst_rows, *n_locs; int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; int it, n_loc, procs; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global"); #endif /* Initialization. */ n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ /* ------------------------------------------------------------ FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. ------------------------------------------------------------*/ zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, &rowind_loc, &colptr_loc); /* Change local row index numbers to global numbers. */ for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); PrintInt10("colptr_loc", n+1, colptr_loc); #endif procs = grid->nprow * grid->npcol; if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, grid->comm); for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; n_locs[procs-1] = n - fst_rows[procs-1]; if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; itemp_32 = sdispls + procs; /* All-to-all transfer column pointers of each block. Now the matrix view is P-by-P block-partition. 
*/ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) ABORT("Malloc fails for colptr_send[]"); if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) ABORT("Malloc fails for colptr_blk[]"); for (i = 0, j = 0; i < procs; ++i) { for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) assert(j == fst_rows[i]); #endif sdispls[i] = j + i; recvcnts[i] = n_loc + 1; rdispls[i] = i * (n_loc + 1); j += n_locs[i]; /* First column of next block in colptr_loc[] */ } MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. */ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; for (i = 0; i < procs; ++i) { for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { colnnz = colptr_blk[j+1] - colptr_blk[j]; /*assert(k<=j);*/ colptr_blk[k] = nnz; nnz += colnnz; /* Start of the next column */ ++k; } colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; for (i = 0; i < procs-1; ++i) { sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; sdispls[i+1] = sdispls[i] + sendcnts[i]; } sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; for (i = 0; i < procs; ++i) { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); a_buf = a_recv + k; MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX, a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; for (j = 0; j < n_loc; ++j) { colnnz = 0; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1) + j; /* j-th column in i-th block */ colnnz += colptr_blk[k+1] - colptr_blk[k]; } colptr_loc[j+1] = colptr_loc[j] + colnnz; itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. */ for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { rowind_buf[itemp[j]] = rowind_recv[l]; ++itemp[j]; } } } if ( need_value ) { for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { a_buf[itemp[j]] = a_recv[l]; ++itemp[j]; } } } } /* ------------------------------------------------------------ SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. 
------------------------------------------------------------*/ GA->nrow = A->nrow; GA->ncol = A->ncol; GA->Stype = SLU_NC; GA->Dtype = A->Dtype; GA->Mtype = A->Mtype; GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); /* First gather the size of each piece. */ nnz_loc = colptr_loc[n_loc]; MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; GAstore->nnz = nnz; if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } itemp_32[procs-1] = itemp[procs-1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, itemp_32, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } itemp_32[procs-1] = n_locs[procs-1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. */ for (i = 1; i < procs; ++i) { k = rdispls[i]; for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; itemp[i] += itemp[i-1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("After pdCompRow_loc_to_CompCol_global()\n"); zPrint_CompCol_Matrix_dist(GA); } #endif SUPERLU_FREE(a_loc); SUPERLU_FREE(rowind_loc); SUPERLU_FREE(colptr_loc); SUPERLU_FREE(fst_rows); SUPERLU_FREE(recvcnts); SUPERLU_FREE(colptr_send); SUPERLU_FREE(colptr_blk); SUPERLU_FREE(rowind_recv); if ( need_value) SUPERLU_FREE(a_recv); #if ( DEBUGlevel>=1 ) if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global"); #endif return 0; } /* pzCompRow_loc_to_CompCol_global */
/* For MPI distributed memory. */ void scramble_edges_mpi(MPI_Comm comm, const uint64_t userseed1, const uint64_t userseed2, const int64_t local_nedges_in, const int64_t* const local_edges_in, int64_t* const local_nedges_out_ptr, int64_t** const local_edges_out_ptr /* Allocated using xmalloc() by scramble_edges_mpi */) { int rank, size; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); mrg_state st; uint_fast32_t seed[5]; make_mrg_seed(userseed1, userseed2, seed); mrg_seed(&st, seed); mrg_skip(&st, 5, 0, 0); /* To make offset different from other PRNG uses */ int64_t total_nedges; MPI_Allreduce((void*)&local_nedges_in, &total_nedges, 1, INT64_T_MPI_TYPE, MPI_SUM, comm); int64_t local_nedges_out; /* = local permutation size */ int64_t* local_perm; rand_sort_mpi(comm, &st, total_nedges, &local_nedges_out, &local_perm); *local_nedges_out_ptr = local_nedges_out; /* Gather permutation information and fast owner lookup cache (code in * apply_permutation_mpi.c). */ int64_t* edge_displs = (int64_t*)xmalloc((size + 1) * sizeof(int64_t)); int* edge_owner_table; int64_t* edge_owner_cutoff; int lg_minedgecount; int64_t maxedgecount; gather_block_distribution_info(comm, local_nedges_in, total_nedges, edge_displs, &edge_owner_table, &edge_owner_cutoff, &lg_minedgecount, &maxedgecount); /* Originally from apply_permutation_mpi.c */ #define LOOKUP_EDGE_OWNER(v) \ (edge_owner_table[(v) >> lg_minedgecount] + \ ((v) >= edge_owner_cutoff[(v) >> lg_minedgecount])) /* Apply permutation. Output distribution is same as distribution of * generated edge permutation. */ /* Count number of requests to send to each destination. */ int* send_counts = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */ int64_t i; for (i = 0; i < local_nedges_out; ++i) { ++send_counts[LOOKUP_EDGE_OWNER(local_perm[i])]; } /* Prefix sum to get displacements. */ int* send_displs = (int*)xmalloc((size + 1) * sizeof(int)); send_displs[0] = 0; for (i = 0; i < size; ++i) { send_displs[i + 1] = send_displs[i] + send_counts[i]; } assert (send_displs[size] == local_nedges_out); /* Put edges into buffer by destination; also keep around index values for * where to write the result. */ int64_t* sendbuf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t)); int64_t* reply_loc_buf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t)); int* send_offsets = (int*)xmalloc((size + 1) * sizeof(int)); memcpy(send_offsets, send_displs, (size + 1) * sizeof(int)); for (i = 0; i < local_nedges_out; ++i) { int write_index = send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])]; sendbuf[write_index] = local_perm[i]; reply_loc_buf[write_index] = i; ++send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])]; } for (i = 0; i < size; ++i) assert (send_offsets[i] == send_displs[i + 1]); free(send_offsets); send_offsets = NULL; free(local_perm); local_perm = NULL; #undef LOOKUP_EDGE_OWNER free(edge_owner_table); edge_owner_table = NULL; free(edge_owner_cutoff); edge_owner_cutoff = NULL; /* Find out how many requests I will be receiving. */ int* recv_counts = (int*)xmalloc(size * sizeof(int)); MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm); /* Compute their displacements. */ int* recv_displs = (int*)xmalloc((size + 1) * sizeof(int)); recv_displs[0] = 0; for (i = 0; i < size; ++i) { recv_displs[i + 1] = recv_displs[i] + recv_counts[i]; } /* Make receive and reply buffers. 
*/ int64_t* recvbuf = (int64_t*)xmalloc(recv_displs[size] * sizeof(int64_t)); int64_t* replybuf = (int64_t*)xmalloc(recv_displs[size] * 2 * sizeof(int64_t)); /* Move requests for edges into receive buffer. */ MPI_Alltoallv(sendbuf, send_counts, send_displs, INT64_T_MPI_TYPE, recvbuf, recv_counts, recv_displs, INT64_T_MPI_TYPE, comm); free(sendbuf); sendbuf = NULL; /* Put requested edges into response buffer. */ int64_t my_edge_offset = edge_displs[rank]; for (i = 0; i < recv_displs[size]; ++i) { replybuf[i * 2 + 0] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 0]; replybuf[i * 2 + 1] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 1]; } free(recvbuf); recvbuf = NULL; free(edge_displs); edge_displs = NULL; /* Send replies back. */ int64_t* reply_edges = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t)); for (i = 0; i < size; ++i) { /* Sending back two values for each request */ recv_counts[i] *= 2; recv_displs[i] *= 2; send_counts[i] *= 2; send_displs[i] *= 2; } MPI_Alltoallv(replybuf, recv_counts, recv_displs, INT64_T_MPI_TYPE, reply_edges, send_counts, send_displs, INT64_T_MPI_TYPE, comm); free(replybuf); replybuf = NULL; free(recv_counts); recv_counts = NULL; free(recv_displs); recv_displs = NULL; free(send_counts); send_counts = NULL; free(send_displs); send_displs = NULL; /* Make output array of edges. */ int64_t* local_edges_out = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t)); *local_edges_out_ptr = local_edges_out; /* Put edges into output array. */ for (i = 0; i < local_nedges_out; ++i) { local_edges_out[reply_loc_buf[i] * 2 + 0] = reply_edges[2 * i + 0]; local_edges_out[reply_loc_buf[i] * 2 + 1] = reply_edges[2 * i + 1]; } free(reply_loc_buf); reply_loc_buf = NULL; free(reply_edges); reply_edges = NULL; }