void sg_route(struct Domain *theDomain, struct poisson *thePoisson){
   // struct Tridiagsys* thetridiag=alloc_tridiagsys(thePoisson);
   tridiag_setup(thePoisson);
   int N_r = thePoisson->N_r;
   int N_z = thePoisson->N_z;
   int N_p = thePoisson->N_p;
   int N_k = thePoisson->N_k;
   int i, j, k;
   int size = thePoisson->size;
   int rank = theDomain->rank;
   cylinder_interp(theDomain, thePoisson);
   set_bndry(theDomain, thePoisson);
   density_fft(thePoisson);
   mpi_arrange(thePoisson->density, thePoisson->buffer, thePoisson);
   double *buffersend  = thePoisson->buffer;
   double *bufferstore = thePoisson->density;
   int *sendcnts = thePoisson->sendcnts;
   int *sdispls  = thePoisson->sdispls;
   int *recvcnts = thePoisson->recvcnts;
   int *rdispls  = thePoisson->rdispls;
   MPI_Alltoallv(buffersend, sendcnts, sdispls, MPI_DOUBLE,
                 bufferstore, recvcnts, rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
   sinefft(thePoisson);
   solveVp(rank, thePoisson);
   sinefft(thePoisson);
   /*
   if(rank==0){
      int i,j,k;
      i=0; j=0; k=0;
      FILE *f;
      f=fopen("poten.dat","w");
      for(i=0;i<thePoisson->N_r_glob;i++)
         for(k=0;k<thePoisson->N_z_glob;k++)
            fprintf(f,"%f %d %f\n",i+0.5,k,thePoisson->density[in_lookup(thePoisson,k,i,0)]);
      fclose(f);
   }
   */
   buffersend  = thePoisson->density;
   bufferstore = thePoisson->buffer;
   // This is the inverse MPI communication, so sendcnts and recvcnts are swapped.
   MPI_Alltoallv(buffersend, recvcnts, rdispls, MPI_DOUBLE,
                 bufferstore, sendcnts, sdispls, MPI_DOUBLE, MPI_COMM_WORLD);
   inverse_mpi_arrange(thePoisson->buffer, thePoisson->density, thePoisson);
   inverse_fft(thePoisson);
   int direction = 0;
   for(direction = 0; direction < 3; direction++){
      cal_force(thePoisson, direction);
      disco_force_interp(theDomain, thePoisson, direction);
   }
   // disco_interp(theSim,theCells,thePoisson);
   // destroy_tridiagsys(thetridiag);
}
static void apply(const plan *ego_, R *I, R *O) { const P *ego = (const P *) ego_; plan_rdft *cld1, *cld2, *cld2rest, *cld3; /* transpose locally to get contiguous chunks */ cld1 = (plan_rdft *) ego->cld1; if (cld1) { cld1->apply(ego->cld1, I, O); /* transpose chunks globally */ if (ego->equal_blocks) MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE, I, ego->recv_block_sizes[0], FFTW_MPI_TYPE, ego->comm); else MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets, FFTW_MPI_TYPE, I, ego->recv_block_sizes, ego->recv_block_offsets, FFTW_MPI_TYPE, ego->comm); } else { /* TRANSPOSED_IN, no need to destroy input */ /* transpose chunks globally */ if (ego->equal_blocks) MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE, O, ego->recv_block_sizes[0], FFTW_MPI_TYPE, ego->comm); else MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets, FFTW_MPI_TYPE, O, ego->recv_block_sizes, ego->recv_block_offsets, FFTW_MPI_TYPE, ego->comm); I = O; /* final transpose (if any) is in-place */ } /* transpose locally, again, to get ordinary row-major */ cld2 = (plan_rdft *) ego->cld2; if (cld2) { cld2->apply(ego->cld2, I, O); cld2rest = (plan_rdft *) ego->cld2rest; if (cld2rest) { /* leftover from unequal block sizes */ cld2rest->apply(ego->cld2rest, I + ego->rest_Ioff, O + ego->rest_Ooff); cld3 = (plan_rdft *) ego->cld3; if (cld3) cld3->apply(ego->cld3, O, O); /* else TRANSPOSED_OUT is true and user wants O transposed */ } } }
double time_alltoallv(struct collParams* p) { int i, j, size2; int disp = 0; for ( i = 0; i < p->nranks; i++) { int size2 = (i+p->myrank) % (p->size+1); sendcounts[i] = size2; recvcounts[i] = size2; sdispls[i] = disp; rdispls[i] = disp; disp += size2; } MPI_Barrier(MPI_COMM_WORLD); size2 = p->myrank % (p->size+1); __TIME_START__; for (i = 0; i < p->iter; i++) { MPI_Alltoallv(sbuffer, sendcounts, sdispls, p->type, rbuffer, recvcounts, rdispls, p->type, p->comm); __BAR__(p->comm); } __TIME_END__; if (check_buffers) { check_sbuffer(p->myrank); for (i = 0; i < p->nranks; i++) { disp = 0; for (j = 0; j < p->myrank; j++) { disp += (j+i) % (p->size+1); } check_rbuffer(rbuffer, rdispls[i], i, disp, recvcounts[i]); } } return __TIME_USECS__ / (double)p->iter; }
void remap(){ MPI_Allgather(num,mask,MPI_INT,cnt,mask,MPI_INT,MPI_COMM_WORLD); int arrStart = 0; int arrEnd = 0; int allStart = 0; for (int i=0;i<mask;++i){ spf[0] = allStart; for (int j=0;j<size;++j){ spf[j+1] = spf[j]+cnt[j*mask+i]; } for (int j=0;j<size;++j){ if (spf[rank]>j*len+len-1 || spf[rank+1]-1<j*len){ sdispls[j] = arrStart; sendcounts[j] = 0; } else { sdispls[j] = arrStart; sendcounts[j] = std::min(spf[rank+1],j*len+len)-std::max(j*len,spf[rank]); arrStart += sendcounts[j]; } if (spf[j]>rank*len+len-1 || spf[j+1]-1<rank*len){ rdispls[j] = arrEnd; recvcounts[j] = 0; } else { rdispls[j] = arrEnd; recvcounts[j] = std::min(spf[j+1],rank*len+len)-std::max(rank*len,spf[j]); arrEnd += recvcounts[j]; } } MPI_Alltoallv(tmpData,sendcounts,sdispls,MPI_DT_,data,recvcounts,rdispls,MPI_DT_,MPI_COMM_WORLD); allStart = spf[size]; } }
int ZMPI_Alltoall_int_proclists_alltoallv(int *sendbuf, int nsprocs, int *sprocs, int *recvbuf, int nrprocs, int *rprocs, MPI_Comm comm) /* zmpi_func ZMPI_Alltoall_int_proclists_alltoallv */ { int i, size; int *scounts2, *sdispls2, *rcounts2, *rdispls2; MPI_Comm_size(comm, &size); scounts2 = z_alloc(4 * size, sizeof(int)); sdispls2 = scounts2 + 1 * size; rcounts2 = scounts2 + 2 * size; rdispls2 = scounts2 + 3 * size; for (i = 0; i < size; ++i) { scounts2[i] = rcounts2[i] = DEFAULT_INT; sdispls2[i] = rdispls2[i] = i; recvbuf[i] = 0; } for (i = 0; i < nsprocs; ++i) scounts2[sprocs[i]] = 1; for (i = 0; i < nrprocs; ++i) rcounts2[rprocs[i]] = 1; MPI_Alltoallv(sendbuf, scounts2, sdispls2, MPI_INT, recvbuf, rcounts2, rdispls2, MPI_INT, comm); z_free(scounts2); return MPI_SUCCESS; }
/* Out-of-place version of transpose_mpi (or rather, in place using a scratch array): */ static void transpose_mpi_out_of_place(transpose_mpi_plan p, int el_size, TRANSPOSE_EL_TYPE *local_data, TRANSPOSE_EL_TYPE *work) { local_transpose_copy(local_data, work, el_size, p->local_nx, p->ny); if (p->all_blocks_equal) MPI_Alltoall(work, p->send_block_size * el_size, p->el_type, local_data, p->recv_block_size * el_size, p->el_type, p->comm); else { int i, n_pes = p->n_pes; for (i = 0; i < n_pes; ++i) { p->send_block_sizes[i] *= el_size; p->recv_block_sizes[i] *= el_size; p->send_block_offsets[i] *= el_size; p->recv_block_offsets[i] *= el_size; } MPI_Alltoallv(work, p->send_block_sizes, p->send_block_offsets, p->el_type, local_data, p->recv_block_sizes, p->recv_block_offsets, p->el_type, p->comm); for (i = 0; i < n_pes; ++i) { p->send_block_sizes[i] /= el_size; p->recv_block_sizes[i] /= el_size; p->send_block_offsets[i] /= el_size; p->recv_block_offsets[i] /= el_size; } } do_permutation(local_data, p->perm_block_dest, p->num_perm_blocks, p->perm_block_size * el_size); }
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
void Image_Exchanger::exchange_fragment_images(unsigned int* databuf,
                                               int nviewer,
                                               ImageFragment_Tile* ift)
{
//  fprintf(stderr, "**** %s:%s() ****\n", __FILE__, __func__);
#ifdef _DEBUG7
    fprintf(stderr, "**** %s:%s() ****\n", __FILE__, __func__);
#endif

    unsigned int* sendbuf = databuf + m_sbuf_offset;
    unsigned int* recvbuf = databuf + m_rbuf_offset;

    if(nviewer == 1)
    {
        MPI_Gatherv((int*)sendbuf, m_scounts[0], MPI_INT,
                    (int*)recvbuf, m_rcounts, m_rdispls, MPI_INT,
                    0, MPI_COMM_WORLD);
    }
    else
    {
        MPI_Alltoallv((int*)sendbuf, m_scounts, m_sdispls, MPI_INT,
                      (int*)recvbuf, m_rcounts, m_rdispls, MPI_INT,
                      MPI_COMM_WORLD);
    }

    ift->address_fragments(m_rbuf_offset, m_rdispls);
}
void mpi_alltoallv(void *sendbuf, int *sendcount, int *sdispls, int *sendtype, void *recvbuf, int *recvcount, int *rdispls, int *recvtype, int *comm, int *ierr) { *ierr = MPI_Alltoallv(sendbuf, sendcount, sdispls, *sendtype, recvbuf, recvcount, rdispls, *recvtype, *comm); return; }
FORT_DLL_SPEC void FORT_CALL mpi_alltoallv_ ( void*v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4,
                                              void*v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8,
                                              MPI_Fint *v9, MPI_Fint *ierr ){
#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif
    if (v1 == MPIR_F_MPI_IN_PLACE) v1 = MPI_IN_PLACE;
    *ierr = MPI_Alltoallv( v1, v2, v3, (MPI_Datatype)(*v4),
                           v5, v6, v7, (MPI_Datatype)(*v8), (MPI_Comm)(*v9) );
}
void mpiColumnMatrixTranspose(ColumnMatrix recvData, ColumnMatrix recvBuffer,
                              ColumnMatrix sendData, ColumnMatrix sendBuffer)
{
  packTansposeData(sendData, sendBuffer);
  // All processors send and receive the same amount of doubles with the same processor.
  MPI_Alltoallv(sendBuffer->data, sendBuffer->blockSize, sendBuffer->displacement, MPI_DOUBLE,
                recvBuffer->data, sendBuffer->blockSize, sendBuffer->displacement, MPI_DOUBLE,
                *sendData->comm);
  unpackTansposeData(recvData, recvBuffer);
}
int binGraph::exchange_edges(uint64_t m_read, uint64_t* read_edges, int32_t* ranks,etype t) { int32_t* scounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* rcounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* sdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* sdispls_cpy = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); int32_t* rdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t)); for (int i = 0; i < PCU_Comm_Peers(); ++i) { scounts[i] = 0; rcounts[i] = 0; sdispls[i] = 0; sdispls_cpy[i] = 0; rdispls[i] = 0; } uint64_t n_per_rank = num_global_verts / PCU_Comm_Peers() + 1; for (uint64_t i = 0; i < m_read*2; i+=2) { uint64_t vert = read_edges[i]; int vert_task = ranks[vert]; scounts[vert_task] += 2; } MPI_Alltoall(scounts, 1, MPI_INT32_T, rcounts, 1, MPI_INT32_T, PCU_Get_Comm()); for (uint64_t i = 1; i < PCU_Comm_Peers(); ++i) { sdispls[i] = sdispls[i-1] + scounts[i-1]; sdispls_cpy[i] = sdispls[i]; rdispls[i] = rdispls[i-1] + rcounts[i-1]; } int32_t total_send = sdispls[PCU_Comm_Peers()-1] + scounts[PCU_Comm_Peers()-1]; int32_t total_recv = rdispls[PCU_Comm_Peers()-1] + rcounts[PCU_Comm_Peers()-1]; uint64_t* sendbuf = (uint64_t*)malloc(total_send*sizeof(uint64_t)); edge_list[t] = (uint64_t*)malloc(total_recv*sizeof(uint64_t)); num_local_edges[t] = total_recv / 2; for (uint64_t i = 0; i < m_read*2; i+=2) { uint64_t vert1 = read_edges[i]; uint64_t vert2 = read_edges[i+1]; int vert_task = ranks[vert1]; sendbuf[sdispls_cpy[vert_task]++] = vert1; sendbuf[sdispls_cpy[vert_task]++] = vert2; } MPI_Alltoallv(sendbuf, scounts, sdispls, MPI_UINT64_T, edge_list[t], rcounts, rdispls, MPI_UINT64_T, PCU_Get_Comm()); free(sendbuf); return 0; }
void mpi_alltoallv (void *sendbuf, MPI_Fint *sendcnts, MPI_Fint *sdispls, MPI_Fint *sendtype, void *recvbuf, MPI_Fint *recvcnts, MPI_Fint *rdispls, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *__ierr) { *__ierr = MPI_Alltoallv (sendbuf, sendcnts, sdispls, MPI_Type_f2c (*sendtype), recvbuf, recvcnts, rdispls, MPI_Type_f2c (*recvtype), MPI_Comm_f2c (*comm)); }
void dummy_operations::run_collective_dummy_operations() {
    int rank, size;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank);
    MPI_Comm_size( MPI_COMM_WORLD, &size);

    // Run Broadcast
    {
        int x;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        MPI_Bcast(&x, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    // Run Allgather.
    {
        int x, size;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        MPI_Comm_size( MPI_COMM_WORLD, &size);
        std::vector<int> rcv(size);
        MPI_Allgather(&x, 1, MPI_INT, &rcv[0], 1, MPI_INT, MPI_COMM_WORLD);
    }

    // Run Allreduce.
    {
        int x;
        MPI_Comm_rank( MPI_COMM_WORLD, &x);
        int y = 0;
        MPI_Allreduce(&x, &y, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }

    // Dummy Prefix Sum
    {
        int x = 1;
        int y = 0;
        MPI_Scan(&x, &y, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }

    // Run Alltoallv.
    {
        std::vector<int> snd(size);
        std::vector<int> rcv(size);
        std::vector<int> scounts(size, 1);
        std::vector<int> rcounts(size, 1);
        std::vector<int> sdispls(size);
        std::vector<int> rdispls(size);
        for (int i = 0, iend = sdispls.size(); i < iend; ++i) {
            sdispls[i] = rdispls[i] = i;
        }
        MPI_Alltoallv(&snd[0], &scounts[0], &sdispls[0], MPI_INT,
                      &rcv[0], &rcounts[0], &rdispls[0], MPI_INT, MPI_COMM_WORLD);
    }
}
FC_FUNC( mpi_alltoallv , MPI_ALLTOALLV ) ( void *sendbuf, int *sendcounts, int *sdispls, int *sendtype, void *recvbuf, int *recvcounts, int *rdispls, int *recvtype, int *comm, int *ierror ) { *ierror=MPI_Alltoallv(sendbuf, sendcounts, sdispls, *sendtype, recvbuf, recvcounts, rdispls, *recvtype, *comm); }
void mpiMatrix_transpose(struct mpiMatrix *matrix, struct mpi_com *uplink) { mpiMatrix_serialiseForSending(matrix, uplink); int *SRcounts = mpiMatrix_genCounts(matrix, uplink); int *SRdispl = mpiMatrix_genDispl(uplink, SRcounts); MPI_Alltoallv(matrix->data, SRcounts , SRdispl, MPI_DOUBLE, matrix->aux, SRcounts, SRdispl, MPI_DOUBLE, uplink->comm); mpiMatrix_swapDataAux(matrix); mpiMatrix_deserialiseAfterReception(matrix); free(SRcounts); free(SRdispl); }
char *
avtSamplePointCommunicator::CommunicateMessages(char **sendmessages,
                                                int   *sendcount,
                                                char **recvmessages,
                                                int   *recvcount)
{
#ifdef PARALLEL
    //
    // Figure out how much each processor needs to send/receive.
    //
    MPI_Alltoall(sendcount, 1, MPI_INT, recvcount, 1, MPI_INT, VISIT_MPI_COMM);

    //
    // Create a buffer we can receive into.
    //
    char *recvConcatList = CreateMessageStrings(recvmessages, recvcount, numProcs);

    //
    // Calculate the displacement lists.
    //
    int *senddisp = new int[numProcs];
    int *recvdisp = new int[numProcs];
    senddisp[0] = 0;
    recvdisp[0] = 0;
    for (int i = 1 ; i < numProcs ; i++)
    {
        senddisp[i] = senddisp[i-1] + sendcount[i-1];
        recvdisp[i] = recvdisp[i-1] + recvcount[i-1];
    }

    //
    // Do the actual transfer of sample points.  The messages arrays are
    // actually indexes into one big array.  Since MPI expects that big
    // array, give that (which is at location 0).
    //
    MPI_Alltoallv(sendmessages[0], sendcount, senddisp, MPI_CHAR,
                  recvmessages[0], recvcount, recvdisp, MPI_CHAR,
                  VISIT_MPI_COMM);

    delete [] senddisp;
    delete [] recvdisp;

    //
    // We need to return this buffer so the calling function can delete it.
    //
    return recvConcatList;
#else
    return 0;
#endif
}
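Several of the snippets above repeat the same three-step recipe: each rank first tells every other rank how much it will send (an MPI_Alltoall on the count arrays), both sides turn their counts into displacements with an exclusive prefix sum, and only then is MPI_Alltoallv issued. The following sketch is illustrative only and not taken from any of the projects quoted here; the function name, the byte-oriented MPI_CHAR payload, and the malloc'd output buffers are assumptions.

/* Illustrative sketch (not from the projects above): exchange per-rank counts,
 * build displacements by exclusive prefix sum, then call MPI_Alltoallv. */
#include <mpi.h>
#include <stdlib.h>

void exchange_variable_blocks(const char *sendbuf, const int *sendcnt, MPI_Comm comm,
                              char **recvbuf_out, int **recvcnt_out)
{
    int nprocs;
    MPI_Comm_size(comm, &nprocs);

    int *recvcnt = malloc(nprocs * sizeof(int));
    int *sdispl  = malloc(nprocs * sizeof(int));
    int *rdispl  = malloc(nprocs * sizeof(int));

    /* Step 1: every rank learns how much it will receive from every other rank. */
    MPI_Alltoall((void *)sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, comm);

    /* Step 2: exclusive prefix sums turn counts into displacements. */
    sdispl[0] = rdispl[0] = 0;
    for (int i = 1; i < nprocs; ++i) {
        sdispl[i] = sdispl[i-1] + sendcnt[i-1];
        rdispl[i] = rdispl[i-1] + recvcnt[i-1];
    }

    /* Total receive volume is the last displacement plus the last count. */
    char *recvbuf = malloc(rdispl[nprocs-1] + recvcnt[nprocs-1]);

    /* Step 3: the variable-size exchange itself. */
    MPI_Alltoallv((void *)sendbuf, (int *)sendcnt, sdispl, MPI_CHAR,
                  recvbuf, recvcnt, rdispl, MPI_CHAR, comm);

    *recvbuf_out = recvbuf;
    *recvcnt_out = recvcnt;
    free(sdispl);
    free(rdispl);
}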
/* communicate integers and doubles according
 * to the pattern computed by COMALL_Pattern */
int COMALL_Repeat (void *pattern)
{
  COMALLPATTERN *pp = pattern;
  COMDATA *cd;
  int i;

  for (i = 0; i < pp->ncpu; i ++) pp->send_position [i] = pp->recv_position [i] = 0;

  /* pack ints */
  for (i = 0, cd = pp->send; i < pp->nsend; i ++, cd ++)
  {
    if (cd->ints)
    {
      MPI_Pack (cd->i, cd->ints, MPI_INT, &pp->send_data [pp->send_disps [cd->rank]],
                pp->send_counts [cd->rank], &pp->send_position [cd->rank], pp->comm);
    }
  }

  /* pack doubles */
  for (i = 0, cd = pp->send; i < pp->nsend; i ++, cd ++)
  {
    if (cd->doubles)
    {
      MPI_Pack (cd->d, cd->doubles, MPI_DOUBLE, &pp->send_data [pp->send_disps [cd->rank]],
                pp->send_counts [cd->rank], &pp->send_position [cd->rank], pp->comm);
    }
  }

#if DEBUG
  for (i = 0; i < pp->ncpu; i ++)
  {
    ASSERT_DEBUG (pp->send_position [i] <= pp->send_counts [i], "Incorrect packing");
  }
#endif

  /* all to all send and receive */
  MPI_Alltoallv (pp->send_data, pp->send_counts, pp->send_disps, MPI_PACKED,
                 pp->recv_data, pp->recv_counts, pp->recv_disps, MPI_PACKED, pp->comm);

  if (pp->recv_size)
  {
    /* unpack data */
    for (i = 0; i < pp->ncpu; i ++)
    {
      MPI_Unpack (&pp->recv_data [pp->recv_disps [i]], pp->recv_counts [i], &pp->recv_position [i],
                  pp->recv [i].i, pp->recv [i].ints, MPI_INT, pp->comm);
      MPI_Unpack (&pp->recv_data [pp->recv_disps [i]], pp->recv_counts [i], &pp->recv_position [i],
                  pp->recv [i].d, pp->recv [i].doubles, MPI_DOUBLE, pp->comm);
    }
  }

  return pp->send_size;
}
int main( int argc, char* argv[] ) { int i, j; int myrank, nprocs; char *sbuf, *rbuf; int *scnt, *rcnt; int *sdpl, *rdpl; int dsize; int ssize, rsize; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &myrank ); MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); MPI_Type_size(DATATYPE, &dsize); scnt = malloc( sizeof(int)*nprocs ); sdpl = malloc( sizeof(int)*nprocs ); rcnt = malloc( sizeof(int)*nprocs ); rdpl = malloc( sizeof(int)*nprocs ); for( i=0; i<nprocs; i++ ) { scnt[i]=SIZE*(i+1)*(myrank+1); rcnt[i]=SIZE*(i+1)*(myrank+1); sdpl[i]=SIZE*((i*(i+1))/2)*(myrank+1); rdpl[i]=SIZE*((i*(i+1))/2)*(myrank+1); } ssize=0; for(i=0; i<nprocs; i++) ssize+=scnt[i]; rsize=0; for(i=0; i<nprocs; i++) rsize+=rcnt[i]; sbuf = (char*) malloc( SIZE*dsize*ssize ); rbuf = (char*) malloc( SIZE*dsize*rsize ); for( i=0; i<REPEAT; i++ ) { MPI_Alltoallv( sbuf, scnt, sdpl, DATATYPE, rbuf, rcnt, rdpl, DATATYPE, MPI_COMM_WORLD ); } fprintf(stdout, "DONE (rank %d)!\n", myrank); MPI_Finalize(); }
/* * Class: mpi_Intracomm * Method: Alltoallv * Signature: (Ljava/lang/Object;II[ILmpi/Datatype;Ljava/lang/Object;I[I[ILmpi/Datatype;)V */ JNIEXPORT void JNICALL Java_mpi_Intracomm_alltoallv(JNIEnv *env, jobject jthis, jobject sendbuf, jint sendoffset, jintArray sendcount, jintArray sdispls, jobject sendtype, jobject recvbuf, jint recvoffset, jintArray recvcount, jintArray rdispls, jobject recvtype) { jint *rcount, *scount, *sdps, *rdps ; jboolean isCopy ; MPI_Comm mpi_comm = (MPI_Comm)((*env)->GetLongField(env,jthis,ompi_java.CommhandleID)) ; MPI_Datatype mpi_stype = (MPI_Datatype) ((*env)->GetLongField(env,sendtype,ompi_java.DatatypehandleID)) ; MPI_Datatype mpi_rtype = (MPI_Datatype) ((*env)->GetLongField(env, recvtype, ompi_java.DatatypehandleID)) ; int sbaseType = (*env)->GetIntField(env, sendtype, ompi_java.DatatypebaseTypeID) ; int rbaseType = (*env)->GetIntField(env, recvtype, ompi_java.DatatypebaseTypeID) ; void *sendptr, *recvptr ; void *sbufbase, *rbufbase ; ompi_java_clearFreeList(env) ; scount=(*env)->GetIntArrayElements(env,sendcount,&isCopy); rcount=(*env)->GetIntArrayElements(env,recvcount,&isCopy); sdps=(*env)->GetIntArrayElements(env,sdispls,&isCopy); rdps=(*env)->GetIntArrayElements(env,rdispls,&isCopy); recvptr = ompi_java_getBufPtr(&rbufbase, env, recvbuf, rbaseType, recvoffset) ; sendptr = ompi_java_getBufPtr(&sbufbase, env, sendbuf, sbaseType, sendoffset) ; MPI_Alltoallv(sendptr, (int*) scount, (int*) sdps, mpi_stype, recvptr, (int*) rcount, (int*) rdps, mpi_rtype, mpi_comm) ; ompi_java_releaseBufPtr(env, sendbuf, sbufbase, sbaseType) ; ompi_java_releaseBufPtr(env, recvbuf, rbufbase, rbaseType) ; (*env)->ReleaseIntArrayElements(env,recvcount,rcount,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,sendcount,scount,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,sdispls,sdps,JNI_ABORT); (*env)->ReleaseIntArrayElements(env,rdispls,rdps,JNI_ABORT); }
int ZMPI_Reduce_scatter_block_intsum_proclists_alltoallv(const int *sendbuf, int nsendprocs, int *sendprocs, int *recvbuf, int recvcount, int nrecvprocs, int *recvprocs, MPI_Comm comm) /* zmpi_func ZMPI_Reduce_scatter_block_intsum_proclists_alltoallv */ { int i, j, size, rank; int *recvbuf_full; int *scounts, *sdispls, *rcounts, *rdispls; MPI_Comm_size(comm, &size); MPI_Comm_rank(comm, &rank); recvbuf_full = z_alloc(nrecvprocs * recvcount, sizeof(int)); scounts = z_alloc(4 * size, sizeof(int)); sdispls = scounts + 1 * size; rcounts = scounts + 2 * size; rdispls = scounts + 3 * size; memset(scounts, 0, 4 * size * sizeof(int)); for (j = 0; j < nrecvprocs; ++j) { rcounts[recvprocs[j]] = recvcount; rdispls[recvprocs[j]] = j * recvcount; } for (j = 0; j < nsendprocs; ++j) { scounts[sendprocs[j]] = recvcount; sdispls[sendprocs[j]] = sendprocs[j] * recvcount; } MPI_Alltoallv((void *) sendbuf, scounts, sdispls, MPI_INT, recvbuf_full, rcounts, rdispls, MPI_INT, comm); for (i = 0; i < recvcount; ++i) recvbuf[i] = DEFAULT_INT; for (j = 0; j < nrecvprocs; ++j) for (i = 0; i < recvcount; ++i) recvbuf[i] += recvbuf_full[j * recvcount + i]; z_free(scounts); z_free(recvbuf_full); return MPI_SUCCESS; }
void transpose (Real **b, int size, int *len, int *disp, int rank, int m){ int i, *sendcounts, *rdispls; Real *sendbuf, *recvbuf; sendbuf = createRealArray (m * len[rank]); recvbuf = createRealArray (m * len[rank]); sendcounts = calloc(size,sizeof(int)); rdispls = calloc(size,sizeof(int)); matrixToVector(b,sendbuf,len,disp, size, rank); int index = 0; for (int i = 0; i < size; ++i) { sendcounts[i]= len[rank]*len[i]; rdispls[i]=index; index=index+sendcounts[i]; } MPI_Alltoallv(sendbuf, sendcounts, rdispls, MPI_DOUBLE, recvbuf, sendcounts, rdispls, MPI_DOUBLE, MPI_COMM_WORLD); vectorToMatrix(b,recvbuf,len,disp, size, rank); }
void ompi_alltoallv_f(char *sendbuf, MPI_Fint *sendcounts, MPI_Fint *sdispls, MPI_Fint *sendtype, char *recvbuf, MPI_Fint *recvcounts, MPI_Fint *rdispls, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *ierr) { MPI_Comm c_comm; MPI_Datatype c_sendtype, c_recvtype; int size, c_ierr; OMPI_ARRAY_NAME_DECL(sendcounts); OMPI_ARRAY_NAME_DECL(sdispls); OMPI_ARRAY_NAME_DECL(recvcounts); OMPI_ARRAY_NAME_DECL(rdispls); c_comm = MPI_Comm_f2c(*comm); c_sendtype = MPI_Type_f2c(*sendtype); c_recvtype = MPI_Type_f2c(*recvtype); MPI_Comm_size(c_comm, &size); OMPI_ARRAY_FINT_2_INT(sendcounts, size); OMPI_ARRAY_FINT_2_INT(sdispls, size); OMPI_ARRAY_FINT_2_INT(recvcounts, size); OMPI_ARRAY_FINT_2_INT(rdispls, size); sendbuf = (char *) OMPI_F2C_IN_PLACE(sendbuf); sendbuf = (char *) OMPI_F2C_BOTTOM(sendbuf); recvbuf = (char *) OMPI_F2C_BOTTOM(recvbuf); c_ierr = MPI_Alltoallv(sendbuf, OMPI_ARRAY_NAME_CONVERT(sendcounts), OMPI_ARRAY_NAME_CONVERT(sdispls), c_sendtype, recvbuf, OMPI_ARRAY_NAME_CONVERT(recvcounts), OMPI_ARRAY_NAME_CONVERT(rdispls), c_recvtype, c_comm); if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr); OMPI_ARRAY_FINT_2_INT_CLEANUP(sendcounts); OMPI_ARRAY_FINT_2_INT_CLEANUP(sdispls); OMPI_ARRAY_FINT_2_INT_CLEANUP(recvcounts); OMPI_ARRAY_FINT_2_INT_CLEANUP(rdispls); }
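The Fortran shims above go out of their way to translate the Fortran MPI_IN_PLACE sentinel before calling the C routine. On the C side, newer MPI standards let the caller pass MPI_IN_PLACE as sendbuf directly; the send counts, displacements, and type are then ignored, and the blocks described by the receive arguments are both the data sent and the place where received data lands (so the exchange must be pairwise symmetric). A minimal hedged sketch, with an assumed function name and MPI_DOUBLE payload:

/* Illustrative sketch: in-place MPI_Alltoallv. The send arguments are ignored
 * when sendbuf is MPI_IN_PLACE; counts/displs describe recvbuf, which supplies
 * the outgoing blocks and receives the incoming ones. */
#include <mpi.h>

void inplace_exchange(double *buf, const int *counts, const int *displs, MPI_Comm comm)
{
    MPI_Alltoallv(MPI_IN_PLACE, NULL, NULL, MPI_DATATYPE_NULL,
                  buf, (int *)counts, (int *)displs, MPI_DOUBLE, comm);
}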
void transpose(real **B, size_t block_col, size_t m, size_t nprocs,
               size_t block_uk, size_t rem_uk, size_t rank)
{
    // Create send and recv Vectors:
    real *sendV = mk_1D_array(block_col*m, false);
    real *recvV = mk_1D_array(block_col*m, false);

    // Create send and recv Count and Displacement for MPI:
    int scount[nprocs], sdisp[nprocs], rcount[nprocs], rdisp[nprocs];
    for (size_t i = 0; i < nprocs; i++){
        scount[i] = block_uk;
        sdisp[i] = block_uk*i;
        rcount[i] = block_uk;
        rdisp[i] = block_uk*i;
    }
    scount[nprocs-1] = rem_uk;
    rcount[nprocs-1] = rem_uk;

    // Wrap Data into the 1D Array sendV:
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++){
        for (size_t j = 0; j < block_col; j++){
            sendV[j + i*block_col] = B[j][i];
        }
    }

    // Communicate with all processes:
    MPI_Alltoallv(sendV, scount, sdisp, MPI_DOUBLE,
                  recvV, rcount, rdisp, MPI_DOUBLE, MPI_COMM_WORLD);

    // Unwrap the Data into the 2D array B, from recvV:
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nprocs; i++){
        int offset = rdisp[i], count = (rcount[i])/block_col;
        // #pragma omp parallel for schedule(static)
        for (size_t k = 0; k < block_col; k++){
            for (size_t j = 0; j < count; j++){
                B[k][j+(m/nprocs)*i] = recvV[offset + k*count + j];
            }
        }
    }
}
static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph) { dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp, minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm); DEALLOC(*ph); ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal); for (int task=0; task<minfo->ntasks; ++task) for (int th=0; th<minfo->npair[minfo->mytask]; ++th) for (int mi=0; mi<minfo->nm[task]; ++mi) { int m = minfo->mval[mi+minfo->ofs_m[task]]; int o1 = minfo->nph*(th*(minfo->mmax+1) + m); int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); for (int i=0; i<minfo->nph; ++i) (*ph)[o1+i] = phas_tmp[o2+i]; } DEALLOC(phas_tmp); }
void MPItranspose(double **b, double **bt, int nrColon, int m, double *sendbuf, double *recbuf,
                  int *sendcnt, int *sdispls, int size, int rank, int *displs)
{
    int tt = 0;                                   // counter
    for (int o = 0; o < size; o++) {              // loop over each process
        for (int i = 0; i < nrColon; i++) {
            // walk over what is to be sent to the process with rank o
            for (int j = displs[o]; j < displs[o+1]; j++) {
                sendbuf[tt] = b[i][j];            // fill sendbuf
                tt++;
            }
        }
    }

    // Exchange with all processes.
    MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPI_DOUBLE,
                  recbuf, sendcnt, sdispls, MPI_DOUBLE, MPI_COMM_WORLD);

    tt = 0;
    for (int o = 0; o < size; o++) {              // loop over each process
        // take the displacement first so the contents themselves get transposed as well
        for (int j = displs[o]; j < displs[o+1]; j++) {
            for (int i = 0; i < nrColon; i++) {
                bt[i][j] = recbuf[tt];            // write into bt
                tt++;
            }
        }
    }
}
static int kmr_alltoallv_mpi(KMR *mr, void *sbuf, long *scnts, long *sdsps, void *rbuf, long *rcnts, long *rdsps) { MPI_Comm comm = mr->comm; int nprocs = mr->nprocs; int *ssz = kmr_malloc(sizeof(int) * (size_t)nprocs); int *sdp = kmr_malloc(sizeof(int) * (size_t)nprocs); int *rsz = kmr_malloc(sizeof(int) * (size_t)nprocs); int *rdp = kmr_malloc(sizeof(int) * (size_t)nprocs); for (int r = 0; r < nprocs; r++) { assert(INT_MIN * 8L <= scnts[r] && scnts[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= rcnts[r] && rcnts[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= sdsps[r] && sdsps[r] <= INT_MAX * 8L); assert(INT_MIN * 8L <= rdsps[r] && rdsps[r] <= INT_MAX * 8L); assert(((scnts[r] & 7) == 0) && ((rcnts[r] & 7) == 0) && ((sdsps[r] & 7) == 0) && ((rdsps[r] & 7) == 0)); ssz[r] = (int)(scnts[r] / 8L); rsz[r] = (int)(rcnts[r] / 8L); sdp[r] = (int)(sdsps[r] / 8L); rdp[r] = (int)(rdsps[r] / 8L); } int cc; cc = MPI_Alltoallv(sbuf, ssz, sdp, MPI_LONG, rbuf, rsz, rdp, MPI_LONG, comm); assert(cc == MPI_SUCCESS); kmr_free(ssz, (sizeof(int) * (size_t)nprocs)); kmr_free(rsz, (sizeof(int) * (size_t)nprocs)); kmr_free(sdp, (sizeof(int) * (size_t)nprocs)); kmr_free(rdp, (sizeof(int) * (size_t)nprocs)); return MPI_SUCCESS; }
void transpose (Real **A, int m, int n, int size, int bb, int bre) { int se[size], sd[size], re[size], rd[size]; Real *V = createRealArray (n*m); Real *Vt = createRealArray (n*m); for (int i = 0; i < size; ++i) { se[i] = bb; sd[i] = bb*i; re[i] = bb; rd[i] = bb*i; } se[size-1] = bre; re[size-1] = bre; for(int i = 0; i < n; i++) { for(int j = 0; j < m; j++) { V[j + i*m] = A[j][i]; } } MPI_Alltoallv(V, se, sd, MPI_DOUBLE, Vt, re, rd, MPI_DOUBLE, MPI_COMM_WORLD); fillA(A, Vt, re, rd, m, n, size); }
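In the transpose kernels above (and in several of the Poisson solvers earlier in this list), every pair of ranks trades blocks of identical size, so a single count array and a single displacement array can serve both the send and the receive side of MPI_Alltoallv. Below is a minimal sketch of that symmetric pattern; the function name and contiguous block layout are assumptions, not code from the quoted solvers.

/* Illustrative sketch (assumed layout, not from the solvers above): when every
 * pair of ranks exchanges equally sized blocks, the same counts/displs arrays
 * can be passed for both sides of MPI_Alltoallv. */
#include <mpi.h>
#include <stdlib.h>

void symmetric_block_exchange(const double *sendbuf, double *recvbuf,
                              int block, MPI_Comm comm)
{
    int nprocs;
    MPI_Comm_size(comm, &nprocs);

    int *cnt  = malloc(nprocs * sizeof(int));
    int *disp = malloc(nprocs * sizeof(int));
    for (int i = 0; i < nprocs; ++i) {
        cnt[i]  = block;          /* every pair exchanges `block` doubles */
        disp[i] = i * block;      /* blocks are stored back to back       */
    }

    /* One array pair describes both directions of the exchange. */
    MPI_Alltoallv((void *)sendbuf, cnt, disp, MPI_DOUBLE,
                  recvbuf, cnt, disp, MPI_DOUBLE, comm);

    free(cnt);
    free(disp);
}

With fully equal blocks this degenerates to plain MPI_Alltoall, which is exactly the fast path the FFTW-style transposes above take when their blocks are equal.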
/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. */ int pzPermute_Dense_Matrix ( int_t fst_row, int_t m_loc, int_t row_to_proc[], int_t perm[], doublecomplex X[], int ldx, doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid ) { int_t i, j, k, l; int p, procs; int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()"); #endif procs = grid->nprow * grid->npcol; if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) ABORT("Malloc fails for sendcnts[]."); sendcnts_nrhs = sendcnts + procs; recvcnts = sendcnts_nrhs + procs; recvcnts_nrhs = recvcnts + procs; sdispls = recvcnts_nrhs + procs; sdispls_nrhs = sdispls + procs; rdispls = sdispls_nrhs + procs; rdispls_nrhs = rdispls + procs; ptr_to_ibuf = rdispls_nrhs + procs; ptr_to_dbuf = ptr_to_ibuf + procs; for (i = 0; i < procs; ++i) sendcnts[i] = 0; /* Count the number of X entries to be sent to each process.*/ for (i = fst_row; i < fst_row + m_loc; ++i) { p = row_to_proc[perm[i]]; ++sendcnts[p]; } MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; sendcnts_nrhs[0] = sendcnts[0] * nrhs; recvcnts_nrhs[0] = recvcnts[0] * nrhs; for (i = 1; i < procs; ++i) { sdispls[i] = sdispls[i-1] + sendcnts[i-1]; sdispls_nrhs[i] = sdispls[i] * nrhs; rdispls[i] = rdispls[i-1] + recvcnts[i-1]; rdispls_nrhs[i] = rdispls[i] * nrhs; sendcnts_nrhs[i] = sendcnts[i] * nrhs; recvcnts_nrhs[i] = recvcnts[i] * nrhs; } k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ /*assert(k == m_loc);*/ /*assert(l == m_loc);*/ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (i = 0; i < procs; ++i) { ptr_to_ibuf[i] = sdispls[i]; ptr_to_dbuf[i] = sdispls_nrhs[i]; } /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ for (i = fst_row; i < fst_row + m_loc; ++i) { j = perm[i]; p = row_to_proc[j]; send_ibuf[ptr_to_ibuf[p]] = j; j = ptr_to_dbuf[p]; RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ send_dbuf[j++] = X[i-fst_row + k*ldx]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Transfer the (permuted) row indices and numerical values. */ MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* Copy the buffer into b. */ for (i = 0, l = 0; i < m_loc; ++i) { j = recv_ibuf[i] - fst_row; /* Relative row number */ RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ B[j + k*ldb] = recv_dbuf[l++]; } } SUPERLU_FREE(sendcnts); SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()"); #endif return 0; } /* pzPermute_Dense_Matrix */
/*! \brief Gather A from the distributed compressed row format to global A in compressed column format. */ int pzCompRow_loc_to_CompCol_global ( int_t need_value, /* Input. Whether need to gather numerical values */ SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ gridinfo_t *grid, /* Input */ SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; NCformat *GAstore; doublecomplex *a, *a_loc; int_t *colind, *rowptr; int_t *colptr_loc, *rowind_loc; int_t m_loc, n, i, j, k, l; int_t colnnz, fst_row, nnz_loc, nnz; doublecomplex *a_recv; /* Buffer to receive the blocks of values. */ doublecomplex *a_buf; /* Buffer to merge blocks into block columns. */ int_t *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the local block rows. Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after redistribution to the local block columns. Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ int_t *fst_rows, *n_locs; int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; int it, n_loc, procs; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global"); #endif /* Initialization. */ n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ /* ------------------------------------------------------------ FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. ------------------------------------------------------------*/ zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, &rowind_loc, &colptr_loc); /* Change local row index numbers to global numbers. */ for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); PrintInt10("colptr_loc", n+1, colptr_loc); #endif procs = grid->nprow * grid->npcol; if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, grid->comm); for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; n_locs[procs-1] = n - fst_rows[procs-1]; if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; itemp_32 = sdispls + procs; /* All-to-all transfer column pointers of each block. Now the matrix view is P-by-P block-partition. 
*/ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) ABORT("Malloc fails for colptr_send[]"); if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) ABORT("Malloc fails for colptr_blk[]"); for (i = 0, j = 0; i < procs; ++i) { for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) assert(j == fst_rows[i]); #endif sdispls[i] = j + i; recvcnts[i] = n_loc + 1; rdispls[i] = i * (n_loc + 1); j += n_locs[i]; /* First column of next block in colptr_loc[] */ } MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. */ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; for (i = 0; i < procs; ++i) { for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { colnnz = colptr_blk[j+1] - colptr_blk[j]; /*assert(k<=j);*/ colptr_blk[k] = nnz; nnz += colnnz; /* Start of the next column */ ++k; } colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; for (i = 0; i < procs-1; ++i) { sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; sdispls[i+1] = sdispls[i] + sendcnts[i]; } sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; for (i = 0; i < procs; ++i) { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); a_buf = a_recv + k; MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX, a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; for (j = 0; j < n_loc; ++j) { colnnz = 0; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1) + j; /* j-th column in i-th block */ colnnz += colptr_blk[k+1] - colptr_blk[k]; } colptr_loc[j+1] = colptr_loc[j] + colnnz; itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. */ for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { rowind_buf[itemp[j]] = rowind_recv[l]; ++itemp[j]; } } } if ( need_value ) { for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { a_buf[itemp[j]] = a_recv[l]; ++itemp[j]; } } } } /* ------------------------------------------------------------ SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. 
------------------------------------------------------------*/ GA->nrow = A->nrow; GA->ncol = A->ncol; GA->Stype = SLU_NC; GA->Dtype = A->Dtype; GA->Mtype = A->Mtype; GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); /* First gather the size of each piece. */ nnz_loc = colptr_loc[n_loc]; MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; GAstore->nnz = nnz; if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } itemp_32[procs-1] = itemp[procs-1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, itemp_32, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } itemp_32[procs-1] = n_locs[procs-1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. */ for (i = 1; i < procs; ++i) { k = rdispls[i]; for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; itemp[i] += itemp[i-1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("After pdCompRow_loc_to_CompCol_global()\n"); zPrint_CompCol_Matrix_dist(GA); } #endif SUPERLU_FREE(a_loc); SUPERLU_FREE(rowind_loc); SUPERLU_FREE(colptr_loc); SUPERLU_FREE(fst_rows); SUPERLU_FREE(recvcnts); SUPERLU_FREE(colptr_send); SUPERLU_FREE(colptr_blk); SUPERLU_FREE(rowind_recv); if ( need_value) SUPERLU_FREE(a_recv); #if ( DEBUGlevel>=1 ) if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global"); #endif return 0; } /* pzCompRow_loc_to_CompCol_global */
/* For MPI distributed memory. */ void scramble_edges_mpi(MPI_Comm comm, const uint64_t userseed1, const uint64_t userseed2, const int64_t local_nedges_in, const int64_t* const local_edges_in, int64_t* const local_nedges_out_ptr, int64_t** const local_edges_out_ptr /* Allocated using xmalloc() by scramble_edges_mpi */) { int rank, size; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); mrg_state st; uint_fast32_t seed[5]; make_mrg_seed(userseed1, userseed2, seed); mrg_seed(&st, seed); mrg_skip(&st, 5, 0, 0); /* To make offset different from other PRNG uses */ int64_t total_nedges; MPI_Allreduce((void*)&local_nedges_in, &total_nedges, 1, INT64_T_MPI_TYPE, MPI_SUM, comm); int64_t local_nedges_out; /* = local permutation size */ int64_t* local_perm; rand_sort_mpi(comm, &st, total_nedges, &local_nedges_out, &local_perm); *local_nedges_out_ptr = local_nedges_out; /* Gather permutation information and fast owner lookup cache (code in * apply_permutation_mpi.c). */ int64_t* edge_displs = (int64_t*)xmalloc((size + 1) * sizeof(int64_t)); int* edge_owner_table; int64_t* edge_owner_cutoff; int lg_minedgecount; int64_t maxedgecount; gather_block_distribution_info(comm, local_nedges_in, total_nedges, edge_displs, &edge_owner_table, &edge_owner_cutoff, &lg_minedgecount, &maxedgecount); /* Originally from apply_permutation_mpi.c */ #define LOOKUP_EDGE_OWNER(v) \ (edge_owner_table[(v) >> lg_minedgecount] + \ ((v) >= edge_owner_cutoff[(v) >> lg_minedgecount])) /* Apply permutation. Output distribution is same as distribution of * generated edge permutation. */ /* Count number of requests to send to each destination. */ int* send_counts = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */ int64_t i; for (i = 0; i < local_nedges_out; ++i) { ++send_counts[LOOKUP_EDGE_OWNER(local_perm[i])]; } /* Prefix sum to get displacements. */ int* send_displs = (int*)xmalloc((size + 1) * sizeof(int)); send_displs[0] = 0; for (i = 0; i < size; ++i) { send_displs[i + 1] = send_displs[i] + send_counts[i]; } assert (send_displs[size] == local_nedges_out); /* Put edges into buffer by destination; also keep around index values for * where to write the result. */ int64_t* sendbuf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t)); int64_t* reply_loc_buf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t)); int* send_offsets = (int*)xmalloc((size + 1) * sizeof(int)); memcpy(send_offsets, send_displs, (size + 1) * sizeof(int)); for (i = 0; i < local_nedges_out; ++i) { int write_index = send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])]; sendbuf[write_index] = local_perm[i]; reply_loc_buf[write_index] = i; ++send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])]; } for (i = 0; i < size; ++i) assert (send_offsets[i] == send_displs[i + 1]); free(send_offsets); send_offsets = NULL; free(local_perm); local_perm = NULL; #undef LOOKUP_EDGE_OWNER free(edge_owner_table); edge_owner_table = NULL; free(edge_owner_cutoff); edge_owner_cutoff = NULL; /* Find out how many requests I will be receiving. */ int* recv_counts = (int*)xmalloc(size * sizeof(int)); MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm); /* Compute their displacements. */ int* recv_displs = (int*)xmalloc((size + 1) * sizeof(int)); recv_displs[0] = 0; for (i = 0; i < size; ++i) { recv_displs[i + 1] = recv_displs[i] + recv_counts[i]; } /* Make receive and reply buffers. 
*/ int64_t* recvbuf = (int64_t*)xmalloc(recv_displs[size] * sizeof(int64_t)); int64_t* replybuf = (int64_t*)xmalloc(recv_displs[size] * 2 * sizeof(int64_t)); /* Move requests for edges into receive buffer. */ MPI_Alltoallv(sendbuf, send_counts, send_displs, INT64_T_MPI_TYPE, recvbuf, recv_counts, recv_displs, INT64_T_MPI_TYPE, comm); free(sendbuf); sendbuf = NULL; /* Put requested edges into response buffer. */ int64_t my_edge_offset = edge_displs[rank]; for (i = 0; i < recv_displs[size]; ++i) { replybuf[i * 2 + 0] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 0]; replybuf[i * 2 + 1] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 1]; } free(recvbuf); recvbuf = NULL; free(edge_displs); edge_displs = NULL; /* Send replies back. */ int64_t* reply_edges = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t)); for (i = 0; i < size; ++i) { /* Sending back two values for each request */ recv_counts[i] *= 2; recv_displs[i] *= 2; send_counts[i] *= 2; send_displs[i] *= 2; } MPI_Alltoallv(replybuf, recv_counts, recv_displs, INT64_T_MPI_TYPE, reply_edges, send_counts, send_displs, INT64_T_MPI_TYPE, comm); free(replybuf); replybuf = NULL; free(recv_counts); recv_counts = NULL; free(recv_displs); recv_displs = NULL; free(send_counts); send_counts = NULL; free(send_displs); send_displs = NULL; /* Make output array of edges. */ int64_t* local_edges_out = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t)); *local_edges_out_ptr = local_edges_out; /* Put edges into output array. */ for (i = 0; i < local_nedges_out; ++i) { local_edges_out[reply_loc_buf[i] * 2 + 0] = reply_edges[2 * i + 0]; local_edges_out[reply_loc_buf[i] * 2 + 1] = reply_edges[2 * i + 1]; } free(reply_loc_buf); reply_loc_buf = NULL; free(reply_edges); reply_edges = NULL; }