int exch_addr(void)
{
    MPI_Status status;
    int i, rc;

    rc = MPI_Alltoall((void *)conn.qp_num, sizeof(uint32_t), MPI_BYTE,
                      (void *)rbuf.qp_num, sizeof(uint32_t), MPI_BYTE,
                      l_state.world_comm);
    assert(!rc);
    rc = MPI_Alltoall((void *)conn.lid, sizeof(uint16_t), MPI_BYTE,
                      (void *)rbuf.lid, sizeof(uint16_t), MPI_BYTE,
                      l_state.world_comm);
    assert(!rc);
#ifdef DEBUG
    for (i = 0; i < nprocs; i++) {
        if (me == i)
            continue;
        fprintf(stdout, "[%d] Remote QP %d, Remote LID %u, Rkey %u, Lkey %u\n"
                "    LBuf %p, RBuf %p\n",
                me, rbuf.qp_num[i], rbuf.lid[i], rbuf.rkey[i],
                lbuf.mr->lkey, lbuf.buf, rbuf.buf[i]);
        fflush(stdout);
    }
#endif
    return 0;
}
static void apply(const plan *ego_, R *I, R *O)
{
    const P *ego = (const P *) ego_;
    plan_rdft *cld1, *cld2, *cld2rest, *cld3;

    /* transpose locally to get contiguous chunks */
    cld1 = (plan_rdft *) ego->cld1;
    if (cld1) {
        cld1->apply(ego->cld1, I, O);

        /* transpose chunks globally */
        if (ego->equal_blocks)
            MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
                         I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
                         ego->comm);
        else
            MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
                          FFTW_MPI_TYPE,
                          I, ego->recv_block_sizes, ego->recv_block_offsets,
                          FFTW_MPI_TYPE,
                          ego->comm);
    }
    else { /* TRANSPOSED_IN, no need to destroy input */
        /* transpose chunks globally */
        if (ego->equal_blocks)
            MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
                         O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
                         ego->comm);
        else
            MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
                          FFTW_MPI_TYPE,
                          O, ego->recv_block_sizes, ego->recv_block_offsets,
                          FFTW_MPI_TYPE,
                          ego->comm);
        I = O; /* final transpose (if any) is in-place */
    }

    /* transpose locally, again, to get ordinary row-major */
    cld2 = (plan_rdft *) ego->cld2;
    if (cld2) {
        cld2->apply(ego->cld2, I, O);
        cld2rest = (plan_rdft *) ego->cld2rest;
        if (cld2rest) { /* leftover from unequal block sizes */
            cld2rest->apply(ego->cld2rest,
                            I + ego->rest_Ioff, O + ego->rest_Ooff);
            cld3 = (plan_rdft *) ego->cld3;
            if (cld3)
                cld3->apply(ego->cld3, O, O);
            /* else TRANSPOSED_OUT is true and user wants O transposed */
        }
    }
}
/*
 * Class:     mpi_Intracomm
 * Method:    Alltoall
 * Signature: (Ljava/lang/Object;IILmpi/Datatype;Ljava/lang/Object;IILmpi/Datatype;)V
 */
JNIEXPORT void JNICALL Java_mpi_Intracomm_alltoall(JNIEnv *env, jobject jthis,
                                                   jobject sendbuf, jint sendoffset,
                                                   jint sendcount, jobject sendtype,
                                                   jobject recvbuf, jint recvoffset,
                                                   jint recvcount, jobject recvtype)
{
    MPI_Comm mpi_comm =
        (MPI_Comm)((*env)->GetLongField(env, jthis, ompi_java.CommhandleID));
    MPI_Datatype mpi_stype =
        (MPI_Datatype)((*env)->GetLongField(env, sendtype, ompi_java.DatatypehandleID));
    MPI_Datatype mpi_rtype =
        (MPI_Datatype)((*env)->GetLongField(env, recvtype, ompi_java.DatatypehandleID));
    int sbaseType = (*env)->GetIntField(env, sendtype, ompi_java.DatatypebaseTypeID);
    int rbaseType = (*env)->GetIntField(env, recvtype, ompi_java.DatatypebaseTypeID);
    void *sendptr, *recvptr;
    void *sbufbase, *rbufbase;

    ompi_java_clearFreeList(env);

    recvptr = ompi_java_getBufPtr(&rbufbase, env, recvbuf, rbaseType, recvoffset);
    sendptr = ompi_java_getBufPtr(&sbufbase, env, sendbuf, sbaseType, sendoffset);

    MPI_Alltoall(sendptr, sendcount, mpi_stype,
                 recvptr, recvcount, mpi_rtype, mpi_comm);

    ompi_java_releaseBufPtr(env, sendbuf, sbufbase, sbaseType);
    ompi_java_releaseBufPtr(env, recvbuf, rbufbase, rbaseType);
}
int main( int argc, char* argv[] )
{
    int i, j;
    int myrank, nprocs;
    char *sbuf, *rbuf;
    int dsize;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

    MPI_Type_size(DATATYPE, &dsize);
    sbuf = (char*)malloc(SIZE*dsize*nprocs);
    rbuf = (char*)malloc(SIZE*dsize*nprocs);

    for( i=0; i<REPEAT; i++ ) {
        MPI_Alltoall( sbuf, SIZE, DATATYPE, rbuf, SIZE, DATATYPE, MPI_COMM_WORLD );
    }

    MPI_Finalize();
    return 0;
}
static inline void execute_predefined_op(int opnum, void* args, void* scratch)
{
    if (opnum == -1) {
        MPI_Barrier(G_GOAL_WorldComm);
    }
    else if (opnum == -2) {
        struct bcast_args* bc = (struct bcast_args*) args;
        MPI_Bcast(bc->buffer, bc->count, MPI_BYTE, bc->root, G_GOAL_WorldComm);
    }
    else if (opnum == -3) {
        struct scatter_args* sc = (struct scatter_args*) args;
        MPI_Scatter(sc->sendbuffer, sc->count, MPI_BYTE,
                    sc->recvbuffer, sc->count, MPI_BYTE,
                    sc->root, G_GOAL_WorldComm);
    }
    else if (opnum == -4) {
        struct scatter_args* ga = (struct scatter_args*) args;
        MPI_Gather(ga->sendbuffer, ga->count, MPI_BYTE,
                   ga->recvbuffer, ga->count, MPI_BYTE,
                   ga->root, G_GOAL_WorldComm);
    }
    else if (opnum == -5) {
        struct alltoall_args* aa = (struct alltoall_args*) args;
        MPI_Alltoall(aa->sendbuffer, aa->count, MPI_BYTE,
                     aa->recvbuffer, aa->count, MPI_BYTE, G_GOAL_WorldComm);
    }
    else if (opnum == -99) {
        /* dummy op - do nothing */
    }
    else {
        printf("Predefined op number %i is not implemented yet\n", opnum);
    }
}
/*
 * Out-of-place version of transpose_mpi (or rather, in place using
 * a scratch array):
 */
static void transpose_mpi_out_of_place(transpose_mpi_plan p, int el_size,
                                       TRANSPOSE_EL_TYPE *local_data,
                                       TRANSPOSE_EL_TYPE *work)
{
    local_transpose_copy(local_data, work, el_size, p->local_nx, p->ny);

    if (p->all_blocks_equal)
        MPI_Alltoall(work, p->send_block_size * el_size, p->el_type,
                     local_data, p->recv_block_size * el_size, p->el_type,
                     p->comm);
    else {
        int i, n_pes = p->n_pes;

        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] *= el_size;
            p->recv_block_sizes[i] *= el_size;
            p->send_block_offsets[i] *= el_size;
            p->recv_block_offsets[i] *= el_size;
        }
        MPI_Alltoallv(work, p->send_block_sizes, p->send_block_offsets,
                      p->el_type,
                      local_data, p->recv_block_sizes, p->recv_block_offsets,
                      p->el_type, p->comm);
        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] /= el_size;
            p->recv_block_sizes[i] /= el_size;
            p->send_block_offsets[i] /= el_size;
            p->recv_block_offsets[i] /= el_size;
        }
    }

    do_permutation(local_data, p->perm_block_dest, p->num_perm_blocks,
                   p->perm_block_size * el_size);
}
int main(int argc, char** argv)
{
    // Initialize MPI
    MPI_Init(&argc, &argv);

    int size, rank;

    // Figure out the number of processes and our rank in the world group
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (size % 2) {
        printf("Need an even number of processes\n");
        MPI_Finalize();
        return 1;
    }

    // setup new communicators
    MPI_Comm twocomm;
    MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank%2, &twocomm);

    int senddata[2], recvdata[2];
    senddata[(rank+1)%2] = rank;
    senddata[rank%2] = 0;
    MPI_Alltoall(senddata, 1, MPI_INT, recvdata, 1, MPI_INT, twocomm);

    // print to tty
    printf("process %i: received %i\n", rank, recvdata[(rank+1)%2]);

    // close down MPI
    MPI_Finalize();

    // ay-oh-kay
    return 0;
}
void MADRE_exchange(MC* mc, int *myRecvCount, int *mySendCount)
{
    int i;
    Particle *p;
    p = mc->particles;

    // cache blockLength
    int blockLength = MADRE_BLOCK_LENGTH;

    /* MADRE_pack should have constructed an integer number of blocks */
    assert(mc->nparticles % (int)MADRE_BLOCK_LENGTH == 0);
    int liveBlocks = mc->nparticles/blockLength;

    for (i = 0; i < liveBlocks; ++i)
        destRanks[i] = p[i*blockLength].proc;

    /* By default, this was set to zero */
    myRecvCount[mc->mype] = mySendCount[mc->mype];

    /* Organize destIndices by proc-rank order */
    displ[0] = 0;
    for (i = 1; i < (mc->nprocs); ++i)
        displ[i] = displ[i-1] + myRecvCount[i-1]/blockLength;

    /* Alltoall where each proc can start receiving particles to get destIndices */
    MPI_Alltoall(displ, 1, MPI_INT, sdispl, 1, MPI_INT, MPI_COMM_WORLD);

    for (i = 0; i < liveBlocks; ++i) {
        destIndices[i] = sdispl[p[i*blockLength].proc];
        sdispl[p[i*blockLength].proc]++;
    }

    MADRE_redistribute(MADRE_particle, liveBlocks, destRanks, destIndices);

    mc->nparticles = isum(myRecvCount, mc->nprocs);

    /* Each proc should have an integer number of blocks after exchanges */
    assert(mc->nparticles % (int)MADRE_BLOCK_LENGTH == 0);
}
int main( int argc, char **argv )
{
    int send[4], recv[4];
    int rank, size, k;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (size != 4) {
        printf("Error: # of processors must be equal to 4\n");
        printf("Program aborting....\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (k = 0; k < size; k++)
        send[k] = (k+1) + rank*size;

    printf("%d : send = %d %d %d %d\n", rank, send[0], send[1], send[2], send[3]);

    MPI_Alltoall(send, 1, MPI_INT, recv, 1, MPI_INT, MPI_COMM_WORLD);

    printf("%d : recv = %d %d %d %d\n", rank, recv[0], recv[1], recv[2], recv[3]);

    MPI_Finalize();
    return 0;
}
/* run an exchange test with msgsz bytes per proc with bytes transferred
 * actually nproc*msgsz per exchange (all-to-all).
 */
double exchangetest(int iters, int msgsz)
{
    int64_t starttime, endtime;
    int i;
    char *sendbuf, *recvbuf;

    sendbuf = malloc(msgsz*nproc);
    recvbuf = malloc(msgsz*nproc);
    if (sendbuf == NULL || recvbuf == NULL) {
        fprintf(stderr, "malloc failed\n");
        exit(-1);
    }

    barrier();
    starttime = getMicrosecondTimeStamp();
    for (i = 0; i < iters; i++) {
        MPI_Alltoall(sendbuf, msgsz, MPI_CHAR,
                     recvbuf, msgsz, MPI_CHAR, MPI_COMM_WORLD);
    }
    endtime = getMicrosecondTimeStamp();

    free(sendbuf);
    free(recvbuf);

    return (endtime-starttime);
}
int main(int argc, char *argv[])
{
    int rank, size;
    int chunk = 128;
    int i;
    int *sb;
    int *rb;
    int status, gstatus;

    MTest_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    for (i = 1; i < argc; ++i) {
        if (argv[i][0] != '-')
            continue;
        switch (argv[i][1]) {
        case 'm':
            chunk = atoi(argv[++i]);
            break;
        default:
            fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
    }

    sb = (int *) malloc(size * chunk * sizeof(int));
    if (!sb) {
        perror("can't allocate send buffer");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    rb = (int *) malloc(size * chunk * sizeof(int));
    if (!rb) {
        perror("can't allocate recv buffer");
        free(sb);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    for (i = 0; i < size * chunk; ++i) {
        sb[i] = rank + 1;
        rb[i] = 0;
    }

    /* fputs("Before MPI_Alltoall\n",stdout); */

    /* This should really send MPI_CHAR, but since sb and rb were allocated
     * as chunk*size*sizeof(int), the buffers are large enough */
    status = MPI_Alltoall(sb, chunk, MPI_INT, rb, chunk, MPI_INT, MPI_COMM_WORLD);

    /* fputs("Before MPI_Allreduce\n",stdout); */

    MTest_Finalize(status);
    free(sb);
    free(rb);
    MPI_Finalize();
    return MTestReturnValue(status);
}
void mpi_alltoall(void *sendbuf, MPI_Fint *sendcount, MPI_Fint *sendtype,
                  void *recvbuf, MPI_Fint *recvcount, MPI_Fint *recvtype,
                  MPI_Fint *comm, MPI_Fint *__ierr)
{
    *__ierr = MPI_Alltoall(sendbuf, *sendcount, MPI_Type_f2c(*sendtype),
                           recvbuf, *recvcount, MPI_Type_f2c(*recvtype),
                           MPI_Comm_f2c(*comm));
}
int ReAllocateRasterBlock(void *SendBuf, int SendCount, MPI_Datatype SendType,
                          void *RecvBuf, int RecvCount, MPI_Datatype RecvType,
                          MPI_Comm Comm)
{
    return MPI_Alltoall(SendBuf, SendCount, SendType,
                        RecvBuf, RecvCount, RecvType, Comm);
}
static void all_to_all(const communicator& comm,
                       const std::vector<T>& in, std::vector<T>& out, int n = 1)
{
    // NB: this will fail if T is a vector
    MPI_Alltoall(Datatype::address(const_cast<T&>(in[0])), n, Datatype::datatype(),
                 Datatype::address(out[0]), n, Datatype::datatype(),
                 comm);
}
int kmr_exchange_sizes(KMR *mr, long *sbuf, long *rbuf)
{
    MPI_Comm comm = mr->comm;
    int cc;
    cc = MPI_Alltoall(sbuf, 1, MPI_LONG, rbuf, 1, MPI_LONG, comm);
    assert(cc == MPI_SUCCESS);
    return MPI_SUCCESS;
}
FC_FUNC( mpi_alltoall , MPI_ALLTOALL )
    (void *sendbuf, int *sendcount, int *sendtype,
     void *recvbuf, int *recvcount, int *recvtype,
     int *comm, int *ierror)
{
    *ierror = MPI_Alltoall(sendbuf, *sendcount, *sendtype,
                           recvbuf, *recvcount, *recvtype, *comm);
}
int binGraph::exchange_edges(uint64_t m_read, uint64_t* read_edges,
                             int32_t* ranks, etype t)
{
    int32_t* scounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t));
    int32_t* rcounts = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t));
    int32_t* sdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t));
    int32_t* sdispls_cpy = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t));
    int32_t* rdispls = (int32_t*)malloc(PCU_Comm_Peers()*sizeof(int32_t));
    for (int i = 0; i < PCU_Comm_Peers(); ++i) {
        scounts[i] = 0;
        rcounts[i] = 0;
        sdispls[i] = 0;
        sdispls_cpy[i] = 0;
        rdispls[i] = 0;
    }

    uint64_t n_per_rank = num_global_verts / PCU_Comm_Peers() + 1;

    /* count how many edge endpoints will be sent to each task */
    for (uint64_t i = 0; i < m_read*2; i += 2) {
        uint64_t vert = read_edges[i];
        int vert_task = ranks[vert];
        scounts[vert_task] += 2;
    }

    /* exchange send counts to learn the receive counts */
    MPI_Alltoall(scounts, 1, MPI_INT32_T, rcounts, 1, MPI_INT32_T, PCU_Get_Comm());

    for (int i = 1; i < PCU_Comm_Peers(); ++i) {
        sdispls[i] = sdispls[i-1] + scounts[i-1];
        sdispls_cpy[i] = sdispls[i];
        rdispls[i] = rdispls[i-1] + rcounts[i-1];
    }

    int32_t total_send = sdispls[PCU_Comm_Peers()-1] + scounts[PCU_Comm_Peers()-1];
    int32_t total_recv = rdispls[PCU_Comm_Peers()-1] + rcounts[PCU_Comm_Peers()-1];
    uint64_t* sendbuf = (uint64_t*)malloc(total_send*sizeof(uint64_t));
    edge_list[t] = (uint64_t*)malloc(total_recv*sizeof(uint64_t));
    num_local_edges[t] = total_recv / 2;

    /* pack edges by destination task */
    for (uint64_t i = 0; i < m_read*2; i += 2) {
        uint64_t vert1 = read_edges[i];
        uint64_t vert2 = read_edges[i+1];
        int vert_task = ranks[vert1];
        sendbuf[sdispls_cpy[vert_task]++] = vert1;
        sendbuf[sdispls_cpy[vert_task]++] = vert2;
    }

    MPI_Alltoallv(sendbuf, scounts, sdispls, MPI_UINT64_T,
                  edge_list[t], rcounts, rdispls, MPI_UINT64_T, PCU_Get_Comm());
    free(sendbuf);

    return 0;
}
void transpose(Real** recv, Real** send)
{
    int i;
    for (i = 0; i < mpi_work; i++) {
        MPI_Alltoall(send[i], mpi_work, MPI_DOUBLE,
                     recv[i], mpi_work, MPI_DOUBLE, MPI_COMM_WORLD);
    }
    for (i = 0; i < mpi_size; i++) {
        local_transpose(recv, i*mpi_work);
    }
}
double measure_delayed_Alltoall(int send_count, MPI_Datatype send_dt,
                                int recv_count, MPI_Datatype recv_dt,
                                double delay, int node)
{
    double start_time, end_time;

    start_time = start_synchronization();
    if (get_measurement_rank() == node)
        while (wtime() < start_time + delay)
            ;
    MPI_Alltoall(get_send_buffer(), send_count, send_dt,
                 get_recv_buffer(), recv_count, recv_dt,
                 get_measurement_comm());
    end_time = stop_synchronization();
    return end_time - start_time;
}
// Given how many numbers each process is sending to the other processes, find
// out how many numbers you are receiving from each process. This function
// returns an array of counts indexed on the rank of the process from which it
// will receive the numbers.
int *get_recv_amounts_per_proc(int *send_amounts_per_proc, int world_size)
{
    int *recv_amounts_per_proc = (int *)malloc(sizeof(int) * world_size);

    // Perform an Alltoall for the send counts. This will send the send counts
    // from each process and place them in the recv_amounts_per_proc array of
    // the receiving processes to let them know how many numbers they will
    // receive when binning occurs.
    MPI_Alltoall(send_amounts_per_proc, 1, MPI_INT,
                 recv_amounts_per_proc, 1, MPI_INT, MPI_COMM_WORLD);
    return recv_amounts_per_proc;
}
void timing_basic_alltoall_nelements(int DIM1, int procs, int loop,
                                     char* testname, MPI_Comm local_communicator)
{
    float* send_array;
    float* recv_array;

    int myrank;
    int base, typesize, bytes, i;
    char method[50];

    send_array = malloc(DIM1 * procs * sizeof(float));
    recv_array = malloc(DIM1 * procs * sizeof(float));

    MPI_Comm_rank(local_communicator, &myrank);

    base = myrank * DIM1 + 1;
    utilities_fill_unique_array_1D_float(&send_array[0], DIM1, base);

    if (myrank == 0) {
        snprintf(method, 50, "reference");
        MPI_Type_size(MPI_FLOAT, &typesize);
        bytes = typesize * DIM1 * procs;
        timing_init(testname, &method[0], bytes);
    }

    for (i = 0; i < loop; i++) {
        MPI_Alltoall(&send_array[0], DIM1, MPI_FLOAT,
                     &recv_array[0], DIM1, MPI_FLOAT, local_communicator);
        MPI_Alltoall(&recv_array[0], DIM1, MPI_FLOAT,
                     &send_array[0], DIM1, MPI_FLOAT, local_communicator);
        if (myrank == 0) {
            timing_record(3);
        }
    }

    if (myrank == 0) {
        timing_print(1);
    }

    free(send_array);
    free(recv_array);
}
HYPRE_Int
hypre_MPI_Alltoall( void               *sendbuf,
                    HYPRE_Int           sendcount,
                    hypre_MPI_Datatype  sendtype,
                    void               *recvbuf,
                    HYPRE_Int           recvcount,
                    hypre_MPI_Datatype  recvtype,
                    hypre_MPI_Comm      comm )
{
    return (HYPRE_Int) MPI_Alltoall(sendbuf, (hypre_int)sendcount, sendtype,
                                    recvbuf, (hypre_int)recvcount, recvtype, comm);
}
char *
avtSamplePointCommunicator::CommunicateMessages(char **sendmessages,
                                                int   *sendcount,
                                                char **recvmessages,
                                                int   *recvcount)
{
#ifdef PARALLEL
    //
    // Figure out how much each processor needs to send/receive.
    //
    MPI_Alltoall(sendcount, 1, MPI_INT, recvcount, 1, MPI_INT, VISIT_MPI_COMM);

    //
    // Create a buffer we can receive into.
    //
    char *recvConcatList = CreateMessageStrings(recvmessages, recvcount, numProcs);

    //
    // Calculate the displacement lists.
    //
    int *senddisp = new int[numProcs];
    int *recvdisp = new int[numProcs];
    senddisp[0] = 0;
    recvdisp[0] = 0;
    for (int i = 1 ; i < numProcs ; i++)
    {
        senddisp[i] = senddisp[i-1] + sendcount[i-1];
        recvdisp[i] = recvdisp[i-1] + recvcount[i-1];
    }

    //
    // Do the actual transfer of sample points. The messages arrays are
    // actually indexes into one big array. Since MPI expects that big
    // array, give that (which is at location 0).
    //
    MPI_Alltoallv(sendmessages[0], sendcount, senddisp, MPI_CHAR,
                  recvmessages[0], recvcount, recvdisp, MPI_CHAR,
                  VISIT_MPI_COMM);

    delete [] senddisp;
    delete [] recvdisp;

    //
    // We need to return this buffer so the calling function can delete it.
    //
    return recvConcatList;
#else
    return 0;
#endif
}
int main(int argc, char **argv)
{
    int *out, *in, j, k;
    int me, tasks, i, errcount = 0;
    double start, end, diff, avg_diff_usec;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &tasks);
    if (tasks < 2) {
        printf("MUST RUN WITH AT LEAST 2 TASKS\n");
        errcount++;
        MPI_Finalize();
        exit(0);
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    out = (int *)calloc(tasks, sizeof(int));
    in  = (int *)calloc(tasks, sizeof(int));
    for (i = 0; i < tasks; i++)
        out[i] = me;

    MPI_Barrier(MPI_COMM_WORLD);
    if (!me) {
        start = MPI_Wtime();
    }

    for (i = 0; i < ALLTOALL_COUNT; i++)
        MPI_Alltoall(out, 1, MPI_INT, in, 1, MPI_INT, MPI_COMM_WORLD);

    if (!me) {
        end = MPI_Wtime();
        diff = end - start;
        avg_diff_usec = diff * (1000000/ALLTOALL_COUNT);
        printf("AFTER ALLTOALLS, START TIME = %f, END TIME = %f, DIFF (sec) = %f,\n",
               start, end, diff);
        printf("\t\tITERS = %d, AVG (usec) = %f, EXPECTED = %d\n",
               ALLTOALL_COUNT, avg_diff_usec, EXPECTED_AVG_uSEC);
        if (avg_diff_usec < EXPECTED_AVG_uSEC) {
            printf("Passed\n");
        } else if (avg_diff_usec < (2 * EXPECTED_AVG_uSEC)) {
            printf("Acceptable\n");
        } else {
            printf("SLOW\n");
        }
        fflush(stdout);
    }

    MPI_Finalize();
    return 0;
}
// Does a global transposition from sendbuffer to recvbuffer of count data
// (in BYTE). Be careful with 2GB limits.
QMP_status_t QMP_comm_alltoall_mpi(QMP_comm_t comm, char* recvbuffer,
                                   char* sendbuffer, int count)
{
    QMP_status_t status = QMP_SUCCESS;
    ENTER;

    int err = MPI_Alltoall((void*)sendbuffer, count, MPI_BYTE,
                           (void*)recvbuffer, count, MPI_BYTE,
                           comm->mpicomm);
    if (err != MPI_SUCCESS)
        status = err;

    LEAVE;
    return status;
}
// ===========================================================================
// ===========================================================================
void test_mpi_alltoall(int rank, int num_cores)
{
    int           i;
    unsigned int  time[10];
    int           *buf_in;
    int           *buf_out;

    buf_in  = kt_malloc(512 * 4);
    buf_out = kt_malloc(512 * 4);

    for (i = 0; i < 512 * 1; i++) {
        buf_in[i] = 0;
        buf_out[i] = 0;
    }

    for (i = 0; i < 10; i++) {
        if (!rank) {
            kt_printf("entering %d\r\n", i);
        }
        MPI_Barrier(MPI_COMM_WORLD);

        ar_timer_reset();
        MPI_Alltoall(buf_out, 1, MPI_INT, buf_in, 1, MPI_INT, MPI_COMM_WORLD);
        time[i] = ar_timer_get_cycles();

        MPI_Barrier(MPI_COMM_WORLD);
        if (!rank) {
            kt_printf("done %d\r\n", i);
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }

    kt_free(buf_in);
    kt_free(buf_out);

    if (!rank) {
        kt_printf("Alltoall time = %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d\r\n"
                  "                %12d cycles\r\n",
                  time[0], time[1], time[2], time[3], time[4],
                  time[5], time[6], time[7], time[8], time[9]);
    }
}
static void p_fill_ineed_ptrs(
    sptensor_t const * const tt,
    idx_t const mode,
    rank_info * const rinfo,
    MPI_Comm const comm)
{
    idx_t const m = mode;
    int size;
    int rank;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    rinfo->nlocal2nbr[m] = 0;
    rinfo->local2nbr_ptr[m] = (int *) calloc((size+1), sizeof(int));
    rinfo->nbr2globs_ptr[m] = (int *) splatt_malloc((size+1) * sizeof(int));

    int * const local2nbr_ptr = rinfo->local2nbr_ptr[m];
    int * const nbr2globs_ptr = rinfo->nbr2globs_ptr[m];
    idx_t const * const mat_ptrs = rinfo->mat_ptrs[m];

    int pdest = 0;

    /* count recvs for each process */
    for(idx_t i=0; i < tt->dims[m]; ++i) {
        /* grab global index */
        idx_t const gi = (tt->indmap[m] == NULL) ? i : tt->indmap[m][i];

        /* move to the next processor if necessary */
        while(gi >= mat_ptrs[pdest+1]) {
            ++pdest;
        }

        assert(pdest < size);
        assert(gi >= mat_ptrs[pdest]);
        assert(gi < mat_ptrs[pdest+1]);

        /* if it is non-local */
        if(pdest != rank) {
            local2nbr_ptr[pdest] += 1;
            rinfo->nlocal2nbr[m] += 1;
        }
    }

    /* communicate local2nbr and receive nbr2globs */
    MPI_Alltoall(local2nbr_ptr, 1, MPI_INT, nbr2globs_ptr, 1, MPI_INT, comm);

    rinfo->nnbr2globs[m] = 0;
    for(int p=0; p < size; ++p) {
        rinfo->nnbr2globs[m] += nbr2globs_ptr[p];
    }
    nbr2globs_ptr[size] = rinfo->nnbr2globs[m];
}
/**
 * Opens channel and initializes all fields of the channel_t structure.
 */
void channel_open(channel_t * ch, int direction, int *swap_me)
{
    direction = (direction != 0);  // Clamps direction flag to {0, 1}.

    if (channel_registerTag(ch->tag))
        error("channel_open(%s): tag '%d' is used already.", ch->name, ch->tag);

    int *swap_they = (int *) calloc(cpu_total, sizeof(int));                   // Allocates second exchange list.
    MPI_Alltoall(swap_me, 1, MPI_INT, swap_they, 1, MPI_INT, MPI_COMM_WORLD);  // Exchanges by invitations.

    channel_allocateSide(ch, direction, swap_me);                              // Allocates two arrays of sockets/requests.
    channel_allocateSide(ch, 1 - direction, swap_they);

    free(swap_they);  // Returns memory.
    ch->open = 1;     // Marks success.
}
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int i, myrank, numranks, groupsize;
    int dims[3] = {0, 0, 0};
    int temp[3] = {0, 0, 0};
    int coord[3] = {0, 0, 0};
    int periods[3] = {1, 1, 1};
    double startTime, stopTime;

    MPI_Comm cartcomm, subcomm;

    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &numranks);

    dims[MP_X] = atoi(argv[1]);
    dims[MP_Y] = atoi(argv[2]);
    dims[MP_Z] = atoi(argv[3]);

    MPI_Dims_create(numranks, 3, dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 1, &cartcomm);
    MPI_Cart_get(cartcomm, 3, dims, periods, coord);

    temp[MP_X] = 0;
    temp[MP_Y] = 1;
    temp[MP_Z] = 0;
    MPI_Cart_sub(cartcomm, temp, &subcomm);
    MPI_Comm_size(subcomm, &groupsize);

    int perrank = atoi(argv[4]);
    char *sendbuf = (char*)malloc(perrank*groupsize);
    char *recvbuf = (char*)malloc(perrank*groupsize);

    MPI_Barrier(cartcomm);
    MPI_Pcontrol(1);
    startTime = MPI_Wtime();

    for (i = 0; i < MAX_ITER; i++) {
        MPI_Alltoall(sendbuf, perrank, MPI_CHAR, recvbuf, perrank, MPI_CHAR, subcomm);
    }

    MPI_Barrier(cartcomm);
    stopTime = MPI_Wtime();
    MPI_Pcontrol(0);

    if (myrank == 0) {
        printf("Completed %d iterations for subcom size %d, perrank %d\n",
               i, groupsize, perrank);
        printf("Time elapsed: %f\n", stopTime - startTime);
    }

    MPI_Finalize();
    return 0;
}
int main(int argc, char *argv[])
{
    int myid, size;
    int *each_vector, data_id, send_count;
    int *recv_vector, recv_count;
    //char output_msg[16];
    int i, c;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // print size by root
    //if (myid == 0) {
    //    printf("Process Count: %d\n", size);
    //}

    for (c = 0; c < 10; c++) {
        send_count = size * (c+1);
        recv_count = size * (c+1);

        // init vector on each process
        each_vector = (int*)malloc(send_count * sizeof(int));
        data_id = myid * 1000;
        for (i = 0; i < send_count; i++) {
            each_vector[i] = data_id + i;
        }

        // print vector
        //snprintf(output_msg, 16, "Rank[%d]: ", myid);
        //print_array(each_vector, send_count, output_msg);

        // init recv vector
        recv_vector = (int*)malloc(recv_count * sizeof(int));
        for (i = 0; i < recv_count; i++) {
            recv_vector[i] = -1;
        }

        // do alltoall
        MPI_Alltoall(each_vector, c+1, MPI_INT, recv_vector, c+1, MPI_INT, MPI_COMM_WORLD);

        //print_array(recv_vector, recv_count, output_msg);

        free(each_vector);
        free(recv_vector);
    }

    MPI_Finalize();
    return 0;
}