static void action_Isend(const char *const *action)
{
  int to = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;

  if (action[4])
    MPI_CURRENT_TYPE = decode_datatype(action[4]);
  else
    MPI_CURRENT_TYPE = MPI_DEFAULT_TYPE;

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  TRACE_smpi_computing_out(rank);
  int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to);
  TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__);
  TRACE_smpi_send(rank, rank, dst_traced);
#endif

  request = smpi_mpi_isend(NULL, size, MPI_CURRENT_TYPE, to, 0, MPI_COMM_WORLD);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__);
  request->send = 1;
  TRACE_smpi_computing_in(rank);
#endif

  xbt_dynar_push(reqq[smpi_comm_rank(MPI_COMM_WORLD)], &request);

  log_timed_action(action, clock);
}
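/* Note (assumption, inferred from the indices parsed above rather than stated in
 * this file): the replay trace line driving action_Isend/action_Irecv is expected
 * to look roughly like
 *     <rank> Isend <peer> <size> [<datatype-id>]
 * e.g. "0 Isend 1 1e6", so action[2] is the peer rank, action[3] the message
 * size, and action[4] an optional datatype code handled by decode_datatype().
 * The exact trace syntax should be checked against the replay documentation. */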
static void action_Irecv(const char *const *action)
{
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;
  smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data();

  if (action[4])
    MPI_CURRENT_TYPE = decode_datatype(action[4]);
  else
    MPI_CURRENT_TYPE = MPI_DEFAULT_TYPE;

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  request = smpi_mpi_irecv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  request->recv = 1;
#endif

  xbt_dynar_push(globals->irecvs, &request);
  xbt_dynar_push(reqq[smpi_comm_rank(MPI_COMM_WORLD)], &request);

  log_timed_action(action, clock);
}
// Allgather - Non-Topology-Specific Logical Ring algorithm (non-blocking variant)
int smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
                                       void *rbuf, int rcount, MPI_Datatype rtype,
                                       MPI_Comm comm)
{
  MPI_Aint rextent, sextent;
  MPI_Status status, status2;
  int i, to, from, rank, size;
  int send_offset, recv_offset;
  int tag = COLL_TAG_ALLGATHER;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);

  // irregular case: fall back to the default MPI allgather
  if (scount * sextent != rcount * rextent) {
    XBT_WARN("MPI_allgather_NTSLR_NB uses default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  // request arrays are allocated only once the regular path is taken,
  // so the fallback above does not leak them
  MPI_Request *rrequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));
  MPI_Request *srequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));

  // topology non-specific
  to = (rank + 1) % size;
  from = (rank + size - 1) % size;

  // copy a single segment from sbuf to rbuf
  send_offset = rank * scount * sextent;

  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                    (char *)rbuf + send_offset, rcount, rtype, rank, tag,
                    comm, &status);

  // start sending the logical-ring messages
  int increment = scount * sextent;

  // post all irecvs first
  for (i = 0; i < size - 1; i++) {
    recv_offset = ((rank - i - 1 + size) % size) * increment;
    rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount, rtype,
                                       from, tag + i, comm);
  }

  for (i = 0; i < size - 1; i++) {
    send_offset = ((rank - i + size) % size) * increment;
    srequest_array[i] = smpi_mpi_isend((char *)rbuf + send_offset, scount, stype,
                                       to, tag + i, comm);
    smpi_mpi_wait(&rrequest_array[i], &status);
    smpi_mpi_wait(&srequest_array[i], &status2);
  }

  free(rrequest_array);
  free(srequest_array);

  return MPI_SUCCESS;
}
int smpi_coll_tuned_reduce_ompi_binary( void *sendbuf, void *recvbuf,
                                        int count, MPI_Datatype datatype,
                                        MPI_Op op, int root,
                                        MPI_Comm comm)
{
    uint32_t segsize;
    int segcount = count;
    size_t typelng;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = smpi_datatype_size( datatype );

    // Binary_32K
    segsize = 32*1024;

    XBT_DEBUG("coll:tuned:reduce_intra_binary rank %d ss %5d",
              smpi_comm_rank(comm), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype,
                                                op, root, comm,
                                                ompi_coll_tuned_topo_build_tree(2, comm, root),
                                                segcount, 0);
}
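/* For reference: COLL_TUNED_COMPUTED_SEGCOUNT turns a target segment size in
 * bytes into a segment count in elements. A minimal sketch of the usual Open MPI
 * definition is given below (assumption: the macro used here follows the same
 * logic; the authoritative text lives in the local colls_private.h):
 *
 *   #define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)      \
 *       if (((SEGSIZE) >= (TYPELNG)) &&                                   \
 *           ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT)))) {                     \
 *           size_t residual;                                              \
 *           (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG));                    \
 *           residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG);                \
 *           if (residual > ((TYPELNG) >> 1))                              \
 *               (SEGCOUNT)++;                                             \
 *       }
 */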
int smpi_coll_tuned_bcast_flattree(void *buff, int count, MPI_Datatype data_type,
                                   int root, MPI_Comm comm)
{
  MPI_Request *req_ptr;
  MPI_Request *reqs;

  int i, rank, num_procs;
  int tag = 1;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if (rank != root) {
    smpi_mpi_recv(buff, count, data_type, root, tag, comm, MPI_STATUS_IGNORE);
  } else {
    reqs = (MPI_Request *) xbt_malloc((num_procs - 1) * sizeof(MPI_Request));
    req_ptr = reqs;

    // Root sends data to all others
    for (i = 0; i < num_procs; i++) {
      if (i == rank)
        continue;
      *(req_ptr++) = smpi_mpi_isend(buff, count, data_type, i, tag, comm);
    }

    // wait on all requests
    smpi_mpi_waitall(num_procs - 1, reqs, MPI_STATUSES_IGNORE);

    free(reqs);
  }
  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: allgather_ring
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j + i.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_allgather_ring(void *send_buff, int send_count,
                                   MPI_Datatype send_type, void *recv_buff,
                                   int recv_count, MPI_Datatype recv_type,
                                   MPI_Comm comm)
{
  MPI_Aint extent;
  int i, src, dst, rank, num_procs;
  int tag = 1;
  MPI_Status status;

  char *sendptr = (char *) send_buff;
  char *recvptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  // local send/recv
  smpi_mpi_sendrecv(sendptr, send_count, send_type, rank, tag,
                    recvptr + rank * recv_count * extent, recv_count,
                    recv_type, rank, tag, comm, &status);

  for (i = 1; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;
    smpi_mpi_sendrecv(sendptr, send_count, send_type, dst, tag,
                      recvptr + src * recv_count * extent, recv_count,
                      recv_type, src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: alltoallv_pair_mpi_barrier
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_counts: number of elements to send to each process
 *   send_disps: displacements into send_buff
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_counts: number of elements to receive from each process
 *   recv_disps: displacements into recv_buff
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works when P is a power of two. In each of the P - 1
 *         phases, paired nodes exchange their data. An MPI barrier is
 *         inserted between every two phases.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoallv_pair_mpi_barrier(void *send_buff, int *send_counts, int *send_disps,
                                               MPI_Datatype send_type,
                                               void *recv_buff, int *recv_counts, int *recv_disps,
                                               MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = 101;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    smpi_mpi_barrier(comm);
    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst],
                      send_type, dst, tag,
                      recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src],
                      recv_type, src, tag, comm, &s);
  }

  return MPI_SUCCESS;
}
int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm)
{
    int rank, size;
    int distance, to, from;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send message to lower ranked node */
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to, COLL_TAG_BARRIER,
                          NULL, 0, MPI_BYTE, from, COLL_TAG_BARRIER,
                          comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}
static void action_send(const char *const *action)
{
  int to = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  TRACE_smpi_computing_out(rank);
  int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to);
  TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__);
  TRACE_smpi_send(rank, rank, dst_traced);
#endif

  smpi_mpi_send(NULL, size, MPI_BYTE, to, 0, MPI_COMM_WORLD);

  if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)) {
    char *name = xbt_str_join_array(action, " ");
    XBT_VERB("%s %f", name, smpi_process_simulated_elapsed() - clock);
    free(name);
  }

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__);
  TRACE_smpi_computing_in(rank);
#endif
}
static void action_Irecv(const char *const *action)
{
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;
  smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data();

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  request = smpi_mpi_irecv(NULL, size, MPI_BYTE, from, 0, MPI_COMM_WORLD);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  request->recv = 1;
#endif

  xbt_dynar_push(globals->irecvs, &request);

  //TODO do the asynchronous cleanup
  if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)) {
    char *name = xbt_str_join_array(action, " ");
    XBT_VERB("%s %f", name, smpi_process_simulated_elapsed() - clock);
    free(name);
  }
}
static void action_recv(const char *const *action)
{
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Status status;

  if (action[4])
    MPI_CURRENT_TYPE = decode_datatype(action[4]);
  else
    MPI_CURRENT_TYPE = MPI_DEFAULT_TYPE;

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_computing_out(rank);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  smpi_mpi_recv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD, &status);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  TRACE_smpi_recv(rank, src_traced, rank);
  TRACE_smpi_computing_in(rank);
#endif

  log_timed_action(action, clock);
}
/*****************************************************************************
 * Function: alltoall_ring
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j + i.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoall_ring(void *send_buff, int send_count,
                                  MPI_Datatype send_type,
                                  void *recv_buff, int recv_count,
                                  MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALL;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  for (i = 0; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst, tag,
                      recv_ptr + src * recv_chunk, recv_count, recv_type, src, tag,
                      comm, &s);
  }

  return MPI_SUCCESS;
}
static void action_wait(const char *const *action)
{
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;
  MPI_Status status;
  smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data();

  xbt_assert(xbt_dynar_length(globals->irecvs),
             "action wait not preceded by any irecv: %s",
             xbt_str_join_array(action, " "));
  request = xbt_dynar_pop_as(globals->irecvs, MPI_Request);

#ifdef HAVE_TRACING
  int rank = request && request->comm != MPI_COMM_NULL
             ? smpi_comm_rank(request->comm)
             : -1;
  TRACE_smpi_computing_out(rank);

  MPI_Group group = smpi_comm_group(request->comm);
  int src_traced = smpi_group_rank(group, request->src);
  int dst_traced = smpi_group_rank(group, request->dst);
  int is_wait_for_receive = request->recv;
  TRACE_smpi_ptp_in(rank, src_traced, dst_traced, __FUNCTION__);
#endif

  smpi_mpi_wait(&request, &status);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, dst_traced, __FUNCTION__);
  if (is_wait_for_receive) {
    TRACE_smpi_recv(rank, src_traced, dst_traced);
  }
  TRACE_smpi_computing_in(rank);
#endif

  log_timed_action(action, clock);
}
/*
 * scatter_intra
 *
 * Function: - basic scatter operation
 * Accepts:  - same arguments as MPI_Scatter()
 * Returns:  - MPI_SUCCESS or error code
 */
int smpi_coll_tuned_scatter_ompi_basic_linear(void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void *rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              int root,
                                              MPI_Comm comm)
{
    int i, rank, size, err;
    char *ptmp;
    ptrdiff_t lb, incr;

    /* Initialize */
    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    /* If not root, receive data. */
    if (rank != root) {
        smpi_mpi_recv(rbuf, rcount, rdtype, root,
                      COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
        return MPI_SUCCESS;
    }

    /* I am the root, loop sending data. */
    err = smpi_datatype_extent(sdtype, &lb, &incr);
    if (MPI_SUCCESS != err) {
        return MPI_ERR_OTHER;
    }

    incr *= scount;
    for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
        /* simple optimization */
        if (i == rank) {
            if (MPI_IN_PLACE != rbuf) {
                err = smpi_datatype_copy(ptmp, scount, sdtype,
                                         rbuf, rcount, rdtype);
            }
        } else {
            smpi_mpi_send(ptmp, scount, sdtype, i, COLL_TAG_SCATTER, comm);
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */
    return MPI_SUCCESS;
}
int smpi_coll_tuned_reduce_flat_tree(void *sbuf, void *rbuf, int count,
                                     MPI_Datatype dtype, MPI_Op op,
                                     int root, MPI_Comm comm)
{
  int i, tag = 4321;
  int size;
  int rank;
  MPI_Aint extent;
  char *origin = 0;
  char *inbuf;
  MPI_Status status;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* If not root, send data to the root. */
  extent = smpi_datatype_get_extent(dtype);

  if (rank != root) {
    smpi_mpi_send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive messages. */
  if (size > 1)
    origin = (char *) xbt_malloc(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    smpi_mpi_sendrecv(sbuf, count, dtype, rank, tag,
                      rbuf, count, dtype, rank, tag, comm, &status);
  else
    smpi_mpi_recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */
  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = (char *) sbuf;
    else {
      smpi_mpi_recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    smpi_op_apply(op, inbuf, rbuf, &count, &dtype);
  }

  if (origin)
    free(origin);

  /* All done */
  return 0;
}
int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
                                MPI_Datatype sdtype,
                                void *rbuf, int rcount,
                                MPI_Datatype rdtype,
                                int root,
                                MPI_Comm comm)
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    // Determine block size
    if (rank == root) {
        dsize = smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    } else {
        dsize = smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/
/*    } else*/
    if (block_size > intermediate_block_size) {
        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        root, comm);
    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm);
    }
    // Otherwise, use basic linear
    return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm);
}
static void action_waitall(const char *const *action)
{
  double clock = smpi_process_simulated_elapsed();
  int count_requests = 0;
  unsigned int i = 0;

  count_requests = xbt_dynar_length(reqq[smpi_comm_rank(MPI_COMM_WORLD)]);

  if (count_requests > 0) {
    MPI_Request requests[count_requests];
    MPI_Status status[count_requests];

    /* The reqq is an array of dynars. Its index corresponds to the rank.
       Thus each rank saves its own requests to the array requests. */
    xbt_dynar_foreach(reqq[smpi_comm_rank(MPI_COMM_WORLD)], i, requests[i]);

#ifdef HAVE_TRACING
    // save information from requests
    xbt_dynar_t srcs = xbt_dynar_new(sizeof(int), NULL);
    xbt_dynar_t dsts = xbt_dynar_new(sizeof(int), NULL);
    xbt_dynar_t recvs = xbt_dynar_new(sizeof(int), NULL);

    for (i = 0; i < count_requests; i++) {
      if (requests[i]) {
        int *asrc = xbt_new(int, 1);
        int *adst = xbt_new(int, 1);
        int *arecv = xbt_new(int, 1);
        *asrc = requests[i]->src;
        *adst = requests[i]->dst;
        *arecv = requests[i]->recv;
        xbt_dynar_insert_at(srcs, i, asrc);
        xbt_dynar_insert_at(dsts, i, adst);
        xbt_dynar_insert_at(recvs, i, arecv);
        xbt_free(asrc);
        xbt_free(adst);
        xbt_free(arecv);
      } else {
        int *t = xbt_new(int, 1);
        xbt_dynar_insert_at(srcs, i, t);
        xbt_dynar_insert_at(dsts, i, t);
        xbt_dynar_insert_at(recvs, i, t);
        xbt_free(t);
      }
    }

    /* The remainder of this handler was truncated in the source dump; what
       follows is a minimal reconstruction of the usual control flow: wait on
       all queued requests, replay the receive trace events saved above, free
       the bookkeeping dynars, and clear the request queue. */
    int rank_traced = smpi_comm_rank(MPI_COMM_WORLD);
    TRACE_smpi_computing_out(rank_traced);
    TRACE_smpi_ptp_in(rank_traced, -1, -1, __FUNCTION__);
#endif

    smpi_mpi_waitall(count_requests, requests, status);

#ifdef HAVE_TRACING
    for (i = 0; i < count_requests; i++) {
      int src_traced, dst_traced, is_wait_for_receive;
      xbt_dynar_get_cpy(srcs, i, &src_traced);
      xbt_dynar_get_cpy(dsts, i, &dst_traced);
      xbt_dynar_get_cpy(recvs, i, &is_wait_for_receive);
      if (is_wait_for_receive)
        TRACE_smpi_recv(rank_traced, src_traced, dst_traced);
    }
    TRACE_smpi_ptp_out(rank_traced, -1, -1, __FUNCTION__);
    xbt_dynar_free(&srcs);
    xbt_dynar_free(&dsts);
    xbt_dynar_free(&recvs);
    TRACE_smpi_computing_in(rank_traced);
#endif

    xbt_dynar_reset(reqq[smpi_comm_rank(MPI_COMM_WORLD)]);
  }

  log_timed_action(action, clock);
}
/**
 * Alltoall basic_linear (STARMPI: alltoall-simple)
 **/
int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
                                          MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount,
                                          MPI_Datatype recvtype, MPI_Comm comm)
{
  int system_tag = 888;
  int i, rank, size, err, count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  smpi_datatype_extent(sendtype, &lb, &sendext);
  smpi_datatype_extent(recvtype, &lb, &recvext);

  /* simple optimization */
  err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, sendcount, sendtype,
                           (char *)recvbuf + rank * recvcount * recvext, recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));

    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] = smpi_irecv_init((char *)recvbuf + i * recvcount * recvext,
                                        recvcount, recvtype, i, system_tag, comm);
      count++;
    }

    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] = smpi_isend_init((char *)sendbuf + i * sendcount * sendext,
                                        sendcount, sendtype, i, system_tag, comm);
      count++;
    }

    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUSES_IGNORE);
    for (i = 0; i < count; i++) {
      if (requests[i] != MPI_REQUEST_NULL)
        smpi_mpi_request_free(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
/*
 * Simple double ring version of barrier
 *
 * The synchronization guarantee is provided by the last ring of sends,
 * which are synchronous.
 */
int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm)
{
    int rank, size;
    int left, right;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);

    left = ((rank - 1 + size) % size);
    right = ((rank + 1) % size);

    if (rank > 0) { /* receive message from the left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* Send message to the right */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                  COLL_TAG_BARRIER, comm);

    /* root needs to receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* send message to the right one */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                  COLL_TAG_BARRIER, comm);

    /* rank 0 posts receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}
int smpi_coll_tuned_barrier_mvapich2_pair(MPI_Comm comm)
{
    int size, rank;
    int d, dst, src;
    int mpi_errno = MPI_SUCCESS;

    size = smpi_comm_size(comm);
    /* Trivial barriers return immediately */
    if (size == 1)
        return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);
    int N2_prev = 1;
    /* N2_prev = greatest power of two <= size of Comm */
    for (N2_prev = 1; N2_prev <= size; N2_prev <<= 1);
    N2_prev >>= 1;

    int surfeit = size - N2_prev;

    /* Perform a combine-like operation */
    if (rank < N2_prev) {
        if (rank < surfeit) {
            /* get the fanin letter from the upper "half" process: */
            dst = N2_prev + rank;
            smpi_mpi_recv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                          comm, MPI_STATUS_IGNORE);
        }

        /* combine on embedded N2_prev power-of-two processes */
        for (d = 1; d < N2_prev; d <<= 1) {
            dst = (rank ^ d);
            smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                              NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                              comm, MPI_STATUS_IGNORE);
        }

        /* fanout data to nodes above N2_prev... */
        if (rank < surfeit) {
            dst = N2_prev + rank;
            smpi_mpi_send(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER, comm);
        }
    } else {
        /* fanin data to power of 2 subset */
        src = rank - N2_prev;
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                          NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                          comm, MPI_STATUS_IGNORE);
    }

    return mpi_errno;
}
/*****************************************************************************
 * Function: allgather_spreading_simple
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Let i -> j denote the communication from node i to node j. The
 *         order of communications for node i is i -> i + 1, i -> i + 2, ...,
 *         i -> (i + p - 1) % P.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_allgather_spreading_simple(void *send_buff, int send_count,
                                               MPI_Datatype send_type,
                                               void *recv_buff, int recv_count,
                                               MPI_Datatype recv_type,
                                               MPI_Comm comm)
{
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;
  int i, src, dst, rank, num_procs, num_reqs;
  int tag = COLL_TAG_ALLGATHER;
  MPI_Status status;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  num_reqs = (2 * num_procs) - 2;
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));
  if (!reqs) {
    printf("allgather-spreading-simple.c:40: cannot allocate memory\n");
    MPI_Finalize();
    exit(0);
  }

  req_ptr = reqs;
  smpi_mpi_sendrecv(send_buff, send_count, send_type, rank, tag,
                    (char *) recv_buff + rank * recv_count * extent, recv_count,
                    recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++) {
    src = (rank + i) % num_procs;
    if (src == rank)
      continue;
    *(req_ptr++) = smpi_mpi_irecv(recv_ptr + src * recv_count * extent, recv_count,
                                  recv_type, src, tag, comm);
  }

  for (i = 0; i < num_procs; i++) {
    dst = (rank + i) % num_procs;
    if (dst == rank)
      continue;
    *(req_ptr++) = smpi_mpi_isend(send_buff, send_count, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
  free(reqs);

  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: alltoall_pair_light_barrier
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
 *         with node i ^ j. Light barriers are inserted between communications
 *         in different phases.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoall_pair_light_barrier(void *send_buff, int send_count,
                                                MPI_Datatype send_type,
                                                void *recv_buff, int recv_count,
                                                MPI_Datatype recv_type,
                                                MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALL; /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if ((num_procs & (num_procs - 1)))
    THROWF(arg_error, 0,
           "alltoall pair algorithm can't be used with a non power-of-two number of processes!");

  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  smpi_mpi_sendrecv(send_ptr + rank * send_chunk, send_count, send_type, rank, tag,
                    recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
                    comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst, tag,
                      recv_ptr + src * recv_chunk, recv_count, recv_type, src, tag,
                      comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                        &recv_sync, 1, MPI_CHAR, next_partner, tag,
                        comm, &s);
    }
  }

  return MPI_SUCCESS;
}
// Allgather - Non-Topology-Specific Logical Ring algorithm
int smpi_coll_tuned_allgather_NTSLR(void *sbuf, int scount, MPI_Datatype stype,
                                    void *rbuf, int rcount, MPI_Datatype rtype,
                                    MPI_Comm comm)
{
  MPI_Aint rextent, sextent;
  MPI_Status status;
  int i, to, from, rank, size;
  int send_offset, recv_offset;
  int tag = COLL_TAG_ALLGATHER;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);

  // irregular case: fall back to the default MPI allgather
  if (scount * sextent != rcount * rextent) {
    XBT_WARN("MPI_allgather_NTSLR uses default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  // topology non-specific
  to = (rank + 1) % size;
  from = (rank + size - 1) % size;

  // copy a single segment from sbuf to rbuf
  send_offset = rank * scount * sextent;

  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                    (char *)rbuf + send_offset, rcount, rtype, rank, tag,
                    comm, &status);

  // start sending the logical-ring messages
  int increment = scount * sextent;
  for (i = 0; i < size - 1; i++) {
    send_offset = ((rank - i + size) % size) * increment;
    recv_offset = ((rank - i - 1 + size) % size) * increment;
    smpi_mpi_sendrecv((char *) rbuf + send_offset, scount, stype, to, tag + i,
                      (char *) rbuf + recv_offset, rcount, rtype, from, tag + i,
                      comm, &status);
  }

  return MPI_SUCCESS;
}
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
{
    int rank, size, depth;
    int jump, partner;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_tree %d", rank);

    /* Find the nearest power of 2 of the communicator size. */
    for (depth = 1; depth < size; depth <<= 1);

    for (jump = 1; jump < depth; jump <<= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
            } else if (partner < rank) {
                smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm);
            }
        }
    }

    depth >>= 1;
    for (jump = depth; jump > 0; jump >>= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm);
            } else if (partner < rank) {
                smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
            }
        }
    }

    return MPI_SUCCESS;
}
/* special case for two processes */
int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm)
{
    int remote;

    remote = smpi_comm_rank(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);

    remote = (remote + 1) & 0x1;

    smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                      NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                      comm, MPI_STATUS_IGNORE);
    return (MPI_SUCCESS);
}
int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
{
    int i;
    int size = smpi_comm_size(comm);
    int rank = smpi_comm_rank(comm);

    /* All non-root send & receive zero-length message. */
    if (rank > 0) {
        smpi_mpi_send(NULL, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm);
        smpi_mpi_recv(NULL, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* The root collects and broadcasts the messages. */
    else {
        MPI_Request* requests;

        requests = (MPI_Request*)malloc(size * sizeof(MPI_Request));
        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                         COLL_TAG_BARRIER, comm);
        }
        smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);

        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
                                         COLL_TAG_BARRIER, comm);
        }
        smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);
        free(requests);
    }

    /* All done */
    return MPI_SUCCESS;
}
int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
                                 MPI_Datatype sdtype,
                                 void *rbuf, int rcount,
                                 MPI_Datatype rdtype,
                                 int root, MPI_Comm comm)
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_scatter_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    // Determine block size
    if (root == rank) {
        dsize = smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    } else {
        dsize = smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
        // binomial scatter needs a valid send buffer on every rank,
        // so non-root ranks use a temporary one
        if (rank != root) {
            sbuf = xbt_malloc(rcount * smpi_datatype_get_extent(rdtype));
            scount = rcount;
            sdtype = rdtype;
        }
        int ret = smpi_coll_tuned_scatter_ompi_binomial(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        root, comm);
        if (rank != root) {
            xbt_free(sbuf);
        }
        return ret;
    }
    return smpi_coll_tuned_scatter_ompi_basic_linear(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm);
}
int smpi_coll_tuned_reduce_ompi_pipeline( void *sendbuf, void *recvbuf,
                                          int count, MPI_Datatype datatype,
                                          MPI_Op op, int root,
                                          MPI_Comm comm )
{
    uint32_t segsize;
    int segcount = count;
    size_t typelng;
//    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    const double a2 = 0.0410 / 1024.0; /* [1/B] */
    const double b2 = 9.7128;
    const double a4 = 0.0033 / 1024.0; /* [1/B] */
    const double b4 = 1.6761;
    typelng = smpi_datatype_size( datatype );
    int communicator_size = smpi_comm_size(comm);
    size_t message_size = typelng * count;

    if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        segsize = 1024;
    } else if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
        segsize = 32*1024;
    } else {
        // Pipeline_64K
        segsize = 64*1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5d",
              smpi_comm_rank(comm), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype,
                                                op, root, comm,
                                                ompi_coll_tuned_topo_build_chain( 1, comm, root),
                                                segcount, 0);
}
static void action_barrier(const char *const *action)
{
  double clock = smpi_process_simulated_elapsed();

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  TRACE_smpi_computing_out(rank);
  TRACE_smpi_collective_in(rank, -1, __FUNCTION__);
#endif

  smpi_mpi_barrier(MPI_COMM_WORLD);

#ifdef HAVE_TRACING
  TRACE_smpi_collective_out(rank, -1, __FUNCTION__);
  TRACE_smpi_computing_in(rank);
#endif

  if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)) {
    char *name = xbt_str_join_array(action, " ");
    XBT_VERB("%s %f", name, smpi_process_simulated_elapsed() - clock);
    free(name);
  }
}
/*****************************************************************************
 * Function: alltoallv_pair_light_barrier
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_counts: number of elements to send to each process
 *   send_disps: displacements into send_buff
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_counts: number of elements to receive from each process
 *   recv_disps: displacements into recv_buff
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
 *         with node i ^ j. Light barriers are inserted between communications
 *         in different phases.
 * Author: Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoallv_pair_light_barrier(void *send_buff, int *send_counts, int *send_disps,
                                                 MPI_Datatype send_type,
                                                 void *recv_buff, int *recv_counts, int *recv_disps,
                                                 MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALLV; /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  smpi_mpi_sendrecv(send_ptr + send_disps[rank] * send_chunk, send_counts[rank],
                    send_type, rank, tag,
                    recv_ptr + recv_disps[rank] * recv_chunk, recv_counts[rank],
                    recv_type, rank, tag, comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    /* receive count indexed by the source rank (src == dst in this exchange) */
    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst],
                      send_type, dst, tag,
                      recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src],
                      recv_type, src, tag, comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                        &recv_sync, 1, MPI_CHAR, next_partner, tag,
                        comm, &s);
    }
  }

  return MPI_SUCCESS;
}