static void action_wait(const char *const *action){ double clock = smpi_process_simulated_elapsed(); MPI_Request request; MPI_Status status; smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data(); xbt_assert(xbt_dynar_length(globals->irecvs), "action wait not preceded by any irecv: %s", xbt_str_join_array(action," ")); request = xbt_dynar_pop_as(globals->irecvs,MPI_Request); #ifdef HAVE_TRACING int rank = request && request->comm != MPI_COMM_NULL ? smpi_comm_rank(request->comm) : -1; TRACE_smpi_computing_out(rank); MPI_Group group = smpi_comm_group(request->comm); int src_traced = smpi_group_rank(group, request->src); int dst_traced = smpi_group_rank(group, request->dst); int is_wait_for_receive = request->recv; TRACE_smpi_ptp_in(rank, src_traced, dst_traced, __FUNCTION__); #endif smpi_mpi_wait(&request, &status); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, src_traced, dst_traced, __FUNCTION__); if (is_wait_for_receive) { TRACE_smpi_recv(rank, src_traced, dst_traced); } TRACE_smpi_computing_in(rank); #endif log_timed_action (action, clock); }
static void action_recv(const char *const *action) { int from = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Status status; if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]); else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE; #ifdef HAVE_TRACING int rank = smpi_comm_rank(MPI_COMM_WORLD); int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from); TRACE_smpi_computing_out(rank); TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__); #endif smpi_mpi_recv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD, &status); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__); TRACE_smpi_recv(rank, src_traced, rank); TRACE_smpi_computing_in(rank); #endif log_timed_action (action, clock); }
static void action_Irecv(const char *const *action) { int from = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Request request; smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data(); if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]); else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE; #ifdef HAVE_TRACING int rank = smpi_comm_rank(MPI_COMM_WORLD); int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from); TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__); #endif request = smpi_mpi_irecv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__); request->recv = 1; #endif xbt_dynar_push(globals->irecvs,&request); xbt_dynar_push(reqq[smpi_comm_rank(MPI_COMM_WORLD)],&request); log_timed_action (action, clock); }
static void action_Isend(const char *const *action) { CHECK_ACTION_PARAMS(action, 2, 1); int to = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Request request; if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]); else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE; int rank = smpi_process_index(); int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to); instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1); extra->type = TRACING_ISEND; extra->send_size = size; extra->src = rank; extra->dst = dst_traced; extra->datatype1 = encode_datatype(MPI_CURRENT_TYPE, NULL); TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__, extra); if (!TRACE_smpi_view_internals()) { TRACE_smpi_send(rank, rank, dst_traced, size*smpi_datatype_size(MPI_CURRENT_TYPE)); } request = smpi_mpi_isend(NULL, size, MPI_CURRENT_TYPE, to, 0,MPI_COMM_WORLD); TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__); request->send = 1; xbt_dynar_push(get_reqq_self(),&request); log_timed_action (action, clock); }
static void action_Isend(const char *const *action) { int to = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Request request; if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]); else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE; #ifdef HAVE_TRACING int rank = smpi_comm_rank(MPI_COMM_WORLD); TRACE_smpi_computing_out(rank); int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to); TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__); TRACE_smpi_send(rank, rank, dst_traced); #endif request = smpi_mpi_isend(NULL, size, MPI_CURRENT_TYPE, to, 0,MPI_COMM_WORLD); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__); request->send = 1; TRACE_smpi_computing_in(rank); #endif xbt_dynar_push(reqq[smpi_comm_rank(MPI_COMM_WORLD)],&request); log_timed_action (action, clock); }
static void action_Irecv(const char *const *action) { CHECK_ACTION_PARAMS(action, 2, 1); int from = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Request request; if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]); else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE; int rank = smpi_process_index(); int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from); instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1); extra->type = TRACING_IRECV; extra->send_size = size; extra->src = src_traced; extra->dst = rank; extra->datatype1 = encode_datatype(MPI_CURRENT_TYPE, NULL); TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__, extra); MPI_Status status; //unknow size from the receiver pov if(size==-1){ smpi_mpi_probe(from, 0, MPI_COMM_WORLD, &status); size=status.count; } request = smpi_mpi_irecv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD); TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__); request->recv = 1; xbt_dynar_push(get_reqq_self(),&request); log_timed_action (action, clock); }
static void action_send(const char *const *action) { int to = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); #ifdef HAVE_TRACING int rank = smpi_comm_rank(MPI_COMM_WORLD); TRACE_smpi_computing_out(rank); int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to); TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__); TRACE_smpi_send(rank, rank, dst_traced); #endif smpi_mpi_send(NULL, size, MPI_BYTE, to , 0, MPI_COMM_WORLD); if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)){ char *name = xbt_str_join_array(action, " "); XBT_VERB("%s %f", name, smpi_process_simulated_elapsed()-clock); free(name); } #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__); TRACE_smpi_computing_in(rank); #endif }
static void action_Irecv(const char *const *action) { int from = atoi(action[2]); double size=parse_double(action[3]); double clock = smpi_process_simulated_elapsed(); MPI_Request request; smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data(); #ifdef HAVE_TRACING int rank = smpi_comm_rank(MPI_COMM_WORLD); int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from); TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__); #endif request = smpi_mpi_irecv(NULL, size, MPI_BYTE, from, 0, MPI_COMM_WORLD); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__); request->recv = 1; #endif xbt_dynar_push(globals->irecvs,&request); //TODO do the asynchronous cleanup if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)){ char *name = xbt_str_join_array(action, " "); XBT_VERB("%s %f", name, smpi_process_simulated_elapsed()-clock); free(name); } }
void smpi_deployment_cleanup_instances(){ xbt_dict_cursor_t cursor = nullptr; s_smpi_mpi_instance_t* instance = nullptr; char *name = nullptr; xbt_dict_foreach(smpi_instances, cursor, name, instance) { if(instance->comm_world!=MPI_COMM_NULL) while (smpi_group_unuse(smpi_comm_group(instance->comm_world)) > 0); xbt_free(instance->comm_world); xbt_barrier_destroy(instance->finalization_barrier); } xbt_dict_free(&smpi_instances); }
void smpi_global_destroy(void) { int count = smpi_process_count(); int i; smpi_bench_destroy(); if (MPI_COMM_WORLD != MPI_COMM_UNINITIALIZED){ while (smpi_group_unuse(smpi_comm_group(MPI_COMM_WORLD)) > 0); xbt_free(MPI_COMM_WORLD); xbt_barrier_destroy(process_data[0]->finalization_barrier); }else{ smpi_deployment_cleanup_instances(); } MPI_COMM_WORLD = MPI_COMM_NULL; for (i = 0; i < count; i++) { if(process_data[i]->comm_self!=MPI_COMM_NULL){ smpi_group_unuse(smpi_comm_group(process_data[i]->comm_self)); smpi_comm_destroy(process_data[i]->comm_self); } if(process_data[i]->comm_intra!=MPI_COMM_NULL){ smpi_group_unuse(smpi_comm_group(process_data[i]->comm_intra)); smpi_comm_destroy(process_data[i]->comm_intra); } xbt_os_timer_free(process_data[i]->timer); simcall_rdv_destroy(process_data[i]->mailbox); simcall_rdv_destroy(process_data[i]->mailbox_small); xbt_mutex_destroy(process_data[i]->mailboxes_mutex); xbt_free(process_data[i]); } xbt_free(process_data); process_data = NULL; xbt_free(index_to_process_data); if(smpi_privatize_global_variables) smpi_destroy_global_memory_segments(); smpi_free_static(); }
static void action_wait(const char *const *action){ CHECK_ACTION_PARAMS(action, 0, 0); double clock = smpi_process_simulated_elapsed(); MPI_Request request; MPI_Status status; xbt_assert(xbt_dynar_length(get_reqq_self()), "action wait not preceded by any irecv or isend: %s", xbt_str_join_array(action," ")); request = xbt_dynar_pop_as(get_reqq_self(),MPI_Request); if (!request){ /* Assuming that the trace is well formed, this mean the comm might have * been caught by a MPI_test. Then just return. */ return; } int rank = request->comm != MPI_COMM_NULL ? smpi_comm_rank(request->comm) : -1; MPI_Group group = smpi_comm_group(request->comm); int src_traced = smpi_group_rank(group, request->src); int dst_traced = smpi_group_rank(group, request->dst); int is_wait_for_receive = request->recv; instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1); extra->type = TRACING_WAIT; TRACE_smpi_ptp_in(rank, src_traced, dst_traced, __FUNCTION__, extra); smpi_mpi_wait(&request, &status); TRACE_smpi_ptp_out(rank, src_traced, dst_traced, __FUNCTION__); if (is_wait_for_receive) TRACE_smpi_recv(rank, src_traced, dst_traced); log_timed_action (action, clock); }
//get the index of the process in the process_data array void smpi_deployment_register_process(const char* instance_id, int rank, int index,MPI_Comm** comm, xbt_bar_t* bar){ if(smpi_instances==nullptr){//no instance registered, we probably used smpirun. index_to_process_data[index]=index; *bar = nullptr; *comm = nullptr; return; } s_smpi_mpi_instance_t* instance = static_cast<s_smpi_mpi_instance_t*>(xbt_dict_get_or_null(smpi_instances, instance_id)); xbt_assert(instance, "Error, unknown instance %s", instance_id); if(instance->comm_world == MPI_COMM_NULL){ MPI_Group group = smpi_group_new(instance->size); instance->comm_world = smpi_comm_new(group, nullptr); } instance->present_processes++; index_to_process_data[index]=instance->index+rank; smpi_group_set_mapping(smpi_comm_group(instance->comm_world), index, rank); *bar = instance->finalization_barrier; *comm = &instance->comm_world; return; }
/**
 * Two-level gather ported from MVAPICH2.
 *
 * Phases, as implemented below:
 *  1. Intra-node gather: every process contributes to the process with
 *     local_rank 0 in the intra-node communicator (the "node leader"), which
 *     collects the data in tmp_buf.
 *  2. Inter-leader gather on the leaders communicator, rooted at the leader
 *     of the root's node. When nodes hold different numbers of processes
 *     (communicator not uniform), a gatherv with per-node counts is used;
 *     otherwise a single direct gather is enough.
 *  3. If the root is not itself a node leader, its leader forwards the
 *     gathered bytes to the root with a point-to-point message.
 *
 * Returns MPI_SUCCESS immediately when this process has nothing to
 * receive (root with recvcnt == 0) or nothing to send (non-root with
 * sendcnt == 0). Returns MPI_ERR_OTHER when a temporary buffer could not be
 * obtained; otherwise returns the error code of the last collective step.
 *
 * NOTE(review): the MPI_ERR_OTHER early returns do not release buffers that
 * were already obtained (tmp_buf, leader_gather_buf, displs, recvcnts) --
 * confirm whether that matters for the allocators in use.
 */
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf,
                                              int sendcnt,
                                              MPI_Datatype sendtype,
                                              void *recvbuf,
                                              int recvcnt,
                                              MPI_Datatype recvtype,
                                              int root,
                                              MPI_Comm comm)
{
    void *leader_gather_buf = NULL;
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size = 0, sendtype_size = 0, nbytes=0;
    int leader_root, leader_of_root;
    MPI_Status status;
    MPI_Aint sendtype_extent = 0, recvtype_extent = 0;  /* Datatype extent */
    /* NOTE(review): the two true extents stay uninitialized when the
     * corresponding datatype is MPI_DATATYPE_NULL. */
    MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
    MPI_Comm shmem_comm, leader_comm;
    void* tmp_buf = NULL;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Gather_intra_node_function==NULL)
      MV2_Gather_intra_node_function=smpi_coll_tuned_gather_mpich;

    /* Build the intra-node / leaders communicators on first use. */
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    /* Nothing to do for this process. */
    if (((rank == root) && (recvcnt == 0)) ||
        ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    if (sendtype != MPI_DATATYPE_NULL) {
        sendtype_extent=smpi_datatype_get_extent(sendtype);
        sendtype_size=smpi_datatype_size(sendtype);
        smpi_datatype_extent(sendtype, &true_lb,
                                       &sendtype_true_extent);
    }
    if (recvtype != MPI_DATATYPE_NULL) {
        recvtype_extent=smpi_datatype_get_extent(recvtype);
        recvtype_size=smpi_datatype_size(recvtype);
        smpi_datatype_extent(recvtype, &true_lb,
                                       &recvtype_true_extent);
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
    }

    /* Per-process contribution in bytes, from the root's or the sender's
     * point of view. */
    if (rank == root) {
        nbytes = recvcnt * recvtype_size;
    } else {
        nbytes = sendcnt * sendtype_size;
    }

#if defined(_SMP_LIMIC_)
     if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1)
         && (use_limic_gather)
         &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL)
            || (num_scheme == USE_GATHER_PT_PT_DIRECT)
            ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL)
            || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
            || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_LINEAR)
            || (num_scheme == USE_GATHER_SINGLE_LEADER))) {

            mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
                                                    recvbuf, recvcnt,recvtype,
                                                    root, comm);
     } else
#endif/*#if defined(_SMP_LIMIC_)*/
    {
        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            if (rank == root) {
                tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent,
                            recvtype_true_extent) * local_size);
            } else {
                tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent,
                            sendtype_true_extent) *
                        local_size);
            }
            if (tmp_buf == NULL) {
                mpi_errno = MPI_ERR_OTHER;
                return mpi_errno;
            }
        }
         /* While testing the mpich2 gather test, we see that it
          * basically splits the comm, and we come to
          * a point where use_intra_sock_comm == 0, but if the
          * intra node function is MPIR_Intra_node_LIMIC_Gather_MV2,
          * it would use the intra sock comm. In such cases, we
          * fall back to binomial as a default case. */
#if defined(_SMP_LIMIC_)
        if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {

            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank,
                                                 tmp_buf, nbytes,
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_commptr,
                                                 MPIR_Gather_intra);
        } else
#endif
        {
            /* We are gathering the data into tmp_buf and the output
             * will be of MPI_BYTE datatype. Since the tmp_buf has no
             * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA */
            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank,
                                                 tmp_buf, nbytes,
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_comm,
                                                 MV2_Gather_intra_node_function
                                                 );
        }
    }

    /* NOTE(review): this reassignment replaces the MPI_COMM_WORLD fallback
     * chosen above for node leaders -- confirm this is intended. */
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm.
     * leader_root is to be used as the root of the inter-leader gather ops;
     * leader_of_root is the same process expressed as a rank in comm. */
    if (!smpi_comm_is_uniform(comm)) {
        if (local_rank == 0) {
            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes;
            int i = 0;
            /* Node leaders have all the data. But, different nodes can have
             * different number of processes. Do a Gather first to get the
             * buffer lengths at each leader, followed by a Gatherv to move
             * the actual data */

            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level
                 * leader and this process's rank in the leader_comm
                 * is the same as leader_root */
                if(rank == root) {
                    leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt *
                                                MAX(recvtype_extent,
                                                recvtype_true_extent) *
                                                comm_size);
                } else {
                    leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt *
                                                MAX(sendtype_extent,
                                                sendtype_true_extent) *
                                                comm_size);
                }
                if (leader_gather_buf == NULL) {
                    mpi_errno =  MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            /* Indexed below by rank in leader_comm to size each node's
             * contribution. */
            node_sizes = smpi_comm_get_non_uniform_map(comm);

            /* Only the root of the inter-leader gatherv needs the count and
             * displacement arrays. */
            if (leader_comm_rank == leader_root) {
                displs = xbt_malloc(sizeof (int) * leader_comm_size);
                recvcnts = xbt_malloc(sizeof (int) * leader_comm_size);
                if (!displs || !recvcnts) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            if (root == leader_of_root) {
                /* The root of the gather operation is also the node
                 * leader. Receive into recvbuf and we are done.
                 * Counts and displacements are in units of recvtype. */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * recvcnt;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                        recvcnts[i] = node_sizes[i] * recvcnt;
                    }
                }
                smpi_mpi_gatherv(tmp_buf,
                                         local_size * nbytes,
                                         MPI_BYTE, recvbuf, recvcnts,
                                         displs, recvtype,
                                         leader_root, leader_comm);
            } else {
                /* The root of the gather operation is not the node leader.
                 * Receive into leader_gather_buf and then send
                 * to the root. Counts and displacements are in bytes. */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * nbytes;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
                        recvcnts[i] = node_sizes[i] * nbytes;
                    }
                }
                smpi_mpi_gatherv(tmp_buf, local_size * nbytes,
                                         MPI_BYTE, leader_gather_buf,
                                         recvcnts, displs, MPI_BYTE,
                                         leader_root, leader_comm);
            }
            if (leader_comm_rank == leader_root) {
                xbt_free(displs);
                xbt_free(recvcnts);
            }
        }
    } else {
        /* All nodes have the same number of processes.
         * Just do one Gather to get all
         * the data at the leader of the root process */
        if (local_rank == 0) {
            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level leader
                 */
                leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
                if (leader_gather_buf == NULL) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }
            if (root == leader_of_root) {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, recvbuf,
                                                   recvcnt * local_size,
                                                   recvtype, leader_root,
                                                   leader_comm);

            } else {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
                                                   MPI_BYTE, leader_gather_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, leader_root,
                                                   leader_comm);
            }
        }
    }

    /* The leader of the root's node forwards the gathered bytes to the root
     * when it is not the root itself. */
    if ((local_rank == 0) && (root != rank)
        && (leader_of_root == rank)) {
        smpi_mpi_send(leader_gather_buf,
                                 nbytes * comm_size, MPI_BYTE,
                                 root, COLL_TAG_GATHER, comm);
    }

    if (rank == root && local_rank != 0) {
        /* The root of the gather operation is not the node leader. Receive
         * the data from the node leader */
        smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype,
                                 leader_of_root, COLL_TAG_GATHER, comm,
                                 &status);
    }

    /* Node leaders release the temporary buffers obtained above. */
    if (local_rank == 0 ) {
        if (tmp_buf != NULL) {
            smpi_free_tmp_buffer(tmp_buf);
        }
        if (leader_gather_buf != NULL) {
            smpi_free_tmp_buffer(leader_gather_buf);
        }
    }

    return (mpi_errno);
}
/**
 * Inter-node stage of the MVAPICH2 two-level broadcast.
 *
 * If the root is not the leader (local rank 0) of its node, it first sends
 * the buffer to that leader. The node leaders then broadcast among themselves
 * on the leaders communicator using MV2_Bcast_function, rooted at the leader
 * of the root's node. Processes that are not node leaders take no part in the
 * leader broadcast.
 *
 * MV2_Bcast_function and MV2_Bcast_intra_node_function default to
 * smpi_coll_tuned_bcast_mpich when unset. The pipelined,
 * scatter-ring-allgather and knomial alternatives of the original MVAPICH2
 * code are not wired in here.
 *
 * @return MPI_SUCCESS, or the code returned by the leader broadcast
 */
int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
                                              int count,
                                              MPI_Datatype datatype,
                                              int root,
                                              MPI_Comm  comm)
{
    int mpi_errno = MPI_SUCCESS;
    int global_rank = -1;
    int local_rank, local_size;
    int leader_root, leader_of_root;
    MPI_Comm shmem_comm, leader_comm;

    int rank = smpi_comm_rank(comm);

    /* Fall back to the MPICH broadcast when no selector configured us. */
    if (MV2_Bcast_function == NULL) {
        MV2_Bcast_function = smpi_coll_tuned_bcast_mpich;
    }
    if (MV2_Bcast_intra_node_function == NULL) {
        MV2_Bcast_intra_node_function = smpi_coll_tuned_bcast_mpich;
    }

    /* Build the intra-node / leaders communicators on first use. */
    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
        smpi_comm_init_smp(comm);
    }

    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);

    if (local_rank == 0 && local_size > 1) {
        global_rank = smpi_comm_rank(leader_comm);
    }

    int* leaders_map = smpi_comm_get_leaders_map(comm);
    /* Same process, expressed as a rank in comm and in leader_comm. */
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);

    if (local_size > 1) {
        /* Leader of the root's node, but not the root: fetch the data. */
        if (local_rank == 0 && root != rank && leader_root == global_rank) {
            smpi_mpi_recv(buffer, count, datatype, root,
                          COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
        }
        /* Root that is not a node leader: hand the data to its leader. */
        if (local_rank != 0 && root == rank) {
            smpi_mpi_send(buffer, count, datatype,
                          leader_of_root, COLL_TAG_BCAST,
                          comm);
        }
    }
#if defined(_MCST_SUPPORT_)
    if (comm_ptr->ch.is_mcast_ok) {
        mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
                                              errflag);
        if (mpi_errno == MPI_SUCCESS) {
            goto fn_exit;
        } else {
            goto fn_fail;
        }
    }
#endif

    /* Only node leaders take part in the leader-level broadcast. */
    if (local_rank == 0) {
        mpi_errno = MV2_Bcast_function(buffer, count, datatype,
                                       leader_root, leader_comm);
    }

    return mpi_errno;
}
/**
 * Two-level reduce ported from MVAPICH2.
 *
 * Single-node case (the intra-node communicator spans all of comm): small
 * commutative reductions are reduced onto local rank 0 and forwarded to the
 * root if needed; everything else is delegated to a flat knomial or binomial
 * reduce on comm.
 *
 * Multi-node case:
 *  1. Intra-node reduce onto each node leader (local rank 0), whose result
 *     lands in a temporary buffer. A leader alone on its node skips this step
 *     and uses its input buffer directly.
 *  2. Inter-leader reduce on the leaders communicator, rooted at the leader
 *     of the root's node. That leader works from a private copy of its
 *     partial result whenever input and output would otherwise alias.
 *  3. If the root is not a node leader, its leader sends it the result.
 *
 * Temporary buffers obtained here are released before returning. In
 * particular the leader's private copy is tracked in its own pointer and
 * released on every path, and the intra-node temporary is only released by
 * the processes that allocated it (node leaders).
 *
 * @return MPI_SUCCESS, or the code returned by the last reduce step
 */
int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    /* Private copy of the leader's partial result, when one was needed. */
    void *leader_copy = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;

    /* Build the intra-node / leaders communicators on first use. */
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }

    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    /* Same process, expressed as a rank in comm and in leader_comm. */
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);

    is_commutative=smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb,
                                       &true_extent);
    extent =smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
                tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
                                    (MAX(extent, true_extent)));
                /* Shift by the lower bound; undone again before freeing. */
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                     out_buf = recvbuf;
                     if(in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                     }
                 }
            } else {
                in_buf  = (void *)sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            }

            /* Local rank 0 holds the result: forward it if it is not the root. */
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE+1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if(tmp_buf!=NULL)
          smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        goto fn_exit;
    }

    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
                            (MAX(extent, true_extent)));
        /* Shift by the lower bound; undone again before freeing. */
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }

    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step */
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
        {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
        /* Alone on the node: the temporary is not needed, reduce straight
         * from the input buffer. */
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf;
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of root will have the global reduced data in tmp_buf
           or recv_buf at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    /* Input and output would alias: reduce from a copy. */
                    leader_copy = (char *)smpi_get_tmp_sendbuffer(count*
                                      smpi_datatype_get_extent(datatype));
                    in_buf = leader_copy;
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    out_buf = recvbuf;
                }
            } else {
                /* Input and output would alias: reduce from a copy. */
                leader_copy = (char *)smpi_get_tmp_sendbuffer(count*
                                  smpi_datatype_get_extent(datatype));
                in_buf = leader_copy;
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                          COLL_TAG_REDUCE+1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                          leader_of_root,
                          COLL_TAG_REDUCE+1, comm,
                          MPI_STATUS_IGNORE);
        }
        /* Only node leaders allocated tmp_buf; for everyone else it is
         * still NULL and must not be offset or freed. */
        if (local_rank == 0) {
            smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        }
    }

    /* Release the leader's private copy on every path, including the
     * single-process-node case where the block above is skipped. */
    if (leader_copy != NULL) {
        smpi_free_tmp_buffer(leader_copy);
    }

  fn_exit:
    return mpi_errno;
}