/*
 * Print every algorithm rule in the given array to the tuned output stream.
 * Returns 0 on success, -1 when the rule array pointer is NULL.
 */
int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules)
{
    int idx;

    if (NULL == alg_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n"));
        return (-1);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of algorithm rules %3d\n", n_rules));

    for (idx = 0; idx < n_rules; idx++) {
        ompi_coll_tuned_dump_alg_rule (&(alg_p[idx]));
    }

    return (0);
}
/*
 * Dispatch the user-forced allgatherv algorithm for this communicator.
 * Falls through to an error report (MPI_ERR_ARG) when the forced algorithm
 * id is outside the known range.
 */
int ompi_coll_tuned_allgatherv_intra_do_forced(const void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void *rbuf, const int *rcounts,
                                               const int *rdispls,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    const int alg = tuned_module->user_forced[ALLGATHERV].algorithm;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
                 alg));

    switch (alg) {
    case 0:
        return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    case 1:
        return ompi_coll_base_allgatherv_intra_basic_default(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    case 2:
        return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    case 3:
        return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    case 4:
        return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    case 5:
        return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module);
    }

    /* requested algorithm id is out of range */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 alg, ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
    return (MPI_ERR_ARG);
}
/*
 * Pick the message-size rule whose msg_size is the largest value that is
 * still <= mpi_msgsize, write its topology fan-in/out and segment size
 * through the result pointers, and return its algorithm id.
 * Returns 0 ("no rule") on any NULL argument or when no msg rules exist.
 */
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, int mpi_msgsize, int *result_topo_faninout, int* result_segsize)
{
    ompi_coll_msg_rule_t *rule;
    ompi_coll_msg_rule_t *chosen;
    int n;

    if (!base_com_rule || !result_topo_faninout || !result_segsize) {
        return (0);
    }
    if (!base_com_rule->n_msg_sizes) {
        /* no msg sizes so no rule */
        return (0);
    }

    /* Rules are ordered by msg_size: walk forward while they still fit.
       The first rule is the fallback even when it is already too large. */
    rule = base_com_rule->msg_rules;
    chosen = rule;
    for (n = 0; n < base_com_rule->n_msg_sizes; ++n, ++rule) {
        if (rule->msg_size > mpi_msgsize) {
            break;
        }
        chosen = rule;
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following msg rule id %d\n", chosen->msg_rule_id));
    ompi_coll_tuned_dump_msg_rule (chosen);

    /* return the topology fan-in/out */
    *result_topo_faninout = chosen->result_topo_faninout;
    /* return the segment size */
    *result_segsize = chosen->result_segsize;

    /* return the algorithm/method to use */
    return (chosen->result_alg);
}
/*
 * Release the Portals resources held by a scatter request: unlink the
 * matching-list entry (retrying while Portals reports it busy) and free
 * the counting event handle.  Returns OMPI_SUCCESS or OMPI_ERROR.
 */
static int cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
{
    int ret, line;

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:cleanup_scatter_handles enter rank %d",
                 request->u.scatter.my_rank));

    /**********************************/
    /* Cleanup Scatter Handles        */
    /**********************************/
    for (;;) {
        ret = PtlMEUnlink(request->u.scatter.scatter_meh);
        if (PTL_IN_USE == ret) {
            /* still pending traffic on the ME -- log and retry */
            opal_output(ompi_coll_base_framework.framework_output,
                        "%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
                        __FILE__, __LINE__, ret, request->u.scatter.my_rank);
            continue;
        }
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }
        break;
    }

    ret = PtlCTFree(request->u.scatter.scatter_cth);
    if (PTL_OK != ret) {
        ret = OMPI_ERROR;
        line = __LINE__;
        goto err_hdlr;
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:cleanup_scatter_handles exit rank %d",
                 request->u.scatter.my_rank));
    return OMPI_SUCCESS;

err_hdlr:
    opal_output(ompi_coll_base_framework.framework_output,
                "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
                __FILE__, __LINE__, line, ret, request->u.scatter.my_rank);
    return ret;
}
/*
 * Component close hook: tear down SDN state and release the dynamically
 * loaded rule set (alg table / dynamic changeable rules), if any.
 */
static int tuned_close(void)
{
    sdn_finalize();

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: called"));

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: done!"));

    /* free the file-based rule table when one was loaded */
    if (NULL != mca_coll_tuned_component.all_base_rules) {
        ompi_coll_tuned_free_all_rules(mca_coll_tuned_component.all_base_rules, COLLCOUNT);
        mca_coll_tuned_component.all_base_rules = NULL;
    }

    return OMPI_SUCCESS;
}
/*
 * libevent callback in the main thread: read one command from the service
 * thread pipe and act on it.  Only CMD_CALL_FUNCTION is understood here.
 */
static void main_thread_event_callback(int fd, short event, void *context)
{
    cmd_t cmd;

    OPAL_OUTPUT((-1, "main thread -- reading command"));
    opal_fd_read(pipe_to_main_thread[0], cmd_size, &cmd);

    if (CMD_CALL_FUNCTION == cmd.pc_cmd) {
        OPAL_OUTPUT((-1, "fd main thread: calling command"));
        main_pipe_cmd_call_function(&cmd);
    } else {
        OPAL_OUTPUT((-1, "fd main thread: unknown pipe command: %d", cmd.pc_cmd));
    }
}
/**
 * This graph API tells us if two vertices are adjacent.
 *
 * @param graph   The graph that the vertices belong to.
 * @param vertex1 First vertex.
 * @param vertex2 Second vertex.
 *
 * @return uint32_t the weight of the connection between the two vertices,
 *                  0 when both arguments are the same vertex, or
 *                  DISTANCE_INFINITY when they are not connected or do not
 *                  belong to the graph.
 */
uint32_t opal_graph_adjacent(opal_graph_t *graph, opal_graph_vertex_t *vertex1, opal_graph_vertex_t *vertex2)
{
    opal_adjacency_list_t *neighbors;
    opal_list_item_t *it;
    opal_graph_edge_t *edge;

    /* both endpoints must belong to this graph */
    if (vertex1->in_graph != graph) {
        OPAL_OUTPUT((0,"opal_graph_adjacent 1 Vertex1 %p not in the graph %p\n",(void *)vertex1,(void *)graph));
        return DISTANCE_INFINITY;
    }
    if (vertex2->in_graph != graph) {
        OPAL_OUTPUT((0,"opal_graph_adjacent 2 Vertex2 %p not in the graph %p\n",(void *)vertex2,(void *)graph));
        return DISTANCE_INFINITY;
    }

    /* a vertex is at distance 0 from itself */
    if (vertex1 == vertex2) {
        return 0;
    }

    /* scan vertex1's adjacency list for an edge ending at vertex2 */
    neighbors = (opal_adjacency_list_t *) vertex1->in_adj_list;
    for (it = opal_list_get_first(neighbors->edges);
         it != opal_list_get_end(neighbors->edges);
         it = opal_list_get_next(it)) {
        edge = (opal_graph_edge_t *)it;
        if (edge->end == vertex2) {
            return edge->weight;
        }
    }

    /* vertex2 is not a neighbor of vertex1 */
    return DISTANCE_INFINITY;
}
/*
 * Linear barrier: every non-root rank sends a zero-byte message to rank 0
 * and then blocks on a zero-byte receive from rank 0; rank 0 collects one
 * notification per peer (non-blocking receives) and then releases everyone
 * with zero-byte sends.  Returns MPI_SUCCESS or the first PML error.
 */
int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int i, err, rank, size, line;
    ompi_request_t** requests = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* All non-root send & receive zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }
    /* The root collects and broadcasts the messages. */
    else {
        requests = coll_base_comm_get_reqs(module->base_data, size);
        if( NULL == requests ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; }
        /* one zero-byte irecv per peer; slot 0 (the root itself) stays unused */
        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm, &(requests[i])));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
        err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        requests = NULL;  /* we're done the requests array is clean */
        /* release every other rank */
        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
    }

    /* All done */
    return MPI_SUCCESS;
 err_hndl:
    OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
    (void)line;  // silence compiler warning
    /* requests is NULL once wait_all succeeded, so only pre-wait failures free here */
    if( NULL != requests )
        ompi_coll_base_free_reqs(requests, size);
    return err;
}
/*
 * Dispatch the user-forced reduce algorithm, forwarding the user-forced
 * segment size, chain fanout and max outstanding requests where the chosen
 * implementation takes them.  Returns MPI_ERR_ARG for an unknown id.
 */
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op, int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    const int alg          = data->user_forced[REDUCE].algorithm;
    const int segsize      = data->user_forced[REDUCE].segsize;
    const int chain_fanout = data->user_forced[REDUCE].chain_fanout;
    const int max_requests = data->user_forced[REDUCE].max_requests;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
                 alg));

    switch (alg) {
    case 0:
        return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm, module);
    case 1:
        return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm, module);
    case 2:
        return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, chain_fanout, max_requests);
    case 3:
        return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 4:
        return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 5:
        return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 6:
        return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    }

    /* unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 alg, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
    return (MPI_ERR_ARG);
}
/*
 * Send-completion callback for the CTS control fragment.  Intentionally a
 * no-op apart from logging: a completion callback must be supplied because
 * des_cbfunc may not be NULL.
 */
static void cts_sent(mca_btl_base_module_t* btl,
                     struct mca_btl_base_endpoint_t* ep,
                     struct mca_btl_base_descriptor_t* des,
                     int status)
{
    OPAL_OUTPUT((-1, "CTS send to %s completed",
                 opal_get_proc_hostname(ep->endpoint_proc->proc_opal)));
}
/*
 * Shortest-path weight between two vertices of a graph.  Runs Dijkstra
 * from vertex1 and looks vertex2 up in the resulting distance table.
 * Returns DISTANCE_INFINITY when either vertex is not in the graph or no
 * path exists.
 */
uint32_t opal_graph_spf(opal_graph_t *graph, opal_graph_vertex_t *vertex1, opal_graph_vertex_t *vertex2)
{
    opal_value_array_t *distances;
    uint32_t num_entries, best = DISTANCE_INFINITY;
    vertex_distance_from_t *entry;
    uint32_t idx;

    /* both endpoints must belong to this graph */
    if (vertex1->in_graph != graph) {
        OPAL_OUTPUT((0,"opal_graph_spf 1 Vertex1 %p not in the graph %p\n",(void *)vertex1,(void *)graph));
        return DISTANCE_INFINITY;
    }
    if (vertex2->in_graph != graph) {
        OPAL_OUTPUT((0,"opal_graph_spf 2 Vertex2 %p not in the graph %p\n",(void *)vertex2,(void *)graph));
        return DISTANCE_INFINITY;
    }

    /* run Dijkstra's algorithm from the start vertex */
    distances = OBJ_NEW(opal_value_array_t);
    opal_value_array_init(distances, sizeof(vertex_distance_from_t));
    opal_value_array_reserve(distances,50);
    num_entries = opal_graph_dijkstra(graph, vertex1, distances);

    /* locate the destination vertex in the distance table */
    for (idx = 0; idx < num_entries; idx++) {
        entry = opal_value_array_get_item(distances, idx);
        if (entry->vertex == vertex2) {
            best = entry->weight;
            break;
        }
    }

    OBJ_RELEASE(distances);

    /* distance (weight) to the end vertex, or infinity */
    return best;
}
/*
 * Install a new progress event flag and return the previous value.
 */
int opal_progress_set_event_flag(int flag)
{
    const int previous = opal_progress_event_flag;

    OPAL_OUTPUT((debug_output, "progress: set_event_flag setting to %d", flag));
    opal_progress_event_flag = flag;

    return previous;
}
/*
 * Dynamic decision for allgatherv: first consult the file-based rules (if
 * loaded), then any user-forced algorithm, and finally fall back to the
 * fixed decision function.
 *
 * Fix: the call to ompi_coll_tuned_get_target_method_params() passed a
 * fifth argument (&ignoreme) although that function is declared in this
 * file with four parameters (com rule, msg size, topo fan-in/out, segsize);
 * the extra argument and the unused `ignoreme` local are removed.
 */
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(const void *sbuf, int scount,
                                                 struct ompi_datatype_t *sdtype,
                                                 void* rbuf, const int *rcounts,
                                                 const int *rdispls,
                                                 struct ompi_datatype_t *rdtype,
                                                 struct ompi_communicator_t *comm,
                                                 mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_allgatherv_intra_dec_dynamic"));

    if (tuned_module->com_rules[ALLGATHERV]) {
        /* We have file based rules:
           - calculate message size and other necessary information */
        int comsize, i;
        int alg, faninout, segsize;
        size_t dsize, total_size;

        comsize = ompi_comm_size(comm);
        ompi_datatype_type_size (sdtype, &dsize);
        total_size = 0;
        for (i = 0; i < comsize; i++) {
            total_size += dsize * rcounts[i];
        }

        alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHERV],
                                                        total_size, &faninout, &segsize);
        if (alg) {
            /* we have found a valid choice from the file based rules for
               this message size */
            return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
                                                             rbuf, rcounts,
                                                             rdispls, rdtype,
                                                             comm, module,
                                                             alg, faninout, segsize);
        }
    }

    /* We do not have file based rules */
    if (tuned_module->user_forced[ALLGATHERV].algorithm) {
        /* User-forced algorithm */
        return ompi_coll_tuned_allgatherv_intra_do_this(sbuf, scount, sdtype,
                                                        rbuf, rcounts,
                                                        rdispls, rdtype,
                                                        comm, module,
                                                        tuned_module->user_forced[ALLGATHERV].algorithm,
                                                        tuned_module->user_forced[ALLGATHERV].tree_fanout,
                                                        tuned_module->user_forced[ALLGATHERV].segsize);
    }

    /* Use default decision */
    return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
                                                       rbuf, rcounts,
                                                       rdispls, rdtype,
                                                       comm, module);
}
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 *
 * Phase 1 (fan-in): at each doubling step, the higher-ranked partner of a
 * pair notifies the lower-ranked one with a zero-byte send; the test
 * `!(partner & (jump-1))` keeps only partners that are subtree roots at
 * the current level.  Phase 2 (fan-out) reverses the pattern to release
 * all ranks.  Returns MPI_SUCCESS or the first PML error.
 */
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int rank, size, depth, err, jump, partner;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_tree %d", rank));

    /* Find the nearest power of 2 of the communicator size. */
    depth = opal_next_poweroftwo_inclusive(size);

    /* fan-in: gather notifications toward rank 0 */
    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        /* only pair with subtree roots at this level, inside the comm */
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    /* fan-out: walk back down the tree releasing partners */
    depth >>= 1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    return MPI_SUCCESS;
}
/* * Invoked when there's a new communicator that has been created. * Look at the communicator and decide which set of functions and * priority we want to return. */ const mca_coll_base_module_1_0_0_t * ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority, struct mca_coll_base_comm_t **data) { OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_tuned query called")); *priority = ompi_coll_tuned_priority; /* * Choose whether to use [intra|inter] decision functions * and if using fixed OR dynamic rule sets. * Right now you cannot mix them, maybe later on it can be changed * but this would probably add an extra if and funct call to the path */ if (OMPI_COMM_IS_INTER(comm)) { if (ompi_coll_tuned_use_dynamic_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_dynamic")); to_use = &inter_dynamic; } else { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_fixed")); to_use = &inter_fixed; } } else { /* is an intra comm */ /** * If the communicator size is less than 2 we have specialized modules * to handle the intra collective communications. */ if( ompi_comm_size(comm) < 2) { *priority = 0; return NULL; } if (ompi_coll_tuned_use_dynamic_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_dynamic")); to_use = &intra_dynamic; } else { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_fixed")); to_use = &intra_fixed; } } return to_use; }
/*
 * Dispatch an explicitly selected barrier algorithm (e.g. from the dynamic
 * rule file).  `faninout` and `segsize` are accepted for interface
 * uniformity; barrier implementations take neither.
 */
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module,
                                           int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d",
                 algorithm, faninout));

    switch (algorithm) {
    case 0: return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
    case 1: return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
    case 2: return ompi_coll_tuned_barrier_intra_doublering (comm, module);
    case 3: return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
    case 4: return ompi_coll_tuned_barrier_intra_bruck (comm, module);
    case 5: return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
    case 6: return ompi_coll_tuned_barrier_intra_tree (comm, module);
    }

    /* unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
    return (MPI_ERR_ARG);
}
/*
 * Set whether the progress engine should yield the CPU when idle.
 * Returns the previous setting.
 */
bool opal_progress_set_yield_when_idle(bool yieldopt)
{
    const bool previous = opal_progress_yield_when_idle;

    opal_progress_yield_when_idle = yieldopt ? 1 : 0;
    OPAL_OUTPUT((debug_output, "progress: progress_set_yield_when_idle to %s",
                 opal_progress_yield_when_idle ? "true" : "false"));

    return previous;
}
/*
 * Set whether the progress engine should call yield() when idle
 * (older variant storing the flag in `call_yield`).
 * Returns the previous setting.
 */
bool opal_progress_set_yield_when_idle(bool yieldopt)
{
    const bool previous = (0 != call_yield);

    call_yield = yieldopt ? 1 : 0;
    OPAL_OUTPUT((debug_output, "progress: progress_set_yield_when_idle to %s",
                 call_yield == 0 ? "false" : "true"));

    return previous;
}
/*
 * Dispatch the user-forced gather algorithm.  The linear-sync variant also
 * receives the user-forced segment size.  Returns MPI_ERR_ARG when the
 * forced algorithm id is out of range.
 */
int ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
                                           struct ompi_datatype_t *sdtype,
                                           void* rbuf, int rcount,
                                           struct ompi_datatype_t *rdtype,
                                           int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    const int alg = tuned_module->user_forced[GATHER].algorithm;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:gather_intra_do_forced selected algorithm %d",
                 alg));

    switch (alg) {
    case 0:
        return ompi_coll_tuned_gather_intra_dec_fixed(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module);
    case 1:
        return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module);
    case 2:
        return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module);
    case 3:
        return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module,
                                                       tuned_module->user_forced[GATHER].segsize);
    }

    /* unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 alg, ompi_coll_tuned_forced_max_algorithms[GATHER]));
    return (MPI_ERR_ARG);
}
/*
 * Send CTS control fragment
 *
 * Allocates a control fragment on the credits QP, fills in the CTS control
 * header, and posts it to the endpoint.  Allocation or post failures are
 * reported through mca_btl_openib_endpoint_invoke_error(); the function
 * itself returns nothing.
 */
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
{
    mca_btl_openib_send_control_frag_t *sc_frag;
    mca_btl_base_descriptor_t *base_des;
    mca_btl_openib_frag_t *openib_frag;
    mca_btl_openib_com_frag_t *com_frag;
    mca_btl_openib_control_header_t *ctl_hdr;

    OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
                 (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
                 "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 mca_btl_openib_component.credits_qp,
                 endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
    sc_frag = alloc_control_frag(endpoint->endpoint_btl);
    if (OPAL_UNLIKELY(NULL == sc_frag)) {
        BTL_ERROR(("Failed to allocate control buffer"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
        return;
    }

    /* I dislike using the "to_<foo>()" macros; I prefer using the
       explicit member fields to ensure I get the types right.  Since
       this is not a performance-criticial part of the code, it's
       ok. */
    com_frag = &(sc_frag->super.super);
    openib_frag = &(com_frag->super);
    base_des = &(openib_frag->base);

    /* completion callback is mandatory even though CTS needs no action */
    base_des->des_cbfunc = cts_sent;
    base_des->des_cbdata = NULL;
    base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    base_des->order = mca_btl_openib_component.credits_qp;
    openib_frag->segment.base.seg_len = sizeof(mca_btl_openib_control_header_t);
    com_frag->endpoint = endpoint;

    /* mark the fragment as a control message carrying a CTS header */
    sc_frag->hdr->tag = MCA_BTL_TAG_IB;
    sc_frag->hdr->cm_seen = 0;
    sc_frag->hdr->credits = 0;

    ctl_hdr = (mca_btl_openib_control_header_t*) openib_frag->segment.base.seg_addr.pval;
    ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;

    /* Send the fragment */
    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
    if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
        BTL_ERROR(("Failed to post CTS send"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
    }
    /* flag set under the lock; post failure was already reported above */
    endpoint->endpoint_cts_sent = true;
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
/*
 * gather_intra
 *
 * Function: - basic gather operation
 * Accepts:  - same arguments as MPI_Gather()
 * Returns:  - MPI_SUCCESS or error code
 *
 * Non-root ranks send their buffer to the root; the root receives one
 * contribution per rank directly into the proper slot of rbuf (copying its
 * own contribution locally, honouring MPI_IN_PLACE).
 */
int ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype,
                                             void *rbuf, int rcount,
                                             struct ompi_datatype_t *rdtype,
                                             int root,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int peer, err, rank, size;
    char *dst;
    MPI_Aint stride, extent, lb;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_basic_linear rank %d", rank));

    /* Everyone but root sends data and returns. */
    if (rank != root) {
        return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                 MCA_COLL_BASE_TAG_GATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* I am the root: receive each rank's contribution in order. */
    ompi_datatype_get_extent(rdtype, &lb, &extent);
    stride = extent * (ptrdiff_t)rcount;

    dst = (char *) rbuf;
    for (peer = 0; peer < size; ++peer, dst += stride) {
        if (peer == rank) {
            /* local contribution: copy unless the caller used MPI_IN_PLACE */
            err = (MPI_IN_PLACE != sbuf)
                ? ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, dst, rcount, rdtype)
                : MPI_SUCCESS;
        } else {
            err = MCA_PML_CALL(recv(dst, rcount, rdtype, peer,
                                    MCA_COLL_BASE_TAG_GATHER,
                                    comm, MPI_STATUS_IGNORE));
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    return MPI_SUCCESS;
}
/*
 * Run a function in the main thread.
 * Called by service thread: with threads enabled the request is marshalled
 * down the pipe to the main thread; otherwise the callback runs directly.
 */
int ompi_btl_openib_fd_run_in_main(ompi_btl_openib_fd_main_callback_fn_t *callback,
                                   void *context)
{
    if (!OPAL_HAVE_THREADS) {
        /* single-threaded: just invoke the callback right away */
        OPAL_OUTPUT((-1, "run in main -- calling now!"));
        callback(context);
    } else {
        cmd_t cmd;

        OPAL_OUTPUT((-1, "run in main -- sending command"));
        /* threaded: write a command down the pipe to the main thread */
        cmd.pc_cmd = CMD_CALL_FUNCTION;
        cmd.pc_fd = -1;
        cmd.pc_flags = 0;
        cmd.pc_fn.main = callback;
        cmd.pc_context = context;
        write_to_main_thread(&cmd);
    }

    return OMPI_SUCCESS;
}
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d", comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm)); switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) { case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); default: OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL])); return (MPI_ERR_ARG); } /* switch */ }
/*
 * Dispatch the user-forced broadcast algorithm, forwarding the forced
 * segment size (and chain fanout where applicable).  Returns MPI_ERR_ARG
 * for an unknown algorithm id.
 */
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
                                          struct ompi_datatype_t *dtype,
                                          int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    const int alg     = data->user_forced[BCAST].algorithm;
    const int segsize = data->user_forced[BCAST].segsize;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
                 alg));

    switch (alg) {
    case 0:
        return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
    case 1:
        return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
    case 2:
        return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module,
                                                  segsize, data->user_forced[BCAST].chain_fanout );
    case 3:
        return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
    case 4:
        return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
    case 5:
        return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
    case 6:
        return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
    default:
        /* unknown algorithm id -- fall through to the error return */
        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     alg, ompi_coll_tuned_forced_max_algorithms[BCAST]));
    }

    return (MPI_ERR_ARG);
}
/*
 * reduce_scatter_intra_dec
 *
 * Function: - selects reduce_scatter algorithm to use
 * Accepts:  - same arguments as MPI_Reduce_scatter()
 * Returns:  - MPI_SUCCESS or error code (passed from
 *             the reduce scatter implementation)
 * Note: If we detect zero valued counts in the rcounts array, we
 * fall back to the nonoverlapping algorithm because the other
 * algorithms do not currently handle it.
 */
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    struct ompi_datatype_t *dtype,
                                                    struct ompi_op_t *op,
                                                    struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int comm_size, r, nearest_pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    bool has_zero_count = false;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));

    comm_size = ompi_comm_size(comm);
    /* We need data size for decision function */
    ompi_ddt_type_size(dtype, &dsize);

    /* sum the counts and remember whether any of them is zero */
    total_message_size = 0;
    for (r = 0; r < comm_size; r++) {
        total_message_size += rcounts[r];
        if (0 == rcounts[r]) {
            has_zero_count = true;
        }
    }

    /* non-commutative ops and zero counts only work with nonoverlapping */
    if( !ompi_op_is_commute(op) || (has_zero_count)) {
        return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
                                                                    dtype, op,
                                                                    comm, module);
    }

    total_message_size *= dsize;

    /* compute the nearest power of 2 */
    for (nearest_pow2 = 1; nearest_pow2 < comm_size; nearest_pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (nearest_pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                           dtype, op,
                                                                           comm, module);
    }
    return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm, module);
}
/* * Act on pipe commands */ static bool service_pipe_cmd(void) { bool ret = false; cmd_t cmd; cmd_list_item_t *cli; opal_fd_read(pipe_to_service_thread[0], cmd_size, &cmd); switch (cmd.pc_cmd) { case CMD_ADD_FD: OPAL_OUTPUT((-1, "fd service thread: CMD_ADD_FD")); if (OMPI_SUCCESS != service_pipe_cmd_add_fd(false, &cmd)) { ret = true; } break; case CMD_REMOVE_FD: OPAL_OUTPUT((-1, "fd service thread: CMD_REMOVE_FD")); if (OMPI_SUCCESS != service_pipe_cmd_remove_fd(&cmd)) { ret = true; } break; case CMD_CALL_FUNCTION: OPAL_OUTPUT((-1, "fd service thread: CMD_RUN_FUNCTION")); if (OMPI_SUCCESS != service_pipe_cmd_call_function(&cmd)) { ret = true; } break; case CMD_TIME_TO_QUIT: OPAL_OUTPUT((-1, "fd service thread: CMD_TIME_TO_QUIT")); ret = true; break; case ACK_RAN_FUNCTION: /* We don't have a guarantee that the main thread will check its pipe frequently, so we do some simple counting to ensure we just don't have too many outstanding commands to the main thread at any given time. The main thread will ACK every CALL_FUNCTION command, so this thread will always wake up and continue to drain any queued up functions. */ cli = (cmd_list_item_t*) opal_list_remove_first(&pending_to_main_thread); if (NULL != cli) { OPAL_OUTPUT((-1, "sending queued up cmd function to main thread")); opal_fd_write(pipe_to_main_thread[1], cmd_size, &(cli->cli_cmd)); OBJ_RELEASE(cli); } else { --waiting_for_ack_from_main_thread; } break; default: OPAL_OUTPUT((-1, "fd service thread: unknown pipe command!")); break; } return ret; }
/*
 * Release the CTS resources on an endpoint: deregister the memory region
 * (if registered) and free the CTS fragment buffer (if allocated), NULLing
 * both fields.  Always returns OPAL_SUCCESS.
 */
int opal_common_ofacm_base_free_cts(mca_btl_base_endpoint_t *endpoint)
{
    /* deregister the memory region first */
    if (NULL != endpoint->endpoint_cts_mr) {
        ibv_dereg_mr(endpoint->endpoint_cts_mr);
        endpoint->endpoint_cts_mr = NULL;
    }

    /* then release the fragment's backing buffer */
    if (NULL != endpoint->endpoint_cts_frag.super.super.base.super.ptr) {
        free(endpoint->endpoint_cts_frag.super.super.base.super.ptr);
        endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL;
        OPAL_OUTPUT((-1, "Freeing CTS frag"));
    }

    return OPAL_SUCCESS;
}
/*
 * Fixed decision for allgatherv: two-process communicators get the
 * dedicated variant; otherwise pick Bruck for small totals, and ring or
 * neighbor-exchange (odd vs. even communicator size) for larger ones.
 */
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void* rbuf, int *rcounts,
                                               int *rdispls,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    int r;
    int csize;
    size_t dsize, total_dsize;

    csize = ompi_comm_size(comm);

    /* Special case for 2 processes */
    if (csize == 2) {
        return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype,
                                                           comm, module);
    }

    /* Determine complete data size */
    ompi_ddt_type_size(sdtype, &dsize);
    total_dsize = 0;
    for (r = 0; r < csize; r++) {
        total_dsize += dsize * rcounts[r];
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_allgatherv_intra_dec_fixed"
                 " rank %d com_size %d msg_length %lu",
                 ompi_comm_rank(comm), csize, (unsigned long)total_dsize));

    /* Decision based on allgather decision. */
    if (total_dsize < 50000) {
        return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm, module);
    }

    if (csize % 2) {
        return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
                                                     rbuf, rcounts, rdispls, rdtype,
                                                     comm, module);
    }
    return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
                                                             rbuf, rcounts, rdispls, rdtype,
                                                             comm, module);
}
/*
 * Dispatch an explicitly selected reduce algorithm (e.g. from the dynamic
 * rule file), forwarding faninout/segsize/max_requests to the variants
 * that take them.  Returns MPI_ERR_ARG for an unknown algorithm id.
 */
int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
                                         struct ompi_datatype_t *dtype,
                                         struct ompi_op_t *op, int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module,
                                         int algorithm, int faninout,
                                         int segsize, int max_requests )
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
                 algorithm, faninout, segsize));

    switch (algorithm) {
    case 0:
        return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm, module);
    case 1:
        return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm, module);
    case 2:
        return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, faninout, max_requests);
    case 3:
        return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 4:
        return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 5:
        return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    case 6:
        return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests);
    }

    /* unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
    return (MPI_ERR_ARG);
}
/*
 * Dispatch an explicitly selected broadcast algorithm (e.g. from the
 * dynamic rule file), forwarding segsize (and faninout for the chain
 * variant).  Returns MPI_ERR_ARG for an unknown algorithm id.
 */
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
                                        struct ompi_datatype_t *dtype,
                                        int root,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module,
                                        int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
                 algorithm, faninout, segsize));

    switch (algorithm) {
    case 0:
        return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
    case 1:
        return ompi_coll_base_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
    case 2:
        return ompi_coll_base_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
    case 3:
        return ompi_coll_base_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
    case 4:
        return ompi_coll_base_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
    case 5:
        return ompi_coll_base_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
    case 6:
        return ompi_coll_base_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
    case 7:
        return ompi_coll_base_bcast_intra_knomial(buf, count, dtype, root, comm, module, segsize, coll_tuned_bcast_knomial_radix);
    case 8:
        return ompi_coll_base_bcast_intra_scatter_allgather(buf, count, dtype, root, comm, module, segsize);
    case 9:
        return ompi_coll_base_bcast_intra_scatter_allgather_ring(buf, count, dtype, root, comm, module, segsize);
    }

    /* unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
    return (MPI_ERR_ARG);
}