/*
 * Write `length` bytes starting at `from` to the socket *blockSocket.
 *
 * The send is retried for as long as it fails with EINTR.  If it fails
 * with EPIPE, EBADF, ETIMEDOUT or ECONNRESET the socket is closed,
 * *blockSocket is set to -1 and 0 is returned; any other failure
 * returns the negative value produced by isend().  Every non-EINTR
 * failure is reported through putSysErrmsg().
 *
 * `sn` is accepted but not used by this function.
 */
int sendBytesByTCP(int *blockSocket, char *from, int length,
		struct sockaddr *sn)
{
	int	result;
	int	failure;

	for (;;)
	{
		result = isend(*blockSocket, from, length, 0);
		if (result >= 0)
		{
			return result;
		}

		failure = errno;
		if (failure == EINTR)
		{
			continue;	/*	Interrupted; retry.	*/
		}

		if (failure == EPIPE || failure == EBADF
		|| failure == ETIMEDOUT || failure == ECONNRESET)
		{
			/*	Lost connection: drop the socket.	*/

			closesocket(*blockSocket);
			*blockSocket = -1;
			result = 0;
		}

		putSysErrmsg("TCP BSO write() error on socket", NULL);
		return result;
	}
}
/* * do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do * a ping, not a ping pong) */ int ompi_init_do_preconnect(void) { int comm_size = ompi_comm_size(MPI_COMM_WORLD); int my_rank = ompi_comm_rank(MPI_COMM_WORLD); int i, j, ret; struct ompi_request_t **requests; requests = (ompi_request_t**)malloc(comm_size * sizeof(struct ompi_request_t *)); if (NULL == requests) { return OMPI_ERR_OUT_OF_RESOURCE; } for (i = j = 0; i < comm_size; ++i) { if (i == my_rank) { continue; } else if (my_rank < i) { ret = MCA_PML_CALL(isend(MPI_BOTTOM, 0, MPI_BYTE, i, 1, MCA_PML_BASE_SEND_STANDARD, MPI_COMM_WORLD, &requests[j++])); } else { ret = MCA_PML_CALL(irecv(MPI_BOTTOM,0, MPI_BYTE, i, 1, MPI_COMM_WORLD, &requests[j++])); } if (OMPI_SUCCESS != ret) { return ret; } } ret = ompi_request_wait_all(j, requests, MPI_STATUSES_IGNORE); free(requests); return ret; }
/*
 * Linear barrier on an intra-communicator using zero-length messages.
 *
 * Every rank other than 0 sends a zero-length message to rank 0 and
 * then blocks receiving one back.  Rank 0 posts one receive per peer
 * (MPI_ANY_SOURCE), waits for all of them, then posts one send per peer
 * and waits for those.
 *
 * `module` is not used by this function.
 *
 * Returns MPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the root cannot
 * allocate its request array, or the first error reported by a
 * send/recv/wait call.
 */
static int
ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int i, err, rank, size;
    ompi_request_t **requests;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* All non-root send & receive zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }

        return MPI_SUCCESS;
    }

    /* The root collects and broadcasts the messages.  Slot 0 of the
     * array is unused so that slot i corresponds to peer i. */
    requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
    if (NULL == requests) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    err = MPI_SUCCESS;

    for (i = 1; i < size; ++i) {
        err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                 MCA_COLL_BASE_TAG_BARRIER, comm,
                                 &(requests[i])));
        if (MPI_SUCCESS != err) {
            goto cleanup;
        }
    }
    err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
    if (MPI_SUCCESS != err) {
        goto cleanup;
    }

    for (i = 1; i < size; ++i) {
        err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm,
                                 &(requests[i])));
        if (MPI_SUCCESS != err) {
            goto cleanup;
        }
    }
    err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );

 cleanup:
    /* Single exit for the root so the array is released on every path. */
    free( requests );
    return err;
}
/*
 * Neighbor allgather on a distributed-graph topology.
 *
 * Posts one non-blocking receive per in-edge (each landing in the next
 * rcount * extent(rdtype) bytes of rbuf), then one non-blocking send of
 * sbuf per out-edge, and finally waits for all of them.
 *
 * Returns OMPI_SUCCESS immediately when there are no edges at all,
 * OMPI_ERR_OUT_OF_RESOURCE when no request array is available, or the
 * first error from posting/waiting (after releasing the requests).
 */
static int
mca_coll_basic_neighbor_allgather_dist_graph(const void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype, void *rbuf,
                                             int rcount, struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_dist_graph_2_2_0_t *topo = comm->c_topo->mtc.dist_graph;
    const int n_in = topo->indegree;
    const int n_out = topo->outdegree;
    const int *sources, *targets;
    ompi_request_t **req_base, **req_next;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, peer;

    if (0 == (n_in + n_out)) {
        return OMPI_SUCCESS;
    }

    sources = topo->in;
    targets = topo->out;

    ompi_datatype_get_extent(rdtype, &lb, &extent);

    req_base = coll_base_comm_get_reqs( module->base_data, n_in + n_out);
    req_next = req_base;
    if (NULL == req_base) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Receives first: one slot of rbuf per in-edge. */
    for (peer = 0; peer < n_in; ++peer) {
        rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, sources[peer],
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                comm, req_next++));
        if (OMPI_SUCCESS != rc) {
            ompi_coll_base_free_reqs(req_base, peer + 1);
            return rc;
        }
        rbuf = (char *) rbuf + extent * rcount;
    }

    /* Then the sends; the const is cast away only because the PML
     * send entry point takes a non-const buffer. */
    for (peer = 0; peer < n_out; ++peer) {
        rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, targets[peer],
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD,
                                comm, req_next++));
        if (OMPI_SUCCESS != rc) {
            ompi_coll_base_free_reqs(req_base, n_in + peer + 1);
            return rc;
        }
    }

    rc = ompi_request_wait_all (n_in + n_out, req_base, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(req_base, n_in + n_out);
    }
    return rc;
}
/*
 * Neighbor alltoallv on a graph topology, using the module's
 * pre-allocated request array (mccb_reqs).
 *
 * For each of this rank's `degree` neighbors a non-blocking receive is
 * posted at rbuf + rdisps[n] * extent(rdtype), then a non-blocking send
 * from sbuf + sdisps[n] * extent(sdtype); finally all 2 * degree
 * requests are waited on.
 *
 * Returns the result of ompi_request_wait_all(), or the first error
 * from the request-storage check or from posting a request.
 */
static int
mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                        struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                        const int rdisps[], struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    int rc = MPI_SUCCESS, neighbor, degree;
    const int rank = ompi_comm_rank (comm);
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs;
    const int *edges;

    /* The neighbor count is returned through the last argument. */
    mca_topo_base_graph_neighbors_count (comm, rank, &degree);

    /* ensure we have enough storage for requests */
    rc = mca_coll_basic_check_for_requests (basic_module, 2 * degree);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* This rank's edge list starts at index[rank - 1] (at 0 for rank 0). */
    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);

    /* post all receives first */
    for (neighbor = 0, reqs = basic_module->mccb_reqs ; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (neighbor = 0 ; neighbor < degree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take a const for the send buffer */
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (degree * 2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
/*
 * Neighbor alltoallv on a graph topology, using a request array
 * obtained from ompi_coll_base_comm_get_reqs().
 *
 * For each of this rank's `degree` neighbors a non-blocking receive is
 * posted at rbuf + rdisps[n] * extent(rdtype), then a non-blocking send
 * from sbuf + sdisps[n] * extent(sdtype); finally all 2 * degree
 * requests are waited on.
 *
 * Returns OMPI_SUCCESS immediately when the rank has no neighbors,
 * OMPI_ERR_OUT_OF_RESOURCE when no request array is available, or the
 * first error from posting/waiting (after releasing the requests).
 */
static int
mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                        struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                        const int rdisps[], struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    int rc = MPI_SUCCESS, neighbor, degree;
    const int rank = ompi_comm_rank (comm);
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs, **preqs;
    const int *edges;

    /* The neighbor count is returned through the last argument. */
    mca_topo_base_graph_neighbors_count (comm, rank, &degree);
    if( 0 == degree ) return OMPI_SUCCESS;

    /* This rank's edge list starts at index[rank - 1] (at 0 for rank 0). */
    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);
    reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 2 * degree );
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    /* post all receives first */
    for (neighbor = 0; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, neighbor + 1);
        return rc;
    }

    for (neighbor = 0 ; neighbor < degree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take a const for the send buffer */
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree + neighbor + 1);
        return rc;
    }

    rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree * 2);
    }
    return rc;
}
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	root == MPI_PROC_NULL: nothing is done.
 *	root == MPI_ROOT:      one non-blocking send per remote rank i, taken
 *	                       from sbuf + extent(sdtype) * disps[i], then a
 *	                       wait on all of them.
 *	any other root:        a blocking receive from `root`.
 */
int
mca_coll_basic_scatterv_inter(const void *sbuf, const int *scounts,
                              const int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, size, err;
    char *ptmp;
    ptrdiff_t lb, extent;
    ompi_request_t **reqs;

    /* Initialize */
    size = ompi_comm_remote_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* If not root, receive data. */
        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
    } else {
        /* I am the root, loop sending data. */
        err = ompi_datatype_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            return OMPI_ERROR;
        }

        reqs = coll_base_comm_get_reqs(module->base_data, size);
        if (NULL == reqs) {
            /* Fail cleanly instead of indexing a NULL request array. */
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        for (i = 0; i < size; ++i) {
            ptmp = ((char *) sbuf) + (extent * disps[i]);
            err = MCA_PML_CALL(isend(ptmp, scounts[i], sdtype, i,
                                     MCA_COLL_BASE_TAG_SCATTERV,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &(reqs[i])));
            if (OMPI_SUCCESS != err) {
                ompi_coll_base_free_reqs(reqs, i);
                return err;
            }
        }

        err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            ompi_coll_base_free_reqs(reqs, size);
        }
    }

    /* All done */
    return err;
}
/*
 * Allocate a disconnect object for `comm` and start a zero-size
 * irecv/isend pair with every peer (the remote group for an
 * inter-communicator, the local group otherwise).  Requests are stored
 * pairwise: slot 2*i holds the receive from peer i, slot 2*i+1 the
 * synchronous-mode send to peer i.
 *
 * Returns the new object, or NULL (after printing a message and
 * releasing what was allocated) on allocation or posting failure.
 */
ompi_dpm_base_disconnect_obj *ompi_dpm_base_disconnect_init ( ompi_communicator_t *comm)
{
    ompi_dpm_base_disconnect_obj *dobj;
    int peer;
    int rc;

    dobj = (ompi_dpm_base_disconnect_obj *) calloc(1, sizeof(ompi_dpm_base_disconnect_obj));
    if (NULL == dobj) {
        printf("Could not allocate disconnect object\n");
        return NULL;
    }

    dobj->size = OMPI_COMM_IS_INTER(comm) ? ompi_comm_remote_size (comm)
                                          : ompi_comm_size (comm);
    dobj->comm = comm;

    dobj->reqs = (ompi_request_t **) malloc(2*dobj->size*sizeof(ompi_request_t *));
    if (NULL == dobj->reqs) {
        printf("Could not allocate request array for disconnect object\n");
        goto fail;
    }

    /* Start every irecv/isend pair.  The messages are zero-sized, so the
       dummy buffer stored on the object serves for all of them. */
    for (peer = 0; peer < dobj->size; peer++) {
        rc = MCA_PML_CALL(irecv (&(dobj->buf), 0, MPI_INT, peer,
                                 OMPI_COMM_BARRIER_TAG, comm,
                                 &(dobj->reqs[2*peer])));
        if (OMPI_SUCCESS != rc) {
            printf("dpm_base_disconnect_init: error %d in irecv to process %d\n", rc, peer);
            goto fail;
        }

        rc = MCA_PML_CALL(isend (&(dobj->buf), 0, MPI_INT, peer,
                                 OMPI_COMM_BARRIER_TAG,
                                 MCA_PML_BASE_SEND_SYNCHRONOUS,
                                 comm, &(dobj->reqs[2*peer+1])));
        if (OMPI_SUCCESS != rc) {
            printf("dpm_base_disconnect_init: error %d in isend to process %d\n", rc, peer);
            goto fail;
        }
    }

    /* return handle */
    return dobj;

 fail:
    /* reqs is NULL when its own allocation failed; free(NULL) is a no-op. */
    free (dobj->reqs);
    free (dobj);
    return NULL;
}
int ompi_init_preconnect_mpi(void) { int comm_size = ompi_comm_size(MPI_COMM_WORLD); int comm_rank = ompi_comm_rank(MPI_COMM_WORLD); int param, next, prev, i, ret = OMPI_SUCCESS; struct ompi_request_t * requests[2]; char inbuf[1], outbuf[1]; const bool *value; param = mca_base_var_find("ompi", "mpi", NULL, "preconnect_mpi"); if (0 > param) return OMPI_SUCCESS; ret = mca_base_var_get_value(param, &value, NULL, NULL); if (OMPI_SUCCESS != ret || 0 == value[0]) { return OMPI_SUCCESS; } inbuf[0] = outbuf[0] = '\0'; /* Each iteration, every process sends to its neighbor i hops to the right and receives from its neighbor i hops to the left. Because send_complete is used, there will only ever be one outstanding send and one outstanding receive in the network at a time for any given process. This limits any "flooding" effect that can occur with other connection algorithms. While the flooding algorithms may be a more efficient use of resources, they can overwhelm the out-of-band connection system used to wire up some networks, leading to poor performance and hangs. */ for (i = 1 ; i <= comm_size / 2 ; ++i) { next = (comm_rank + i) % comm_size; prev = (comm_rank - i + comm_size) % comm_size; ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR, next, 1, MCA_PML_BASE_SEND_COMPLETE, MPI_COMM_WORLD, &requests[1])); if (OMPI_SUCCESS != ret) return ret; ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR, prev, 1, MPI_COMM_WORLD, &requests[0])); if(OMPI_SUCCESS != ret) return ret; ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != ret) return ret; } return ret; }
/*
 * Neighbor allgather on a graph topology.
 *
 * For each of this rank's `degree` neighbors, a non-blocking receive is
 * posted into the next rcount * extent(rdtype) bytes of rbuf,
 * immediately followed by a non-blocking send of sbuf to the same
 * neighbor; all 2 * degree requests are then waited on.
 *
 * Returns OMPI_ERR_OUT_OF_RESOURCE when no request array is available,
 * otherwise the first error from posting/waiting (after releasing the
 * requests) or the result of the wait.
 */
static int
mca_coll_basic_neighbor_allgather_graph(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype, void *rbuf,
                                        int rcount, struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    const int rank = ompi_comm_rank (comm);
    const int *edges;
    int degree;
    ompi_request_t **reqs, **preqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, neighbor;

    /* The neighbor count is returned through the last argument. */
    mca_topo_base_graph_neighbors_count (comm, rank, &degree);

    /* This rank's edge list starts at index[rank - 1] (at 0 for rank 0). */
    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    reqs = preqs = coll_base_comm_get_reqs( module->base_data, 2 * degree);
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    for (neighbor = 0; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
        rbuf = (char *) rbuf + extent * rcount;

        /* remove cast from const when the pml layer is updated to take
         * a const for the send buffer. */
        rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, edges[neighbor],
                                MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, (2 * neighbor + 1));
        return rc;
    }

    rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree * 2);
    }
    return rc;
}
/*
 * Neighbor alltoallv on a distributed-graph topology, using the
 * module's request array (mccb_reqs).
 *
 * One non-blocking receive is posted per in-edge at
 * rbuf + rdisps[n] * extent(rdtype), then one non-blocking send per
 * out-edge from sbuf + sdisps[n] * extent(sdtype); all
 * indegree + outdegree requests are then waited on.
 *
 * Returns the first posting error, or the result of the wait.  No
 * request cleanup is attempted on a posting error.
 */
static int
mca_coll_basic_neighbor_alltoallv_dist_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                             struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                             const int rdisps[], struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_dist_graph_2_1_0_t *topo = comm->c_topo->mtc.dist_graph;
    const int n_in = topo->indegree;
    const int n_out = topo->outdegree;
    const int *sources = topo->in;
    const int *targets = topo->out;
    ptrdiff_t lb, recv_extent, send_extent;
    ompi_request_t **slot;
    int rc = MPI_SUCCESS, peer;

    ompi_datatype_get_extent(rdtype, &lb, &recv_extent);
    ompi_datatype_get_extent(sdtype, &lb, &send_extent);

    slot = basic_module->mccb_reqs;

    /* post all receives first */
    for (peer = 0; peer < n_in; ++peer) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[peer] * recv_extent, rcounts[peer], rdtype,
                                sources[peer], MCA_COLL_BASE_TAG_ALLTOALL, comm, slot++));
        if (OMPI_SUCCESS != rc) {
            /* should probably try to clean up here */
            return rc;
        }
    }

    /* The const is cast away only because the PML send entry point
       takes a non-const buffer. */
    for (peer = 0; peer < n_out; ++peer) {
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[peer] * send_extent, scounts[peer], sdtype,
                                targets[peer], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, slot++));
        if (OMPI_SUCCESS != rc) {
            /* should probably try to clean up here */
            return rc;
        }
    }

    return ompi_request_wait_all (n_in + n_out, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
/*
 *	scatter_inter
 *
 *	Function:	- scatter operation
 *	Accepts:	- same arguments as MPI_Scatter()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	root == MPI_PROC_NULL: nothing is done.
 *	root == MPI_ROOT:      consecutive chunks of scount elements of sbuf
 *	                       are sent (non-blocking) to remote ranks
 *	                       0..size-1, then all sends are waited on.
 *	any other root:        a blocking receive from `root`.
 */
int
mca_coll_basic_scatter_inter(void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             int root, struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **next_req = basic_module->mccb_reqs;
    const int remote_size = ompi_comm_remote_size(comm);
    ptrdiff_t lb, chunk;
    char *src;
    int peer, err;

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        return OMPI_SUCCESS;
    }

    if (MPI_ROOT != root) {
        /* If not root, receive data. */
        return MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
                                 MCA_COLL_BASE_TAG_SCATTER,
                                 comm, MPI_STATUS_IGNORE));
    }

    /* I am the root, loop sending data. */
    err = ompi_datatype_get_extent(sdtype, &lb, &chunk);
    if (OMPI_SUCCESS != err) {
        return OMPI_ERROR;
    }
    chunk *= scount;

    src = (char *) sbuf;
    for (peer = 0; peer < remote_size; ++peer) {
        err = MCA_PML_CALL(isend(src, scount, sdtype, peer,
                                 MCA_COLL_BASE_TAG_SCATTER,
                                 MCA_PML_BASE_SEND_STANDARD, comm,
                                 next_req++));
        if (OMPI_SUCCESS != err) {
            return err;
        }
        src += chunk;
    }

    return ompi_request_wait_all(remote_size, basic_module->mccb_reqs,
                                 MPI_STATUSES_IGNORE);
}
/**
 * Zero-byte send/receive pair used by the barrier to synchronize two
 * peers: a 0-byte irecv from `source` (tag `rtag`) and a 0-byte isend
 * to `dest` (tag `stag`) are posted on `comm`, then both are waited on.
 *
 * Returns MPI_SUCCESS, the error from posting/waiting, or - when the
 * wait reports MPI_ERR_IN_STATUS - the MPI_ERROR field of the first
 * status that is neither MPI_SUCCESS nor MPI_ERR_PENDING.
 */
static inline int
ompi_coll_base_sendrecv_zero(int dest, int stag,
                             int source, int rtag,
                             MPI_Comm comm)
{
    int rc, where = 0;
    ompi_request_t* pair[2];
    ompi_status_public_t pair_status[2];

    /* post new irecv */
    rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
                             comm, &pair[0]));
    if (MPI_SUCCESS != rc) {
        where = __LINE__;
        goto post_or_wait_failed;
    }

    /* send data to children */
    rc = MCA_PML_CALL(isend( NULL, 0, MPI_BYTE, dest, stag,
                             MCA_PML_BASE_SEND_STANDARD, comm, &pair[1]));
    if (MPI_SUCCESS != rc) {
        where = __LINE__;
        goto post_or_wait_failed;
    }

    rc = ompi_request_wait_all( 2, pair, pair_status );
    if (MPI_ERR_IN_STATUS == rc) {
        /* MPI_ERR_IN_STATUS cannot be propagated up the stack, so
         * report the real error code stored in the failing status. */
        int failed;

        where = __LINE__;
        if (MPI_SUCCESS == pair_status[0].MPI_ERROR ||
            MPI_ERR_PENDING == pair_status[0].MPI_ERROR) {
            failed = 1;
        } else {
            failed = 0;
        }
        rc = pair_status[failed].MPI_ERROR;
        OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
                      " stage of ompi_coll_base_sendrecv_zero\n",
                      __FILE__, where, rc, (0 == failed ? "receive" : "send")));
        return rc;
    }
    if (MPI_SUCCESS != rc) {
        where = __LINE__;
        goto post_or_wait_failed;
    }

    return (MPI_SUCCESS);

 post_or_wait_failed:
    /* Failure while posting the irecv/isend or in the wait itself;
     * no status is available. */
    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, where, rc));
    (void)where;  // silence compiler warning
    return rc;
}
/*
 * Send `length` bytes starting at `from` on linkSocket, retrying for as
 * long as the send fails with EINTR.
 *
 * Returns whatever isend() returned on the final attempt; a negative
 * result is also reported through putSysErrmsg().
 */
int sendSegmentByAOS(int linkSocket, char *from, int length)
{
	int	sent;

	do
	{
		sent = isend(linkSocket, from, length, 0);
	} while (sent < 0 && errno == EINTR);	/*	Interrupted; retry.	*/

	if (sent < 0)
	{
		putSysErrmsg("LSO send() error on socket", NULL);
	}

	return sent;
}
/*
 * Post a non-blocking send of context->outbuf to each of
 * peers_comm[1] and peers_comm[2] that is not MPI_PROC_NULL, then
 * append the posted requests to `request` with no follow-up callback.
 *
 * Returns the first isend error, or the result of
 * ompi_comm_request_schedule_append().
 */
static int ompi_comm_allreduce_group_broadcast (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_comm_cid_context_t *cid_context = context->cid_context;
    ompi_request_t *pending[2];
    int n_pending = 0;
    int rc;

    for (int slot = 1 ; slot <= 2 ; ++slot) {
        if (MPI_PROC_NULL == context->peers_comm[slot]) {
            continue;
        }

        rc = MCA_PML_CALL(isend(context->outbuf, context->count, MPI_INT, context->peers_comm[slot],
                                cid_context->pml_tag, MCA_PML_BASE_SEND_STANDARD,
                                cid_context->comm, &pending[n_pending]));
        ++n_pending;
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
    }

    return ompi_comm_request_schedule_append (request, NULL, pending, n_pending);
}
/**
 * Serialize @p object with the messageCallback into a freshly
 * malloc'ed buffer (header first, then the message body), wrap the
 * buffer in a new BareMessage and pass it to isend(BareMessage*).
 *
 * @param object    object handed to the message callback for serialization
 * @param user_ptr  pointer forwarded unchanged to the BareMessage constructor
 * @return the newly created BareMessage
 * @note This function may return immediately.
 */
virtual BareMessage * isend(const void * object, const void * user_ptr = nullptr) {
	uint64_t total_len = headerSize() + messageCallback->serializeMessageLen(object);

	char * buffer = static_cast<char *>(malloc(total_len));
	assert(buffer);

	// Write the header; the cursor must land exactly at the end of it.
	uint64_t cursor = 0;
	serializeHeader(buffer, cursor, total_len);
	assert(headerSize() == cursor);

	// Write the body; the cursor must land exactly at the end of the buffer.
	messageCallback->serializeMessage(object, buffer, cursor);
	assert(cursor == total_len);

	BareMessage * message = new BareMessage(buffer, total_len, user_ptr);
	isend(message);
	return message;
}
/*
 *	bcast_lin_inter
 *
 *	Function:	- broadcast using O(N) algorithm
 *	Accepts:	- same arguments as MPI_Bcast()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	root == MPI_PROC_NULL: nothing is done.
 *	root == MPI_ROOT:      one non-blocking send of `buff` per remote
 *	                       rank, then a wait on all of them.
 *	any other root:        a blocking receive from `root`.
 */
int
mca_coll_basic_bcast_lin_inter(void *buff, int count,
                               struct ompi_datatype_t *datatype, int root,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **send_reqs = basic_module->mccb_reqs;
    const int remote_size = ompi_comm_remote_size(comm);
    int peer;
    int err;

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        return OMPI_SUCCESS;
    }

    if (MPI_ROOT != root) {
        /* Non-root receive the data. */
        return MCA_PML_CALL(recv(buff, count, datatype, root,
                                 MCA_COLL_BASE_TAG_BCAST, comm,
                                 MPI_STATUS_IGNORE));
    }

    /* root section */
    for (peer = 0; peer < remote_size; peer++) {
        err = MCA_PML_CALL(isend(buff, count, datatype, peer,
                                 MCA_COLL_BASE_TAG_BCAST,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, send_reqs + peer));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */
    return ompi_request_wait_all(remote_size, send_reqs, MPI_STATUSES_IGNORE);
}
/*
 * Post a standard-mode PML isend and attach a completion callback to
 * the resulting request.
 *
 * The callback fields are set while holding ompi_request_lock, and the
 * request's completion flag is sampled under the same lock.  If the
 * request was already complete at that point, the callback is cleared
 * from the request and invoked directly here.
 *
 * Returns the isend error if posting fails, OMPI_SUCCESS otherwise.
 */
int
ompi_osc_pt2pt_component_isend(void *buf,
                               size_t count,
                               struct ompi_datatype_t *datatype,
                               int dest,
                               int tag,
                               struct ompi_communicator_t *comm,
                               ompi_request_t **request,
                               ompi_request_complete_fn_t callback,
                               void *cbdata)
{
    ompi_request_complete_fn_t pending_cb;
    bool already_complete;
    int rc;

    rc = MCA_PML_CALL(isend(buf, count, datatype,
                            dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* Hold the giant request mutex while installing the callback so the
       PML cannot mark the request complete in the middle of the update;
       this makes it possible to tell deterministically whether the
       callback was missed, so that it fires exactly once. */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    (*request)->req_complete_cb = callback;
    (*request)->req_complete_cb_data = cbdata;
    already_complete = (*request)->req_complete;
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    if (!already_complete) {
        return OMPI_SUCCESS;
    }

    /* Completion happened before the callback was in place: run it now. */
    pending_cb = (*request)->req_complete_cb;
    (*request)->req_complete_cb = NULL;
    pending_cb(*request);

    return OMPI_SUCCESS;
}
/*
 * Leader exchange over the inter-communicator: post an irecv into
 * context->outbuf from rank 0 and an isend of context->tmpbuf to
 * rank 0, then schedule ompi_comm_allreduce_inter_leader_reduce to
 * run on the two requests.
 *
 * Returns the first posting error, or the result of
 * ompi_comm_request_schedule_append().
 */
static int ompi_comm_allreduce_inter_leader_exchange (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_communicator_t *intercomm = context->cid_context->comm;
    ompi_request_t *exchange[2];
    int rc;

    /* local leader exchange their data and determine the overall result
       for both groups */
    rc = MCA_PML_CALL(irecv (context->outbuf, context->count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG,
                             intercomm, &exchange[0]));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = MCA_PML_CALL(isend (context->tmpbuf, context->count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG,
                             MCA_PML_BASE_SEND_STANDARD, intercomm, &exchange[1]));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_inter_leader_reduce,
                                              exchange, 2);
}
/*
 * Step 2 of the bridged allreduce (leader exchange): post an irecv into
 * context->outbuf from the remote leader and an isend of
 * context->tmpbuf to it over the bridge communicator, then schedule
 * ompi_comm_allreduce_bridged_xchng_complete on the two requests.
 * The send request is stored in slot 0, the receive request in slot 1.
 *
 * Returns the first posting error, or the result of
 * ompi_comm_request_schedule_append().
 */
static int ompi_comm_allreduce_bridged_reduce_complete (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_communicator_t *bridgecomm = context->cid_context->bridgecomm;
    ompi_request_t *exchange[2];
    int rc;

    /* step 2: leader exchange */
    rc = MCA_PML_CALL(irecv (context->outbuf, context->count, MPI_INT, context->cid_context->remote_leader,
                             OMPI_COMM_ALLREDUCE_TAG, bridgecomm, &exchange[1]));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = MCA_PML_CALL(isend (context->tmpbuf, context->count, MPI_INT, context->cid_context->remote_leader,
                             OMPI_COMM_ALLREDUCE_TAG, MCA_PML_BASE_SEND_STANDARD, bridgecomm,
                             &exchange[0]));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_bridged_xchng_complete,
                                              exchange, 2);
}
/*
 * Fold received contributions into context->outbuf and continue.
 *
 * For each of peers_comm[1] and peers_comm[2] that is not
 * MPI_PROC_NULL, the next context->count ints of context->tmpbuf are
 * reduced into outbuf with context->op.  If peers_comm[0] is not
 * MPI_PROC_NULL, outbuf is sent to that peer, a receive from it into
 * outbuf is posted, and ompi_comm_allreduce_group_broadcast is
 * scheduled on both requests; otherwise
 * ompi_comm_allreduce_group_broadcast is called directly.
 *
 * Returns the first posting error, or the result of the
 * schedule/broadcast call.
 */
static int ompi_comm_allreduce_group_recv_complete (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_comm_cid_context_t *cid_context = context->cid_context;
    int *contribution = context->tmpbuf;
    ompi_request_t *updown[2];
    int rc;

    for (int slot = 1 ; slot <= 2 ; ++slot) {
        if (MPI_PROC_NULL == context->peers_comm[slot]) {
            continue;
        }
        ompi_op_reduce (context->op, contribution, context->outbuf, context->count, MPI_INT);
        contribution += context->count;
    }

    if (MPI_PROC_NULL == context->peers_comm[0]) {
        /* root */
        return ompi_comm_allreduce_group_broadcast (request);
    }

    /* interior node */
    rc = MCA_PML_CALL(isend(context->outbuf, context->count, MPI_INT, context->peers_comm[0],
                            cid_context->pml_tag, MCA_PML_BASE_SEND_STANDARD,
                            cid_context->comm, &updown[0]));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    rc = MCA_PML_CALL(irecv(context->outbuf, context->count, MPI_INT, context->peers_comm[0],
                            cid_context->pml_tag, cid_context->comm, &updown[1]));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_group_broadcast, updown, 2);
}
/*
 * Neighbor allgather on a Cartesian topology.
 *
 * Dimensions are visited in order; in each one the "source" neighbor of
 * a +1 shift is handled first and the "destination" neighbor second.
 * Every neighbor that is not MPI_PROC_NULL gets a non-blocking receive
 * into the current rbuf slot followed by a non-blocking send of sbuf.
 * rbuf advances by rcount * extent(rdtype) after each of the two
 * neighbor positions whether or not a neighbor exists there.  A
 * periodic dimension of size 1 uses this rank itself as both neighbors.
 *
 * Returns OMPI_SUCCESS immediately for a 0-dimensional topology,
 * OMPI_ERR_OUT_OF_RESOURCE when no request array is available, or the
 * first error from posting/waiting (after releasing the requests).
 */
static int
mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount,
                                       struct ompi_datatype_t *sdtype, void *rbuf,
                                       int rcount, struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    ompi_request_t **reqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, dim, posted = 0;

    if (0 == cart->ndims) {
        return OMPI_SUCCESS;
    }

    ompi_datatype_get_extent(rdtype, &lb, &extent);

    reqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims );
    if (NULL == reqs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* The ordering is defined as -1 then +1 in each dimension in
     * order of dimension.  `posted` is bumped as each request slot is
     * handed to the PML, so on failure it also counts the slot of the
     * call that failed. */
    for (dim = 0; dim < cart->ndims; ++dim) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, &reqs[posted++]));
            if (OMPI_SUCCESS != rc) {
                goto post_failed;
            }

            /* The const is cast away only because the PML send entry
             * point takes a non-const buffer. */
            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, &reqs[posted++]));
            if (OMPI_SUCCESS != rc) {
                goto post_failed;
            }
        }

        rbuf = (char *) rbuf + extent * rcount;

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, &reqs[posted++]));
            if (OMPI_SUCCESS != rc) {
                goto post_failed;
            }

            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, &reqs[posted++]));
            if (OMPI_SUCCESS != rc) {
                goto post_failed;
            }
        }

        rbuf = (char *) rbuf + extent * rcount;
    }

    rc = ompi_request_wait_all (posted, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, posted);
    }
    return rc;

 post_failed:
    ompi_coll_base_free_reqs(reqs, posted);
    return rc;
}
/*
 * Start the schedule round located at handle->row_offset.
 *
 * The round begins with an operation count, followed by that many
 * operation records.  SEND and RECV records each grow handle->req_array
 * by one slot and post a non-blocking PML send/receive into it; OP,
 * COPY and UNPACK records are executed immediately.  A buffer flagged
 * as "tmpbuf" in a record is treated as an offset into handle->tmpbuf
 * rather than as an address.
 *
 * Unless this is the round at offset 0, NBC_Progress() is called once
 * all operations have been started.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the request array
 * cannot be grown (req_count and req_array are left unchanged in that
 * case), the error of a failing PML / NBC_Copy / NBC_Unpack call, or
 * OMPI_ERROR for an unknown record type or a failing NBC_Progress().
 */
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int res;
  char* ptr;
  MPI_Request *tmp;
  NBC_Fn_type type;
  NBC_Args_send     sendargs;
  NBC_Args_recv     recvargs;
  NBC_Args_op         opargs;
  NBC_Args_copy     copyargs;
  NBC_Args_unpack unpackargs;
  void *buf1,  *buf2;

  /* get round-schedule address */
  ptr = handle->schedule->data + handle->row_offset;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", handle->row_offset, num);

  for (int i = 0 ; i < num ; ++i) {
    /* long, so that it matches the %li conversions it is printed with */
    long offset = (long)(ptr - handle->schedule->data);

    memcpy (&type, ptr, sizeof (type));
    switch(type) {
      case SEND:
        NBC_DEBUG(5," SEND (offset %li) ", offset);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf,
                  sendargs.count, sendargs.datatype, sendargs.dest, handle->tag);
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=(void *)sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        /* get an additional request; req_count is only bumped once the
         * array has really grown, so both stay consistent on failure */
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, (handle->req_count + 1) * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;
        handle->req_count++;

        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag,
                                 MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count - 1));
        if (OMPI_SUCCESS != res) {
          NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count,
                     sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, " RECV (offset %li) ", offset);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count,
                  recvargs.datatype, recvargs.source, handle->tag);
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        /* get an additional request - TODO: req_count NOT thread safe */
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, (handle->req_count + 1) * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;
        handle->req_count++;

        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count-1));
        if (OMPI_SUCCESS != res) {
          NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count,
                    recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, " OP2 (offset %li) ", offset);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
                  opargs.count, opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=(void *)opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, " COPY (offset %li) ", offset);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n",
                  (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype,
                  (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype,
                        handle->comm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
        break;
      case UNPACK:
        NBC_DEBUG(5, " UNPACK (offset %li) ", offset);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf,
                  unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if (OMPI_SUCCESS != res) {
          NBC_Error ("NBC_Unpack() failed (code: %i)", res);
          return res;
        }

        break;
      default:
        NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset);
        return OMPI_ERROR;
    }
  }

  /* check if we can make progress - not in the first round, this allows us to leave the
   * initialization faster and to reach more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if (handle->row_offset) {
    res = NBC_Progress(handle);
    if ((NBC_OK != res) && (NBC_CONTINUE != res)) {
      return OMPI_ERROR;
    }
  }

  return OMPI_SUCCESS;
}
/*
 * Post a non-blocking send of `count` items described by `data` to the
 * endpoint `ec_h` on the communicator carried in `grp_h`.
 *
 * Inline data representations are sent as a single PML isend of raw bytes
 * taken from `buffer`; the resulting request is stored in `req->data` and
 * `req->status` is set to HCOLRTE_REQUEST_ACTIVE.
 *
 * General (non-inline) representations are walked element by element and
 * each element is sent through a recursive call as DTE_BYTE data; `buffer`
 * is ignored in that case (a warning is printed when it is non-NULL).
 * NOTE(review): every recursive call writes the same `req`, so only the
 * request of the last element stays reachable by the caller - confirm this
 * is intended.
 *
 * Returns HCOLL_SUCCESS on success and 1 on any failure, including a
 * failure of one of the per-element sends (previously silently ignored).
 */
static int send_nb(dte_data_representation_t data,
                   uint32_t count,
                   void *buffer,
                   rte_ec_handle_t ec_h,
                   rte_grp_handle_t grp_h,
                   uint32_t tag,
                   rte_request_handle_t *req)
{
    ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;

    if (! ec_h.handle) {
        fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: "
                "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank);
        return 1;
    }

    if (HCOL_DTE_IS_INLINE(data)) {
        /* do inline nb send */
        size_t size;
        ompi_request_t *ompi_req;

        if (!buffer && !HCOL_DTE_IS_ZERO(data)) {
            fprintf(stderr, "***Error in hcolrte_rml_send_nb: buffer pointer is NULL"
                    " for non DTE_ZERO INLINE data representation\n");
            return 1;
        }

        /* packed_size is divided by 8 to obtain the byte count handed to the PML */
        size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8;

        HCOL_VERBOSE(30,"PML_ISEND: dest = %d: buf = %p: size = %u: comm = %p",
                     ec_h.rank, buffer, (unsigned int)size, (void *)comm);

        if (MCA_PML_CALL(isend(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank,
                               tag,MCA_PML_BASE_SEND_STANDARD,comm,&ompi_req))) {
            return 1;
        }

        req->data = (void *)ompi_req;
        req->status = HCOLRTE_REQUEST_ACTIVE;
    } else {
        int i;
        unsigned int j;
        void *buf;
        uint64_t len;
        int repeat_count;
        int rc;
        struct dte_struct_t * repeat;

        if (NULL != buffer) {
            /* We have a full data description & buffer pointer simultaneously.
               It is ambiguous. Throw a warning since the user might have made
               a mistake with data reps */
            fprintf(stderr,"Warning: buffer_pointer != NULL for NON-inline data representation: buffer_pointer is ignored.\n");
        }

        /* The result was never used by the original code; the call itself is
           kept in case the helper has side effects (not visible from here). */
        (void)count_total_dte_repeat_entries(&data);

        repeat = data.rep.general_rep->data_representation.data->repeat;
        repeat_count = data.rep.general_rep->data_representation.data->repeat_count;

        for (i=0; i< repeat_count; i++) {
            for (j=0; j<repeat[i].n_elements; j++) {
                char *repeat_unit = (char *)&repeat[i];

                buf = (void *)(repeat_unit+repeat[i].elements[j].base_offset);
                len = repeat[i].elements[j].packed_size;

                /* propagate a failed element send instead of reporting success */
                rc = send_nb(DTE_BYTE,len,buf,ec_h,grp_h,tag,req);
                if (HCOLL_SUCCESS != rc) {
                    return rc;
                }
            }
        }
    }

    return HCOLL_SUCCESS;
}
/*
 * Combined non-blocking receive + send followed by a single wait.
 *
 * The receive is posted only when `rcount` and the size of `rdatatype` are
 * both non-zero; likewise the send is posted only when `scount` and the size
 * of `sdatatype` are non-zero. All posted requests are then completed with
 * ompi_request_wait_all().
 *
 * On success `*status` (unless MPI_STATUS_IGNORE) receives the status of the
 * receive, or ompi_status_empty when no receive was posted. Previously the
 * status of the send was returned when the receive had been skipped.
 *
 * On failure the real error code is returned: MPI_ERR_IN_STATUS is replaced
 * by the MPI_ERROR field of the failing request's status.
 *
 * NOTE(review): when posting the send fails after the receive was posted,
 * the receive request is left outstanding - unchanged from the original.
 */
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
                                            ompi_datatype_t* sdatatype,
                                            int dest, int stag,
                                            void* recvbuf, size_t rcount,
                                            ompi_datatype_t* rdatatype,
                                            int source, int rtag,
                                            struct ompi_communicator_t* comm,
                                            ompi_status_public_t* status )
{
    /* post receive first, then send, then waitall */
    int err, line = 0, nreqs = 0;
    int recv_posted = 0;   /* non-zero when reqs[0]/statuses[0] is the receive */
    size_t typesize;
    ompi_request_t* reqs[2], **req = reqs;
    ompi_status_public_t statuses[2];

    /* post new irecv */
    ompi_datatype_type_size(rdatatype, &typesize);
    if (0 != rcount && 0 != typesize) {
        err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag,
                                  comm, req++));
        ++nreqs;
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
        recv_posted = 1;
    }

    /* send data to children */
    ompi_datatype_type_size(sdatatype, &typesize);
    if (0 != scount && 0 != typesize) {
        err = MCA_PML_CALL(isend( sendbuf, scount, sdatatype, dest, stag,
                                  MCA_PML_BASE_SEND_STANDARD, comm, req++));
        ++nreqs;
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
    }

    if (0 != nreqs) {
        err = ompi_request_wait_all( nreqs, reqs, statuses );
        if( MPI_ERR_IN_STATUS == err ) {
            /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
             * code that we can propagate up the stack. Instead, look for the real
             * error code from the MPI_ERROR in the status.
             */
            int err_index = 0;

            /* only look at statuses[1] when a second request really exists */
            if( nreqs > 1 && MPI_SUCCESS == statuses[0].MPI_ERROR ) {
                err_index = 1;
            }
            if (MPI_STATUS_IGNORE != status) {
                *status = statuses[err_index];
            }
            err = statuses[err_index].MPI_ERROR;
            line = __LINE__;
            /* slot 0 is the receive only if a receive was actually posted */
            OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
                          " stage of ompi_coll_base_sendrecv_nonzero_actual\n",
                          __FILE__, line, err,
                          ((recv_posted && 0 == err_index) ? "receive" : "send")));
            return err;
        }
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

        if (MPI_STATUS_IGNORE != status) {
            if (recv_posted) {
                *status = statuses[0];
            } else {
                /* nothing was received: do not expose the send's status */
                *status = ompi_status_empty;
            }
        }
    } else {
        if( MPI_STATUS_IGNORE != status ) {
            *status = ompi_status_empty;
        }
    }

    return (MPI_SUCCESS);

 error_handler:
    /* Error discovered during the posting of the irecv or isend,
     * and no status is available.
     */
    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, line, err));
    if (MPI_STATUS_IGNORE != status) {
        status->MPI_ERROR = err;
    }
    return (err);
}
int mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint position = 0; MPI_Aint total_bytes = 0; /* total bytes to be read */ MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total read in each cycle by each process*/ int index = 0, ret=OMPI_SUCCESS; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current value from total_bytes_per_process */ int *sorted_file_offsets=NULL, entries_per_aggregator=0; int bytes_received = 0; int blocks = 0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; char *receive_buf = NULL; MPI_Aint *memory_displacements=NULL; /* global iovec at the readers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index=0, temp_index=0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL; char *global_buf = NULL; MPI_Aint global_count = 0; local_io_array *file_offsets_for_agg=NULL; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0; int *bytes_per_process = NULL; MPI_Aint *total_bytes_per_process = NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; #if TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; print_entry nentry; #endif // if 
(opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /************************************************************************** ** In case the data is not contigous in memory, decode it into an iovec ** **************************************************************************/ if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &dynamic_num_io_procs); ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** Generate the File offsets/lengths corresponding to this write *** ********************************************************************/ ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } /* #########################################################*/ /************************************************************* *** ALLGather the File View 
information at all processes *** *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } displs = (int*)malloc (fh->f_procs_per_group*sizeof(int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*)malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } ret = fh->f_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } /* sort it */ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array) { free (local_iov_array); local_iov_array = NULL; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; 
i<total_fview_count ; i++) { printf("%d: OFFSET: %p LENGTH: %d\n", fh->f_rank, global_iov_array[sorted[i]].iov_base, global_iov_array[sorted[i]].iov_len); } } #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } /* * Calculate how many bytes are read in each cycle */ fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle); cycles = ceil((double)total_bytes/bytes_per_cycle); n = 0; bytes_remaining = 0; current_index = 0; #if TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { /* Getting ready for next cycle Initializing and freeing buffers */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == sendtype){ sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = 
OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (cycles-1 == index) { bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_read_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %d**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /* Calculate how much data will be contributed in this cycle by each process*/ bytes_received = 0; while (bytes_to_read_in_cycle) { blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { if (bytes_remaining <= bytes_to_read_in_cycle) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_remaining; } current_index ++; bytes_to_read_in_cycle -= bytes_remaining; bytes_remaining = 0; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); 
blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } continue; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining -= bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } } else { if (bytes_to_read_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += global_iov_array[sorted[current_index]].iov_len; } bytes_to_read_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; 
continue; } } } /* Calculate the displacement on where to put the data and allocate the recieve buffer (global_buf) */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); 
ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif /********************************************************** ******************** DONE READING ************************ *********************************************************/ temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = 
memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group;i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif } /********************************************************** ********* Scatter the Data from the readers ************** *********************************************************/ if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { receive_buf = &((char*)buf)[position]; } else if (bytes_received) { /* allocate a receive buffer and copy the data that needs to be received into it in case the data is non-contigous in memory */ receive_buf = malloc (bytes_received); if (NULL == receive_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif recv_req = (MPI_Request *) malloc (sizeof (MPI_Request)); if (NULL == recv_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = MCA_PML_CALL(irecv(receive_buf, bytes_received, MPI_BYTE, 
fh->f_procs_in_group[fh->f_aggregator_index], 123, fh->f_comm, recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } position += bytes_received; /* If data is not contigous in memory, copy the data from the receive buffer into the buffer passed in */ if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_received; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for (i = 0; i < fh->f_procs_per_group; i++) ompi_datatype_destroy(sendtype+i); if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if (NULL != send_req){ free(send_req); 
send_req = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } } } #if TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){ fh->f_register_print_entry(READ_PRINT_QUEUE, nentry); } #endif exit: if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != displs) { free (displs); displs = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if ( NULL != blocklen_per_process){ for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } } free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ for (l=0; i<fh->f_procs_per_group; l++){ if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } free(displs_per_process); displs_per_process = NULL; } } return ret; }
/*
 * Neighbor all-to-all-v on a cartesian communicator.
 *
 * Every dimension `d` of the topology owns two consecutive slots in the
 * count/displacement arrays: slot 2*d for the "source" neighbor and slot
 * 2*d+1 for the "destination" neighbor of a shift by one. All receives are
 * posted first, then all sends, and finally every posted request is
 * completed with a single wait. Displacements are scaled by the extent of
 * the corresponding datatype.
 *
 * Returns the first posting error, or the result of the final wait.
 */
static int mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[],
                                                  struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                                  const int rdisps[], struct ompi_datatype_t *rdtype,
                                                  struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_cart_2_2_0_t *topo = comm->c_topo->mtc.cart;
    const int me = ompi_comm_rank (comm);
    ptrdiff_t lower, recv_extent, send_extent;
    ompi_request_t **pending;
    int status, posted = 0, d;

    /* two receives and two sends per dimension at most */
    status = mca_coll_basic_check_for_requests (basic, topo->ndims * 4);
    if (OMPI_SUCCESS != status) {
        return status;
    }

    ompi_datatype_get_extent(rdtype, &lower, &recv_extent);
    ompi_datatype_get_extent(sdtype, &lower, &send_extent);

    pending = basic->mccb_reqs;

    /* phase 1: receives */
    for (d = 0 ; d < topo->ndims ; ++d) {
        const int lo = 2 * d, hi = 2 * d + 1;
        int from = MPI_PROC_NULL, to = MPI_PROC_NULL;

        if (topo->dims[d] > 1) {
            mca_topo_base_cart_shift (comm, d, 1, &from, &to);
        } else if (1 == topo->dims[d] && topo->periods[d]) {
            /* periodic dimension of size one: this rank is its own neighbor */
            from = to = me;
        }

        if (MPI_PROC_NULL != from) {
            status = MCA_PML_CALL(irecv((char *) rbuf + rdisps[lo] * recv_extent, rcounts[lo], rdtype, from,
                                        MCA_COLL_BASE_TAG_ALLTOALL, comm, pending + posted));
            if (OMPI_SUCCESS != status) {
                break;
            }
            ++posted;
        }

        if (MPI_PROC_NULL != to) {
            status = MCA_PML_CALL(irecv((char *) rbuf + rdisps[hi] * recv_extent, rcounts[hi], rdtype, to,
                                        MCA_COLL_BASE_TAG_ALLTOALL, comm, pending + posted));
            if (OMPI_SUCCESS != status) {
                break;
            }
            ++posted;
        }
    }

    if (OMPI_SUCCESS != status) {
        /* should probably try to clean up here */
        return status;
    }

    /* phase 2: sends */
    for (d = 0 ; d < topo->ndims ; ++d) {
        const int lo = 2 * d, hi = 2 * d + 1;
        int from = MPI_PROC_NULL, to = MPI_PROC_NULL;

        if (topo->dims[d] > 1) {
            mca_topo_base_cart_shift (comm, d, 1, &from, &to);
        } else if (1 == topo->dims[d] && topo->periods[d]) {
            from = to = me;
        }

        if (MPI_PROC_NULL != from) {
            /* remove cast from const when the pml layer is updated to take a const for the send buffer */
            status = MCA_PML_CALL(isend((char *) sbuf + sdisps[lo] * send_extent, scounts[lo], sdtype, from,
                                        MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm,
                                        pending + posted));
            if (OMPI_SUCCESS != status) {
                break;
            }
            ++posted;
        }

        if (MPI_PROC_NULL != to) {
            status = MCA_PML_CALL(isend((char *) sbuf + sdisps[hi] * send_extent, scounts[hi], sdtype, to,
                                        MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm,
                                        pending + posted));
            if (OMPI_SUCCESS != status) {
                break;
            }
            ++posted;
        }
    }

    if (OMPI_SUCCESS != status) {
        /* should probably try to clean up here */
        return status;
    }

    return ompi_request_wait_all (posted, basic->mccb_reqs, MPI_STATUSES_IGNORE);
}
/*
 * Locate `key` inside `[section]` of the NUL-terminated ini text `buf`.
 *
 * All positions are byte offsets into `buf`; every output is preset to -1.
 *   sec_s / sec_e     first character of the section name / offset of the
 *                     first non-space character after the closing brace
 *   key_s / key_e     first / last character of the key (inclusive)
 *   value_s / value_e first character after '=' / end of the line (exclusive)
 *
 * Returns 1 when the key was found, 0 otherwise. When the section exists but
 * the key does not, sec_s/sec_e are still filled in.
 *
 * Lines whose first character is ';' are treated as comments. A key is the
 * text between the start of a line and the first '=' on it; section and key
 * names must match exactly (the original accepted any file entry that was a
 * prefix of the requested name, including an empty one).
 */
static int parse_file(const char *section, char *key, const char *buf,int *sec_s,int *sec_e,
                      int *key_s,int *key_e, int *value_s, int *value_e)
{
    const char *p = buf;
    size_t section_len;
    size_t key_len;
    int i=0;

    assert(buf!=NULL);
    assert(section != NULL && strlen(section));
    assert(key != NULL && strlen(key));

    section_len = strlen(section);
    key_len = strlen(key);

    *sec_e = *sec_s = *key_e = *key_s = *value_s = *value_e = -1;

    while( !isend(p[i]) ) {
        /* a section header starts at the beginning of a line */
        if( ( 0==i || isnewline(p[i-1]) ) && isleftbarce(p[i]) ) {
            int section_start=i+1;

            /* find the closing brace */
            do {
                i++;
            } while( !isrightbrace(p[i]) && !isend(p[i]));

            /* Require a terminated header of exactly the requested length.
               An unterminated header leaves i on the terminator, which the
               outer loop then detects without reading past it. */
            if( isrightbrace(p[i]) &&
                (size_t)(i-section_start) == section_len &&
                0 == strncmp(p+section_start,section, section_len)) {
                int newline_start=0;

                i++;

                /* skip over space characters after the closing brace;
                   <ctype.h> functions need an unsigned char value */
                while(isspace((unsigned char)p[i])) {
                    i++;
                }

                /* found the section */
                *sec_s = section_start;
                *sec_e = i;

                /* walk the lines of this section until the next header */
                while( ! (isnewline(p[i-1]) && isleftbarce(p[i]))
                       && !isend(p[i]) ) {
                    int j;

                    /* get a new line */
                    newline_start = i;
                    while( !isnewline(p[i]) && !isend(p[i]) ) {
                        i++;
                    }
                    /* now i is equal to the end of the line */

                    if(';' != p[newline_start]) { /* skip over comment */
                        /* first '=' on the line, if any */
                        j = newline_start;
                        while(j < i && '=' != p[j]) {
                            j++;
                        }

                        if( j < i && j > newline_start &&
                            (size_t)(j-newline_start) == key_len &&
                            0 == strncmp(key,p+newline_start,key_len)) {
                            /* found the key */
                            *key_s = newline_start;
                            *key_e = j-1;

                            *value_s = j+1;
                            *value_e = i;

                            return 1;
                        }
                    }

                    /* never step over the terminator */
                    if( isend(p[i]) ) {
                        break;
                    }
                    i++;
                }
            }
        }
        else {
            i++;
        }
    }
    return 0;
}
/*
 * Collective read: the aggregator of each process group reads the file data
 * in cycles and distributes it to the processes of its group.
 *
 * Outline (matches the numbered sections in the body):
 *   1.  Decode the user datatype into an iovec unless the receive buffer is
 *       contiguous.
 *   2.  Sum up the number of bytes requested by all processes of the group.
 *   3.  Generate the local file view (offset/length pairs).
 *   4.  Allgather the file views of the group.
 *   5.  Sort the gathered offsets.
 *   6.  Compute the number of cycles and allocate the aggregator's buffers.
 *   7.  Per cycle: work out each process' share, let the aggregator read it
 *       through fbtl_preadv() and isend it, and let every process irecv its
 *       part (copying into the user buffer when that is not contiguous).
 *
 * fh        file handle
 * buf       user receive buffer
 * count     number of elements of datatype
 * datatype  datatype describing the layout of buf
 * status    when not MPI_STATUS_IGNORE, its _ucount is set to the number of
 *           bytes described by (count, datatype)
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE, OMPI_ERROR when the fbtl
 * read fails, or the error code of a failed helper.  All temporaries
 * allocated by this function are released on every return path.
 */
int
mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh,
                                      void *buf,
                                      int count,
                                      struct ompi_datatype_t *datatype,
                                      ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;          /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
    int index = 0, ret=OMPI_SUCCESS;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* index (within the group) of the process owning the current entry */
    MPI_Aint bytes_remaining = 0; /* bytes of the current global_iov_array entry
                                     that are still left for a later cycle */
    int *sorted_file_offsets=NULL, entries_per_aggregator=0;
    int bytes_received = 0;
    int blocks = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    char *receive_buf = NULL;
    MPI_Aint *memory_displacements=NULL;
    /* global iovec at the readers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index=0, temp_index=0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    int dynamic_gen2_num_io_procs;
    size_t max_data = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator =-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

    /* scratch pointers so that a failed realloc() does not lose the
       original block */
    int *tmp_blocklen = NULL;
    MPI_Aint *tmp_displs = NULL;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif

    /**************************************************************************
     ** 1. In case the data is not contiguous in memory, decode it into an iovec
     **************************************************************************/
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }

    if (! recvbuf_is_contiguous ) {
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                     datatype,
                                     count,
                                     buf,
                                     &max_data,
                                     &decoded_iov,
                                     &iov_count);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs);
    ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                      dynamic_gen2_num_io_procs,
                                      max_data);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /**************************************************************************
     ** 2. Determine the total amount of data to be read
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the File offsets/lengths corresponding to this read
     ********************************************************************/
    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data,
                                            &local_iov_array,
                                            &local_count);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    /*************************************************************
     *** 4. Allgather the File View information at all processes
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*)malloc (total_fview_count *
                                                  sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgatherv_array (local_iov_array,
                                            local_count,
                                            fh->f_iov_type,
                                            global_iov_array,
                                            fview_count,
                                            displs,
                                            fh->f_iov_type,
                                            fh->f_aggregator_index,
                                            fh->f_procs_in_group,
                                            fh->f_procs_per_group,
                                            fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    /****************************************************************************************
     *** 5. Sort the global offset/lengths list based on the offsets.
     *** The result of the sort operation is the 'sorted', an integer array,
     *** which contains the indexes of the global_iov_array based on the offset.
     *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
     *** in the file, and that one is followed by global_iov_array[z].offset, then
     *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
     ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<total_fview_count ; i++) {
            printf("%d: OFFSET: %p   LENGTH: %d\n",
                   fh->f_rank,
                   global_iov_array[sorted[i]].iov_base,
                   global_iov_array[sorted[i]].iov_len);
        }
    }
#endif

    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    /* NOTE(review): the callee is handed an int* that aliases an MPI_Aint;
       this relies on bytes_per_cycle having been zero-initialised and on the
       int landing in the low-order bytes -- confirm for the target ABI. */
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if ( my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        if (NULL == displs_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for (i=0;i<fh->f_procs_per_group;i++){
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        global_buf = (char *) malloc (bytes_per_cycle);
        if (NULL == global_buf){
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for(l=0;l<fh->f_procs_per_group;l++){
            sendtype[l] = MPI_DATATYPE_NULL;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
         **********************************************************************/
        if (my_aggregator == fh->f_rank) {
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            fh->f_num_of_io_entries = 0;

            if (NULL != sendtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy(&sendtype[i]);
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
        }  /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually read in this cycle
         **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %d**********\n",
                    fh->f_rank,
                    index,
                    (int) bytes_to_read_in_cycle);
        }
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
         ***     by each process
         *****************************************************************/
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* This next block identifies which process is the holder
            ** of the sorted[current_index] element;
            */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous cycle */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    /* Data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);

                        /* grow both per-process arrays by one zeroed slot */
                        tmp_blocklen = (int *) realloc ((void *)blocklen_per_process[n],
                                                        (disp_index[n]+1)*sizeof(int));
                        if (NULL == tmp_blocklen) {
                            opal_output (1, "OUT OF MEMORY\n");
                            ret = OMPI_ERR_OUT_OF_RESOURCE;
                            goto exit;
                        }
                        blocklen_per_process[n] = tmp_blocklen;

                        tmp_displs = (MPI_Aint *) realloc ((void *)displs_per_process[n],
                                                           (disp_index[n]+1)*sizeof(MPI_Aint));
                        if (NULL == tmp_displs) {
                            opal_output (1, "OUT OF MEMORY\n");
                            ret = OMPI_ERR_OUT_OF_RESOURCE;
                            goto exit;
                        }
                        displs_per_process[n] = tmp_displs;

                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than the
                       bytes_to_read_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                /* No partially used entry available, have to start a new one */
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    /* This entry has more data than we can send in one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_read_in_cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;

                        /* grow both per-process arrays by one zeroed slot */
                        tmp_blocklen = (int *) realloc ((void *)blocklen_per_process[n],
                                                        (disp_index[n]+1)*sizeof(int));
                        if (NULL == tmp_blocklen) {
                            opal_output (1, "OUT OF MEMORY\n");
                            ret = OMPI_ERR_OUT_OF_RESOURCE;
                            goto exit;
                        }
                        blocklen_per_process[n] = tmp_blocklen;

                        tmp_displs = (MPI_Aint *) realloc ((void *)displs_per_process[n],
                                                           (disp_index[n]+1)*sizeof(MPI_Aint));
                        if (NULL == tmp_displs) {
                            opal_output (1, "OUT OF MEMORY\n");
                            ret = OMPI_ERR_OUT_OF_RESOURCE;
                            goto exit;
                        }
                        displs_per_process[n] = tmp_displs;

                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received +=
                            global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        } /* end while (bytes_to_read_in_cycle) */

        /*************************************************************************
         *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the receive buffer (global_buf)
         *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                /*Moving file offsets to an IO array!*/
                temp_index = 0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }

            /* Sort the displacements for each aggregator */
            read_heap_sort (file_offsets_for_agg,
                            entries_per_aggregator,
                            sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            /**********************************************************
             *** 7e. Create the io array, and pass it to fbtl
             *********************************************************/
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            /* merge entries that are adjacent in the file into one request */
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif
            /**********************************************************
             ******************** DONE READING ************************
             *********************************************************/

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            /* replace the file offsets in displs_per_process by the offsets
               of the same data inside global_buf */
            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
            for (i=0;i<fh->f_procs_per_group;i++){
                send_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &sendtype[i]);
                    ompi_datatype_commit(&sendtype[i]);
                    ret = MCA_PML_CALL (isend(global_buf,
                                              1,
                                              sendtype[i],
                                              fh->f_procs_in_group[i],
                                              123,
                                              MCA_PML_BASE_SEND_STANDARD,
                                              fh->f_comm,
                                              &send_req[i]));
                    if(OMPI_SUCCESS != ret){
                        goto exit;
                    }
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_rcomm_time = MPI_Wtime();
            rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
        }

        /**********************************************************
         *** 7f.  Scatter the Data from the readers
         *********************************************************/
        if ( recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contiguous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_received,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

        if (my_aggregator == fh->f_rank){
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        }

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        position += bytes_received;

        /* If data is not contiguous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!recvbuf_is_contiguous ) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
    } /* end for (index=0; index < cycles; index ++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_gen2_num_io_procs;
    if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
        fh->f_register_print_entry(READ_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    /* receive_buf points into the user buffer in the contiguous case and
       must not be freed then */
    if (!recvbuf_is_contiguous) {
        if (NULL != receive_buf) {
            free (receive_buf);
            receive_buf = NULL;
        }
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }
    /* still set only when the allgather in section 2 failed */
    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }
    if (my_aggregator == fh->f_rank) {

        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements= NULL;
        }
        if (NULL != sendtype){
            for (i = 0; i < fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                    ompi_datatype_destroy(&sendtype[i]);
                }
            }
            free(sendtype);
            sendtype=NULL;
        }

        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }

        if ( NULL != blocklen_per_process){
            for(l=0;l<fh->f_procs_per_group;l++){
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }

            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }

        if (NULL != displs_per_process){
            /* the loop must be driven by l, the index it increments */
            for (l=0; l<fh->f_procs_per_group; l++){
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if ( NULL != send_req ) {
            free ( send_req );
            send_req = NULL;
        }
    }
    return ret;
}
/*
 *	allreduce_inter
 *
 *	Function:	- allreduce on an inter-communicator, built from other
 *			  collectives: reduce inside the local group, exchange
 *			  of the partial results between the two roots, then
 *			  bcast inside the local group
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count,
                               struct ompi_datatype_t *dtype,
                               struct ompi_op_t *op,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int local_root = 0;
    ompi_request_t *xchg[2];
    ptrdiff_t lb_gap;
    ptrdiff_t buf_span;
    char *scratch = NULL;
    char *reduced = NULL;
    int status;
    int me;

    me = ompi_comm_rank(comm);

    /* Scratch space for the locally reduced data; the pointer handed to the
       collectives is shifted by the gap reported by opal_datatype_span(). */
    buf_span = opal_datatype_span(&dtype->super, count, &lb_gap);
    scratch = (char *) malloc(buf_span);
    if (NULL == scratch) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    reduced = scratch - lb_gap;

    /* Step 1: reduce within the local group onto the local root. */
    status = comm->c_local_comm->c_coll->coll_reduce(sbuf, reduced, count,
                                                     dtype, op, local_root,
                                                     comm->c_local_comm,
                                                     comm->c_local_comm->c_coll->coll_reduce_module);

    /* Step 2: the two roots swap their results; the receive is posted
       before the send and both are non-blocking, which avoids deadlock. */
    if (OMPI_SUCCESS == status && me == local_root) {
        status = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    &(xchg[0])));
        if (OMPI_SUCCESS == status) {
            status = MCA_PML_CALL(isend(reduced, count, dtype, 0,
                                        MCA_COLL_BASE_TAG_ALLREDUCE,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm, &(xchg[1])));
        }
        if (OMPI_SUCCESS == status) {
            status = ompi_request_wait_all(2, xchg, MPI_STATUSES_IGNORE);
        }
    }

    /* Step 3: bcast the received result to all the local processes. */
    if (OMPI_SUCCESS == status) {
        status = comm->c_local_comm->c_coll->coll_bcast(rbuf, count, dtype, local_root,
                                                        comm->c_local_comm,
                                                        comm->c_local_comm->c_coll->coll_bcast_module);
    }

    free(scratch);
    return status;
}