int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int fanout, int max_outstanding_reqs ) { int segcount = count; size_t typelng; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize)); COLL_BASE_UPDATE_CHAIN( comm, base_module, root, fanout ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_chain, segcount, max_outstanding_reqs ); }
int mca_sharedfp_addproc_read_ordered (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { int ret = OMPI_SUCCESS; OMPI_MPI_OFFSET_TYPE offset = 0, offsetReceived = 0; long sendBuff = 0; long *buff=NULL; long offsetBuff, bytesRequested = 0; size_t numofBytes; int rank, size, i; struct mca_sharedfp_base_data_t *sh = NULL; if(NULL == fh->f_sharedfp_data){ opal_output(0, "sharedfp_addproc_read_ordered: shared file pointer " "structure not initialized correctly\n"); return OMPI_ERROR; } /*Retrieve the new communicator*/ sh = fh->f_sharedfp_data; /* Calculate the number of bytes to read*/ opal_datatype_type_size ( &datatype->super, &numofBytes); sendBuff = count * numofBytes; /* Get the ranks in the communicator */ rank = ompi_comm_rank ( sh->comm); size = ompi_comm_size ( sh->comm); if ( 0 == rank ) { buff = (long*)malloc(sizeof(OMPI_MPI_OFFSET_TYPE) * size); if ( NULL == buff ) return OMPI_ERR_OUT_OF_RESOURCE; } ret = sh->comm->c_coll.coll_gather( &sendBuff, 1, OMPI_OFFSET_DATATYPE, buff, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_gather_module); if ( OMPI_SUCCESS != ret ) { goto exit; } /* All the counts are present now in the recvBuff. The size of recvBuff is sizeof_newComm */ if ( 0 == rank ) { for (i = 0; i < size ; i ++) { if ( mca_sharedfp_addproc_verbose ){ opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_addproc_read_ordered: Buff is %ld\n",buff[i]); } bytesRequested += buff[i]; if ( mca_sharedfp_addproc_verbose ){ opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_addproc_read_ordered: Bytes requested are %ld\n", bytesRequested); } } /* Request the offset to read bytesRequested bytes ** only the root process needs to do the request, ** since the root process will then tell the other ** processes at what offset they should read their ** share of the data. */ ret = mca_sharedfp_addproc_request_position(sh,bytesRequested,&offsetReceived); if( OMPI_SUCCESS != ret ){ goto exit; } if ( mca_sharedfp_addproc_verbose ){ opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_addproc_read_ordered: Offset received is %lld\n", offsetReceived); } buff[0] += offsetReceived; for (i = 1 ; i < size; i++) { buff[i] += buff[i-1]; } } /* Scatter the results to the other processes*/ ret = sh->comm->c_coll.coll_scatter ( buff, 1, OMPI_OFFSET_DATATYPE, &offsetBuff, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_scatter_module ); if ( OMPI_SUCCESS != ret ) { goto exit; } /*Each process now has its own individual offset in recvBUFF*/ offset = offsetBuff - sendBuff; offset /= sh->sharedfh->f_etype_size; if ( mca_sharedfp_addproc_verbose ){ opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_addproc_read_ordered: Offset returned is %lld\n",offset); } /* read from the file */ ret = mca_common_ompio_file_read_at_all(sh->sharedfh,offset,buf,count,datatype,status); exit: if ( NULL != buff ) { free ( buff ); } return ret; }
int mca_sharedfp_individual_write_ordered_begin(mca_io_ompio_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype) { int ret = OMPI_SUCCESS; int size = 0, rank = 0; int i = 0; size_t numofbytes = 0; size_t totalbytes = 0; OMPI_MPI_OFFSET_TYPE *offbuff=NULL; OMPI_MPI_OFFSET_TYPE global_offset = 0; OMPI_MPI_OFFSET_TYPE prev_offset = 0; OMPI_MPI_OFFSET_TYPE temp = 0, offset = 0; mca_sharedfp_individual_header_record *headnode = NULL; struct mca_sharedfp_base_data_t *sh = NULL; mca_sharedfp_base_module_t * shared_fp_base_module = NULL; if(fh->f_sharedfp_data==NULL){ if ( mca_sharedfp_individual_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_individual_write_ordered_begin - opening the shared file pointer\n"); } shared_fp_base_module = fh->f_sharedfp; ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh); if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_individual_write_ordered_begin - error opening the shared file pointer\n"); return ret; } } if ( true == fh->f_split_coll_in_use ) { opal_output(0, "Only one split collective I/O operation allowed per file handle at any given point in time!\n"); return MPI_ERR_REQUEST; } /*Retrieve the sharedfp data structures*/ sh = fh->f_sharedfp_data; rank = ompi_comm_rank ( sh->comm ); size = ompi_comm_size ( sh->comm ); /* Calculate the number of bytes of data that needs to be written*/ opal_datatype_type_size ( &datatype->super, &numofbytes); totalbytes = count * numofbytes; headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data; if ( NULL == headnode) { opal_output (0, "sharedfp_individual_write_ordered_begin: headnode is NULL but file is open\n"); return OMPI_ERROR; } /* Data from all the metadata is combined and written to the main file */ ret = mca_sharedfp_individual_collaborate_data ( sh ); if ( OMPI_SUCCESS != ret) { return ret; } if ( 0 == rank ) { offbuff = (OMPI_MPI_OFFSET_TYPE *)malloc ( sizeof(OMPI_MPI_OFFSET_TYPE) * size); if (NULL == offbuff ) { return OMPI_ERR_OUT_OF_RESOURCE; } } /*collect the total bytes to be written*/ sh->comm->c_coll.coll_gather ( &totalbytes, 1, OMPI_OFFSET_DATATYPE, offbuff, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_gather_module ); if ( 0 == rank ) { prev_offset = offbuff[0]; offbuff[0] = sh->global_offset; for (i = 1; i < size ; i++){ temp = offbuff[i]; offbuff[i] = offbuff[i - 1] + prev_offset; prev_offset = temp; } for (i = 0; i < size; i++){ global_offset = offbuff[size - 1] + prev_offset; } } /* Scatter the results to the other processes */ ret = sh->comm->c_coll.coll_scatter ( offbuff, 1, OMPI_OFFSET_DATATYPE, &offset, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_scatter_module ); if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_individual_write_ordered_begin: Error in scattering offsets \n"); goto exit; } ret = sh->comm->c_coll.coll_bcast ( &global_offset, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_bcast_module ); if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_individual_write_ordered_begin: Error while bcasting global offset \n"); goto exit; } sh->global_offset = global_offset; /*use file_write_at_all to ensure the order*/ ret = mca_common_ompio_file_iwrite_at_all(sh->sharedfh,offset, buf,count,datatype, &fh->f_split_coll_req); fh->f_split_coll_in_use = true; if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_individual_write_ordered_begin: Error while writing the datafile \n"); } exit: if ( NULL != offbuff ) { free ( offbuff); } return ret; }
/* linear iscan * working principle: * 1. each node (but node 0) receives from left neighbor * 2. performs op * 3. all but rank p-1 do sends to it's right neighbor and exits * */ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { int rank, p, res; ptrdiff_t gap, span; NBC_Schedule *schedule; void *tmpbuf = NULL; char inplace; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; NBC_IN_PLACE(sendbuf, recvbuf, inplace); rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; search.recvbuf = recvbuf; search.count = count; search.datatype = datatype; search.op = op; found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &search); if (NULL == found) { #endif schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { return OMPI_ERR_OUT_OF_RESOURCE; } if (!inplace) { /* copy data to receivebuf */ res = NBC_Sched_copy ((void *)sendbuf, false, count, datatype, recvbuf, false, count, datatype, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } if(rank != 0) { span = opal_datatype_span(&datatype->super, count, &gap); tmpbuf = malloc (span); if (NULL == tmpbuf) { OBJ_RELEASE(schedule); return OMPI_ERR_OUT_OF_RESOURCE; } /* we have to wait until we have the data */ res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); return res; } /* perform the reduce in my local buffer */ /* this cannot be done until tmpbuf is unused :-( so barrier after the op */ res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); return res; } } if (rank != p-1) { res = NBC_Sched_send (recvbuf, false, count, datatype, rank+1, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); return res; } } res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); return res; } #ifdef NBC_CACHE_SCHEDULE /* save schedule to tree */ args = (NBC_Scan_args *) malloc (sizeof (args)); if (NULL != args) { args->sendbuf = sendbuf; args->recvbuf = recvbuf; args->count = count; args->datatype = datatype; args->op = op; args->schedule = schedule; res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], args, args, 0); if (0 == res) { OBJ_RETAIN(schedule); /* increase number of elements for A2A */ if (++libnbc_module->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) { NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &libnbc_module->NBC_Dict_size[NBC_SCAN]); } } else { NBC_Error("error in dict_insert() (%i)", res); free (args); } } } else { /* found schedule */ schedule = found->schedule; OBJ_RETAIN(schedule); } #endif res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); return res; } return OMPI_SUCCESS; }
/* * Linear functions are copied from the basic coll module. For * some small number of nodes and/or small data sizes they are just as * fast as tuned/tree based segmenting operations and as such may be * selected by the decision functions. These are copied into this module * due to the way we select modules in V1. i.e. in V2 we will handle this * differently and so will not have to duplicate code. * GEF Oct05 after asking Jeff. */ int ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, err, nreqs; char *psnd, *prcv; ptrdiff_t sext, rext; MPI_Request *preq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoallv_intra_basic_linear rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); /* Simple optimization - handle send to self first */ psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext; if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { return err; } } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Now, initiate all send/recv to/from others. */ nreqs = 0; preq = data->mcct_reqs; /* Post all receives first */ for (i = 0; i < size; ++i) { if (i == rank || 0 == rcounts[i]) { continue; } prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Now post all sends */ for (i = 0; i < size; ++i) { if (i == rank || 0 == scounts[i]) { continue; } psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, data->mcct_reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, data->mcct_reqs, MPI_STATUSES_IGNORE); /* Free the requests. */ ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; }
/* simple linear Alltoallw */ int ompi_coll_libnbc_ialltoallw(void* sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtypes[], void* recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtypes[], struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; NBC_Schedule *schedule; char *rbuf, *sbuf, inplace; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; NBC_IN_PLACE(sendbuf, recvbuf, inplace); rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); /* copy data to receivbuffer */ if ((sendcounts[rank] != 0) && !inplace) { rbuf = (char *) recvbuf + rdispls[rank]; sbuf = (char *) sendbuf + sdispls[rank]; res = NBC_Copy(sbuf, sendcounts[rank], sendtypes[rank], rbuf, recvcounts[rank], recvtypes[rank], comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } } schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { return OMPI_ERR_OUT_OF_RESOURCE; } for (int i = 0; i < p; i++) { if (i == rank) { continue; } /* post all sends */ if (sendcounts[i] != 0) { sbuf = (char *) sendbuf + sdispls[i]; res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtypes[i], i, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } /* post all receives */ if (recvcounts[i] != 0) { rbuf = (char *) recvbuf + rdispls[i]; res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtypes[i], i, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } } res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } res = NBC_Init_handle (comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } res = NBC_Start (handle, schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } *request = (ompi_request_t *) handle; return OMPI_SUCCESS; }
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, adjsize, err, line, mask, remote; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); /* do nearest power of 2 less than size calc */ adjsize = opal_next_poweroftwo(size); adjsize >>= 1; /* if size is not exact power of two, perform an extra step */ if (adjsize != size) { if (rank >= adjsize) { /* send message to lower ranked node */ remote = rank - adjsize; err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, remote, MCA_COLL_BASE_TAG_BARRIER, comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } else if (rank < (size - adjsize)) { /* receive message from high level rank */ err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } } /* exchange messages */ if ( rank < adjsize ) { mask = 0x1; while ( mask < adjsize ) { remote = rank ^ mask; mask <<= 1; if (remote >= adjsize) continue; /* post receive from the remote node */ err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, remote, MCA_COLL_BASE_TAG_BARRIER, comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } } /* non-power of 2 case */ if (adjsize != size) { if (rank < (size - adjsize)) { /* send enter message to higher ranked node */ remote = rank + adjsize; err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } } return MPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
/* * reduce_scatter_intra_basic_recursivehalving * * Function: - reduce scatter implementation using recursive-halving * algorithm * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * Limitation: - Works only for commutative operations. */ int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; int tmp_size, remain = 0, tmp_rank, *disps = NULL; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank)); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; } count = disps[size - 1] + rcounts[size - 1]; /* short cut the trivial case */ if (0 == count) { free(disps); return OMPI_SUCCESS; } /* get datatype information */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (ptrdiff_t)(count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } /* Allocate temporary receive buffer. */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - true_lb; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - true_lb; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ tmp_size = opal_next_poweroftwo (size); tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } tmp_disps = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { free(tmp_rcounts); err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; struct ompi_request_t *request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (recv_count > 0) { err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, recv_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &request)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (send_count > 0) { err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } ompi_op_reduce(op, recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, recv_count, dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } free(tmp_rcounts); free(tmp_disps); }
/******************************************************************************* * ompi_coll_base_reduce_scatter_intra_nonoverlapping * * This function just calls a reduce to rank 0, followed by an * appropriate scatterv call. */ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err, i, rank, size, total_count, *displs = NULL; const int root = 0; char *tmprbuf = NULL, *tmprbuf_free = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank)); for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; } /* Reduce to rank 0 (root) and scatterv */ tmprbuf = (char*) rbuf; if (MPI_IN_PLACE == sbuf) { /* rbuf on root (0) is big enough to hold whole data */ if (root == rank) { err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } else { err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } } else { if (root == rank) { /* We must allocate temporary receive buffer on root to ensure that rbuf is big enough */ ptrdiff_t lb, extent, tlb, textent; ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &tlb, &textent); tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); tmprbuf = tmprbuf_free - lb; } err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } if (MPI_SUCCESS != err) { if (NULL != tmprbuf_free) free(tmprbuf_free); return err; } displs = (int*) malloc(size * sizeof(int)); displs[0] = 0; for (i = 1; i < size; i++) { displs[i] = displs[i-1] + rcounts[i-1]; } err = comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype, rbuf, rcounts[rank], dtype, root, comm, comm->c_coll.coll_scatterv_module); free(displs); if (NULL != tmprbuf_free) free(tmprbuf_free); return err; }
/* * allgatherv_intra * * Function: - allgatherv using other MPI collectives * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_allgatherv_intra(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank ; int err; MPI_Aint extent; MPI_Aint lb; char *send_buf = NULL; struct ompi_datatype_t *newtype, *send_type; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* * We don't have a root process defined. Arbitrarily assign root * to process with rank 0 (OMPI convention) */ if (MPI_IN_PLACE == sbuf) { ompi_datatype_get_extent(rdtype, &lb, &extent); send_type = rdtype; send_buf = (char*)rbuf; for (i = 0; i < rank; ++i) { send_buf += (rcounts[i] * extent); } } else { send_buf = (char*)sbuf; send_type = sdtype; } err = comm->c_coll.coll_gatherv(send_buf, rcounts[rank], send_type,rbuf, rcounts, disps, rdtype, 0, comm, comm->c_coll.coll_gatherv_module); if (MPI_SUCCESS != err) { return err; } /* * we now have all the data in the root's rbuf. Need to * broadcast the data out to the other processes * * Need to define a datatype that captures the different vectors * from each process. MPI_TYPE_INDEXED with params * size,rcount,displs,rdtype,newtype * should do the trick. * Use underlying ddt functions to create, and commit the * new datatype on each process, then broadcast and destroy the * datatype. */ err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&newtype); if (MPI_SUCCESS != err) { return err; } err = ompi_datatype_commit(&newtype); if(MPI_SUCCESS != err) { return err; } err = comm->c_coll.coll_bcast( rbuf, 1 ,newtype,0,comm, comm->c_coll.coll_bcast_module); ompi_datatype_destroy (&newtype); return err; }
int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm, const char* filename, int amode, struct opal_info_t *info, mca_io_ompio_file_t *fh) { int err = MPI_SUCCESS; char * lockedfilename; int handle, rank; struct mca_sharedfp_lockedfile_data * module_data = NULL; struct mca_sharedfp_base_data_t* sh; mca_io_ompio_file_t * shfileHandle, *ompio_fh; mca_io_ompio_data_t *data; /*------------------------------------------------------------*/ /*Open the same file again without shared file pointer support*/ /*------------------------------------------------------------*/ shfileHandle = (mca_io_ompio_file_t *)malloc(sizeof(mca_io_ompio_file_t)); err = mca_common_ompio_file_open(comm,filename,amode,info,shfileHandle,false); if ( OMPI_SUCCESS != err) { opal_output(0, "mca_sharedfp_lockedfile_file_open: Error during file open\n"); return err; } shfileHandle->f_fh = fh->f_fh; data = (mca_io_ompio_data_t *) fh->f_fh->f_io_selected_data; ompio_fh = &data->ompio_fh; err = mca_common_ompio_set_view (shfileHandle, ompio_fh->f_disp, ompio_fh->f_etype, ompio_fh->f_orig_filetype, ompio_fh->f_datarep, &(MPI_INFO_NULL->super)); /*Memory is allocated here for the sh structure*/ sh = (struct mca_sharedfp_base_data_t*)malloc(sizeof(struct mca_sharedfp_base_data_t)); if ( NULL == sh){ opal_output(0, "mca_sharedfp_lockedfile_file_open: Error, unable to malloc f_sharedfp_ptr struct\n"); free ( shfileHandle); return OMPI_ERR_OUT_OF_RESOURCE; } /*Populate the sh file structure based on the implementation*/ sh->sharedfh = shfileHandle; /* Shared file pointer*/ sh->global_offset = 0; /* Global Offset*/ sh->comm = comm; /* Communicator*/ sh->selected_module_data = NULL; rank = ompi_comm_rank ( sh->comm); /*Open a new file which will maintain the pointer for this file open*/ if ( mca_sharedfp_lockedfile_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "mca_sharedfp_lockedfile_file_open: open locked file.\n"); } module_data = (struct mca_sharedfp_lockedfile_data*)malloc(sizeof(struct mca_sharedfp_lockedfile_data)); if ( NULL == module_data ) { opal_output(ompi_sharedfp_base_framework.framework_output, "mca_sharedfp_lockedfile_file_open: Error, unable to malloc lockedfile_data struct\n"); free (shfileHandle); free (sh); return OMPI_ERR_OUT_OF_RESOURCE; } opal_jobid_t masterjobid; if ( 0 == comm->c_my_rank ) { ompi_proc_t *masterproc = ompi_group_peer_lookup(comm->c_local_group, 0 ); masterjobid = OMPI_CAST_RTE_NAME(&masterproc->super.proc_name)->jobid; } comm->c_coll->coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm, comm->c_coll->coll_bcast_module ); size_t filenamelen = strlen(filename) + 16; lockedfilename = (char*)malloc(sizeof(char) * filenamelen); if ( NULL == lockedfilename ) { free (shfileHandle); free (sh); free (module_data); return OMPI_ERR_OUT_OF_RESOURCE; } snprintf(lockedfilename, filenamelen, "%s-%u%s",filename,masterjobid,".lock"); module_data->filename = lockedfilename; /*-------------------------------------------------*/ /*Open the lockedfile without shared file pointer */ /*-------------------------------------------------*/ if ( 0 == rank ) { OMPI_MPI_OFFSET_TYPE position=0; /*only let main process initialize file pointer, *therefore there is no need to lock the file */ handle = open ( lockedfilename, O_RDWR | O_CREAT, 0644 ); write ( handle, &position, sizeof(OMPI_MPI_OFFSET_TYPE) ); close ( handle ); } comm->c_coll->coll_barrier ( comm, comm->c_coll->coll_barrier_module ); handle = open ( lockedfilename, O_RDWR, 0644 ); if ( -1 == handle ) { opal_output(0, "[%d]mca_sharedfp_lockedfile_file_open: Error during file open\n", rank); free (shfileHandle); free (sh); free(module_data); return OMPI_ERROR; } /*Store the new file handle*/ module_data->handle = handle; /* Assign the lockedfile_data to sh->handle*/ sh->selected_module_data = module_data; /*remember the shared file handle*/ fh->f_sharedfp_data = sh; comm->c_coll->coll_barrier ( comm, comm->c_coll->coll_barrier_module ); return err; }
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t extent, dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; char *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } dsize = opal_datatype_span(&dtype->super, count, &gap); ompi_datatype_type_extent(dtype, &extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(dsize); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - gap; } if (size > 1) { free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { if (NULL != inplace_temp) { free(inplace_temp); } return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - gap; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
/* * reduce_intra_in_order_binary * * Function: Logarithmic reduce operation for non-commutative operations. * Acecpts: same as MPI_Reduce() * Returns: MPI_SUCCESS or error code */ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int max_outstanding_reqs ) { int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL; void *use_this_recvbuf = NULL; size_t typelng; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize)); COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); /* An in-order binary tree must use root (size-1) to preserve the order of operations. Thus, if root is not rank (size - 1), then we must handle 1. MPI_IN_PLACE option on real root, and 2. we must allocate temporary recvbuf on rank (size - 1). Note that generic function must be careful not to switch order of operations for non-commutative ops. */ io_root = size - 1; use_this_sendbuf = (void *)sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t dsize, gap; char *tmpbuf = NULL; dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } use_this_recvbuf = tmpbuf; } } /* Use generic reduce with in-order binary tree topology and io_root */ ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, op, io_root, comm, module, data->cached_in_order_bintree, segcount, max_outstanding_reqs ); if (MPI_SUCCESS != ret) { return ret; } /* Clean up */ if (io_root != root) { if (root == rank) { /* Receive result from rank io_root to recvbuf */ ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { return ret; } if (MPI_IN_PLACE == sendbuf) { free(use_this_sendbuf); } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } free(use_this_recvbuf); } } return MPI_SUCCESS; }
/** * This is a generic implementation of the reduce protocol. It used the tree * provided as an argument and execute all operations using a segment of * count times a datatype. * For the last communication it will update the count in order to limit * the number of datatype to the original count (original_count) * * Note that for non-commutative operations we cannot save memory copy * for the first block: thus we must copy sendbuf to accumbuf on intermediate * to keep the optimized loop happy. */ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, size, gap, segment_increment; ompi_request_t **sreq = NULL, *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_extent( datatype, &extent ); num_segments = (int)(((size_t)original_count + (size_t)count_by_segment - (size_t)1) / (size_t)count_by_segment); segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; if( sendbuf == MPI_IN_PLACE ) { sendtmpbuf = (char *)recvbuf; } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t real_segment_size; /* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ size = opal_datatype_span(&datatype->super, original_count, &gap); accumbuf_free = (char*)malloc(size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - gap; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - gap; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - gap; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */ /* wait on data from last child for previous segment */ ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ ompi_op_reduce(op, local_op_buffer, accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ ret = MCA_PML_CALL( send( accumulator, prevcount, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } ret = MCA_PML_CALL( send((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. */ else { int creq = 0; sreq = coll_base_comm_get_reqs(module->base_data, max_outstanding_reqs); if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; } /* post first group of requests */ for (segindex = 0; segindex < max_outstanding_reqs; segindex++) { ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } original_count -= count_by_segment; } creq = 0; while ( original_count > 0 ) { /* wait on a posted request to complete */ ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } if( original_count < count_by_segment ) { count_by_segment = original_count; } ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; segindex++; original_count -= count_by_segment; } /* Wait on the remaining request to complete */ ret = ompi_request_wait_all( max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } } return OMPI_SUCCESS; error_hndl: /* error handler */ OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf); if( NULL != sreq ) { ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); } return ret; }
/* * file_open_plfs * * Function: - opens a new file * Accepts: - same arguments as MPI_File_open() * Returns: - Success if new file handle */ int mca_fs_plfs_file_open (struct ompi_communicator_t *comm, const char* filename, int access_mode, struct ompi_info_t *info, mca_io_ompio_file_t *fh) { int rank; int amode; int old_mask, perm; plfs_error_t plfs_ret; Plfs_fd *pfd = NULL; char wpath[1024]; size_t len = sizeof(int); char key[] = "num_hostdirs"; rank = ompi_comm_rank ( comm ); getcwd( wpath, sizeof(wpath) ); sprintf( wpath,"%s/%s",wpath,filename ); if (OMPIO_PERM_NULL == fh->f_perm) { old_mask = umask(022); umask(old_mask); perm = old_mask ^ 0666; } else { perm = fh->f_perm; } amode = 0; if (access_mode & MPI_MODE_RDONLY) amode = amode | O_RDONLY; if (access_mode & MPI_MODE_WRONLY) amode = amode | O_WRONLY; if (access_mode & MPI_MODE_RDWR) amode = amode | O_RDWR; if (access_mode & MPI_MODE_EXCL) { if( is_plfs_path(wpath) == 1 ) { //the file already exists return OMPI_ERROR; } } if (0 == rank) { /* MODE_CREATE and MODE_EXCL can only be set by one process */ if (access_mode & MPI_MODE_CREATE) amode = amode | O_CREAT; plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL ); fh->f_fs_ptr = pfd; } comm->c_coll.coll_bcast ( &plfs_ret, 1, MPI_INT, 0, comm, comm->c_coll.coll_bcast_module); if ( PLFS_SUCCESS != plfs_ret ) { return OMPI_ERROR; } if (0 != rank) { plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL ); if (PLFS_SUCCESS != plfs_ret) { opal_output(0, "fs_plfs_file_open: Error in plfs_open:\n%s\n", strplfserr(plfs_ret)); return OMPI_ERROR; } else { fh->f_fs_ptr = pfd; } } if (mca_fs_plfs_num_hostdir > 0) { plfs_ret = plfs_setxattr( pfd, &mca_fs_plfs_num_hostdir, key, len ); if (PLFS_SUCCESS != plfs_ret) { opal_output(0, "fs_plfs_file_open: Error in plfs_setxattr:\n%s\n", strplfserr(plfs_ret)); return OMPI_ERROR; } } return OMPI_SUCCESS; }
static int mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module; const mca_topo_base_comm_cart_2_1_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); int rc = MPI_SUCCESS, dim, i, nreqs; ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs; ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); /* post receives first */ for (dim = 0, nreqs = 0, i = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } if (MPI_PROC_NULL != drank) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } if (MPI_PROC_NULL != drank) { rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } return ompi_request_wait_all (nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); }
int mca_sharedfp_sm_seek (mca_io_ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE offset, int whence) { int rank, status=0; OMPI_MPI_OFFSET_TYPE end_position=0; int ret = OMPI_SUCCESS; struct mca_sharedfp_base_data_t *sh = NULL; mca_sharedfp_base_module_t * shared_fp_base_module = NULL; struct mca_sharedfp_sm_data * sm_data = NULL; struct mca_sharedfp_sm_offset * sm_offset_ptr = NULL; if( NULL == fh->f_sharedfp_data ) { if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: opening the shared file pointer\n"); } shared_fp_base_module = fh->f_sharedfp; ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh); if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_sm_seek - error opening the shared file pointer\n"); return ret; } } sh = fh->f_sharedfp_data; rank = ompi_comm_rank ( sh->comm ); if( 0 == rank ){ if ( MPI_SEEK_SET == whence){ /*no nothing*/ if ( offset < 0){ opal_output(0,"sharedfp_sm_seek - MPI_SEEK_SET, offset must be > 0, got offset=%lld.\n",offset); ret = -1; } if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: MPI_SEEK_SET new_offset=%lld\n",offset); } } else if( MPI_SEEK_CUR == whence){ OMPI_MPI_OFFSET_TYPE current_position; ret = mca_sharedfp_sm_get_position ( fh, ¤t_position); if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: MPI_SEEK_CUR: curr=%lld, offset=%lld, call status=%d\n", current_position,offset,status); } offset = current_position + offset; if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: MPI_SEEK_CUR: new_offset=%lld\n",offset); } if(offset < 0){ opal_output(0,"sharedfp_sm_seek - MPI_SEEK_CURE, offset must be > 0, got offset=%lld.\n",offset); ret = -1; } } else if( MPI_SEEK_END == whence){ end_position=0; mca_common_ompio_file_get_size(sh->sharedfh,&end_position); offset = end_position + offset; if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: MPI_SEEK_END: file_get_size=%lld\n",end_position); } if(offset < 0){ opal_output(0,"sharedfp_sm_seek - MPI_SEEK_CUR, offset must be > 0, got offset=%lld.\n",offset); ret = -1; } } else { opal_output(0,"sharedfp_sm_seek - whence=%i is not supported\n",whence); ret = -1; } /*-----------------------------------------------------*/ /* Set Shared file pointer */ /*-----------------------------------------------------*/ sm_data = sh->selected_module_data; sm_offset_ptr = sm_data->sm_offset_ptr; /*-------------------*/ /*lock the file */ /*--------------------*/ if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: Aquiring lock, rank=%d...",rank); fflush(stdout); } /* Aquire an exclusive lock */ sm_offset_ptr = sm_data->sm_offset_ptr; sem_wait(sm_data->mutex); if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: Success! Acquired sm lock.for rank=%d\n",rank); } sm_offset_ptr->offset=offset; if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_seek: Releasing sm lock...rank=%d",rank); fflush(stdout); } sem_post(sm_data->mutex); } /* since we are only letting process 0, update the current pointer * all of the other processes need to wait before proceeding. */ sh->comm->c_coll.coll_barrier ( sh->comm, sh->comm->c_coll.coll_barrier_module ); return ret; }
static int my_rank (rte_grp_handle_t grp_h ) { return ompi_comm_rank((ompi_communicator_t *)grp_h); }
/* * Simple double ring version of barrier * * synchronous gurantee made by last ring of sends are synchronous * */ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, err = 0, line = 0, left, right; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); left = ((rank-1)%size); right = ((rank+1)%size); if (rank > 0) { /* receive message from the left */ err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* Send message to the right */ err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* root needs to receive from the last node */ if (rank == 0) { err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* Allow nodes to exit */ if (rank > 0) { /* post Receive from left */ err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* send message to the right one */ err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* rank 0 post receive from the last node */ if (rank == 0) { err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } return MPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
/* * gather_intra_linear_sync * * Function: - synchronized gather operation with * Accepts: - same arguments as MPI_Gather(), first segment size * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int first_segment_size) { int i, ret, line, rank, size, first_segment_count; ompi_request_t **reqs = NULL; MPI_Aint extent, lb; size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); if (rank != root) { /* Non-root processes: - receive zero byte message from the root, - send the first segment of the data synchronously, - send the second segment of the data. */ ompi_datatype_type_size(sdtype, &typelng); ompi_datatype_get_extent(sdtype, &lb, &extent); first_segment_count = scount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng, first_segment_count ); ret = MCA_PML_CALL(recv(rbuf, 0, MPI_BYTE, root, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } else { /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message - post irecv for the second segment of the message - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. */ char *ptmp; ompi_request_t *first_segment_req; reqs = coll_base_comm_get_reqs(module->base_data, size); if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } ompi_datatype_type_size(rdtype, &typelng); ompi_datatype_get_extent(rdtype, &lb, &extent); first_segment_count = rcount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); ptmp = (char *) rbuf; for (i = 0; i < size; ++i) { if (i == rank) { /* skip myself */ reqs[i] = MPI_REQUEST_NULL; continue; } /* irecv for the first segment from i */ ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent; ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &first_segment_req)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* send sync message */ ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* irecv for the second segment */ ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent; ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count), rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &reqs[i])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* wait on the first segment to complete */ ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* wait all second segments to complete */ ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* All done */ return MPI_SUCCESS; error_hndl: if (NULL != reqs) { ompi_coll_base_free_reqs(reqs, size); } OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning return ret; }
int mca_sharedfp_sm_read_ordered_begin(mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype) { int ret = OMPI_SUCCESS; OMPI_MPI_OFFSET_TYPE offset = 0; long sendBuff = 0; long *buff=NULL; long offsetBuff; OMPI_MPI_OFFSET_TYPE offsetReceived = 0; long bytesRequested = 0; int recvcnt = 1, sendcnt = 1; size_t numofBytes; int rank, size, i; struct mca_sharedfp_base_data_t *sh = NULL; mca_sharedfp_base_module_t * shared_fp_base_module = NULL; if ( NULL == fh->f_sharedfp_data){ if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "sharedfp_sm_read_ordered_begin: opening the shared file pointer\n"); } shared_fp_base_module = fh->f_sharedfp; ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh); if ( OMPI_SUCCESS != ret ) { opal_output(0,"sharedfp_sm_read_ordered_begin - error opening the shared file pointer\n"); return ret; } } if ( true == fh->f_split_coll_in_use ) { opal_output(0,"Only one split collective I/O operation allowed per file handle at any given point in time!\n"); return MPI_ERR_REQUEST; } /*Retrieve the new communicator*/ sh = fh->f_sharedfp_data; /* Calculate the number of bytes to read*/ opal_datatype_type_size ( &datatype->super, &numofBytes); sendBuff = count * numofBytes; /* Get the ranks in the communicator */ rank = ompi_comm_rank ( sh->comm ); size = ompi_comm_size ( sh->comm ); if ( 0 == rank ) { buff = (long*)malloc(sizeof(long) * size); if ( NULL == buff ) return OMPI_ERR_OUT_OF_RESOURCE; } ret = sh->comm->c_coll.coll_gather ( &sendBuff, sendcnt, OMPI_OFFSET_DATATYPE, buff, recvcnt, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_gather_module ); if( OMPI_SUCCESS != ret){ goto exit; } /* All the counts are present now in the recvBuff. ** The size of recvBuff is sizeof_newComm */ if ( 0 == rank ) { for (i = 0; i < size ; i ++) { bytesRequested += buff[i]; if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "mca_sharedfp_sm_read_ordered_begin: Bytes requested are %ld\n", bytesRequested); } } /* Request the offset to read bytesRequested bytes ** only the root process needs to do the request, ** since the root process will then tell the other ** processes at what offset they should read their ** share of the data. */ ret = mca_sharedfp_sm_request_position(sh,bytesRequested,&offsetReceived); if( OMPI_SUCCESS != ret){ goto exit; } if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "mca_sharedfp_sm_read_ordered_begin: Offset received is %lld\n",offsetReceived); } buff[0] += offsetReceived; for (i = 1 ; i < size; i++) { buff[i] += buff[i-1]; } } /* Scatter the results to the other processes*/ ret = sh->comm->c_coll.coll_scatter ( buff, sendcnt, OMPI_OFFSET_DATATYPE, &offsetBuff, recvcnt, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_scatter_module ); if( OMPI_SUCCESS != ret){ goto exit; } /*Each process now has its own individual offset in recvBUFF*/ offset = offsetBuff - sendBuff; offset /= sh->sharedfh->f_etype_size; if ( mca_sharedfp_sm_verbose ) { opal_output(ompi_sharedfp_base_framework.framework_output, "mca_sharedfp_sm_read_ordered_begin: Offset returned is %lld\n",offset); } /* read to the file */ ret = ompio_io_ompio_file_iread_at_all(sh->sharedfh,offset,buf,count,datatype, &fh->f_split_coll_req); fh->f_split_coll_in_use = true; exit: if ( NULL != buff ) { free ( buff ); } return ret; }
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain, * gather_intra_pipeline, segmentation? */ int ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, i, rank, vrank, size, total_recv = 0, err; char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, sgap, ssize; MPI_Aint rextent, rgap, rsize; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_type_extent(sdtype, &sextent); ompi_datatype_type_extent(rdtype, &rextent); ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap); rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * size, &rgap); vrank = (rank - root + size) % size; if (rank == root) { if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; if (sbuf != MPI_IN_PLACE) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ tempbuf = (char *) malloc(rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rgap; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_recv = rcount; } else if (!(vrank % 2)) { /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ tempbuf = (char *) malloc(ssize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - sgap; /* local copy to tempbuf */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, scount, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* use sdtype,scount as rdtype,rdcount since they are ignored on * non-root procs */ rdtype = sdtype; rcount = scount; rextent = sextent; total_recv = rcount; } else { /* leaf nodes, no temp buffer needed, use sdtype,scount as * rdtype,rdcount since they are ignored on non-root procs */ ptmp = (char *) sbuf; total_recv = scount; } if (!(vrank % 2)) { /* all non-leaf nodes recv from children */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= rcount; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount)); err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, comm, &status)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } total_recv += mycount; } } if (rank != root) { /* all nodes except root send to parents */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv)); err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, bmtree->tree_prev, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } if (rank == root) { if (root != 0) { /* rotate received data on root if root != 0 */ err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } free(tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ free(tempbuf); } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); (void)line; // silence compiler warning return err; }
/* * scan * * Function: - basic scan operation * Accepts: - same arguments as MPI_Scan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If I'm rank 0, just copy into the receive buffer */ if (0 == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { return err; } } } /* Otherwise receive previous buffer and reduce. */ else { /* Allocate a temporary buffer. Rationale for this size is * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* Copy the send buffer into the receive buffer. */ if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } } /* Receive the prior answer */ err = MCA_PML_CALL(recv(pml_buffer, count, dtype, rank - 1, MCA_COLL_BASE_TAG_SCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the operation */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); /* All done */ if (NULL != free_buffer) { free(free_buffer); } } /* Send result to next process. */ if (rank < (size - 1)) { return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_SCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* All done */ return MPI_SUCCESS; }
int mca_common_ompio_file_open (ompi_communicator_t *comm, const char *filename, int amode, ompi_info_t *info, mca_io_ompio_file_t *ompio_fh, bool use_sharedfp) { int ret = OMPI_SUCCESS; int remote_arch; ompio_fh->f_iov_type = MPI_DATATYPE_NULL; ompio_fh->f_comm = MPI_COMM_NULL; if ( ((amode&MPI_MODE_RDONLY)?1:0) + ((amode&MPI_MODE_RDWR)?1:0) + ((amode&MPI_MODE_WRONLY)?1:0) != 1 ) { return MPI_ERR_AMODE; } if ((amode & MPI_MODE_RDONLY) && ((amode & MPI_MODE_CREATE) || (amode & MPI_MODE_EXCL))) { return MPI_ERR_AMODE; } if ((amode & MPI_MODE_RDWR) && (amode & MPI_MODE_SEQUENTIAL)) { return MPI_ERR_AMODE; } ompio_fh->f_rank = ompi_comm_rank (comm); ompio_fh->f_size = ompi_comm_size (comm); remote_arch = opal_local_arch; ompio_fh->f_convertor = opal_convertor_create (remote_arch, 0); if ( true == use_sharedfp ) { ret = ompi_comm_dup (comm, &ompio_fh->f_comm); if ( OMPI_SUCCESS != ret ) { goto fn_fail; } } else { /* No need to duplicate the communicator if the file_open is called from the sharedfp component, since the comm used as an input is already a dup of the user level comm. */ ompio_fh->f_flags |= OMPIO_SHAREDFP_IS_SET; ompio_fh->f_comm = comm; } ompio_fh->f_fstype = NONE; ompio_fh->f_amode = amode; ompio_fh->f_info = info; ompio_fh->f_atomicity = 0; ompi_io_ompio_set_file_defaults (ompio_fh); ompio_fh->f_filename = filename; ompio_fh->f_split_coll_req = NULL; ompio_fh->f_split_coll_in_use = false; /*Initialize the print_queues queues here!*/ mca_common_ompio_initialize_print_queue(&ompio_fh->f_coll_write_time); mca_common_ompio_initialize_print_queue(&ompio_fh->f_coll_read_time); /* set some function pointers required for fcoll, fbtls and sharedfp modules*/ ompio_fh->f_decode_datatype=ompi_io_ompio_decode_datatype; ompio_fh->f_generate_current_file_view=ompi_io_ompio_generate_current_file_view; ompio_fh->f_sort=ompi_io_ompio_sort; ompio_fh->f_sort_iovec=ompi_io_ompio_sort_iovec; ompio_fh->f_get_num_aggregators=mca_io_ompio_get_num_aggregators; ompio_fh->f_get_bytes_per_agg=mca_io_ompio_get_bytes_per_agg; ompio_fh->f_set_aggregator_props=mca_io_ompio_set_aggregator_props; /* This fix is needed for data seiving to work with two-phase collective I/O */ if ((amode & MPI_MODE_WRONLY)){ amode -= MPI_MODE_WRONLY; amode += MPI_MODE_RDWR; } /*--------------------------------------------------*/ if (OMPI_SUCCESS != (ret = mca_fs_base_file_select (ompio_fh, NULL))) { opal_output(1, "mca_fs_base_file_select() failed\n"); goto fn_fail; } if (OMPI_SUCCESS != (ret = mca_fbtl_base_file_select (ompio_fh, NULL))) { opal_output(1, "mca_fbtl_base_file_select() failed\n"); goto fn_fail; } if (OMPI_SUCCESS != (ret = mca_fcoll_base_file_select (ompio_fh, NULL))) { opal_output(1, "mca_fcoll_base_file_select() failed\n"); goto fn_fail; } ompio_fh->f_sharedfp_component = NULL; /*component*/ ompio_fh->f_sharedfp = NULL; /*module*/ ompio_fh->f_sharedfp_data = NULL; /*data*/ if ( true == use_sharedfp ) { if (OMPI_SUCCESS != (ret = mca_sharedfp_base_file_select (ompio_fh, NULL))) { opal_output ( ompi_io_base_framework.framework_output, "mca_sharedfp_base_file_select() failed\n"); ompio_fh->f_sharedfp = NULL; /*module*/ /* Its ok to not have a shared file pointer module as long as the shared file ** pointer operations are not used. However, the first call to any file_read/write_shared ** function will return an error code. */ } /* open the file once more for the shared file pointer if required. ** Per default, the shared file pointer specific actions are however ** only performed on first access of the shared file pointer, except ** for the addproc sharedfp component. ** ** Lazy open does not work for the addproc sharedfp ** component since it starts by spawning a process using MPI_Comm_spawn. ** For this, the first operation has to be collective which we can ** not guarantuee outside of the MPI_File_open operation. */ if ( NULL != ompio_fh->f_sharedfp && true == use_sharedfp && (!mca_io_ompio_sharedfp_lazy_open || !strcmp (ompio_fh->f_sharedfp_component->mca_component_name, "addproc") )) { ret = ompio_fh->f_sharedfp->sharedfp_file_open(comm, filename, amode, info, ompio_fh); if ( OMPI_SUCCESS != ret ) { goto fn_fail; } } } /*Determine topology information if set*/ if (ompio_fh->f_comm->c_flags & OMPI_COMM_CART){ ret = mca_io_ompio_cart_based_grouping(ompio_fh); if(OMPI_SUCCESS != ret ){ ret = MPI_ERR_FILE; } } ret = ompio_fh->f_fs->fs_file_open (comm, filename, amode, info, ompio_fh); if ( OMPI_SUCCESS != ret ) { ret = MPI_ERR_FILE; goto fn_fail; } /* If file has been opened in the append mode, move the internal file pointer of OMPIO to the very end of the file. */ if ( ompio_fh->f_amode & MPI_MODE_APPEND ) { OMPI_MPI_OFFSET_TYPE current_size; ompio_fh->f_fs->fs_file_get_size( ompio_fh, ¤t_size); mca_common_ompio_set_explicit_offset (ompio_fh, current_size); } return OMPI_SUCCESS; fn_fail: /* no need to free resources here, since the destructor * is calling mca_io_ompio_file_close, which actually gets *rid of all allocated memory items */ return ret; }
static int mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; size_t max_size, rdtype_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); ompi_datatype_type_size(rdtype, &rdtype_size); /* If only one process, we're done. */ if (1 == size || 0 == rdtype_size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { size_t size = ext * rcounts[i]; max_size = size > max_size ? size : max_size; } /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = tuned_module->tuned_data->mcct_reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && rcounts[i]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } /* Free the requests. */ mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2); } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); /* All done */ return err; }
/** * Shared memory broadcast. * * For the root, the general algorithm is to wait for a set of * segments to become available. Once it is, the root claims the set * by writing the current operation number and the number of processes * using the set to the flag. The root then loops over the set of * segments; for each segment, it copies a fragment of the user's * buffer into the shared data segment and then writes the data size * into its childrens' control buffers. The process is repeated until * all fragments have been written. * * For non-roots, for each set of buffers, they wait until the current * operation number appears in the in-use flag (i.e., written by the * root). Then for each segment, they wait for a nonzero to appear * into their control buffers. If they have children, they copy the * data from their parent's shared data segment into their shared data * segment, and write the data size into each of their childrens' * control buffers. They then copy the data from their shared [local] * data segment into the user's output buffer. The process is * repeated until all fragments have been received. If they do not * have children, they copy the data directly from the parent's shared * data segment into the user's output buffer. */ int mca_coll_sm_bcast_intra(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { struct iovec iov; mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module; mca_coll_sm_comm_t *data; int i, ret, rank, size, num_children, src_rank; int flag_num, segment_num, max_segment_num; int parent_rank; size_t total_size, max_data, bytes; mca_coll_sm_in_use_flag_t *flag; opal_convertor_t convertor; mca_coll_sm_tree_node_t *me, *parent, **children; mca_coll_sm_data_index_t *index; /* Lazily enable the module the first time we invoke a collective on it */ if (!sm_module->enabled) { if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) { return ret; } } data = sm_module->sm_comm_data; /* Setup some identities */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OBJ_CONSTRUCT(&convertor, opal_convertor_t); iov.iov_len = mca_coll_sm_component.sm_fragment_size; bytes = 0; me = &data->mcb_tree[(rank + size - root) % size]; parent = me->mcstn_parent; children = me->mcstn_children; num_children = me->mcstn_num_children; /* Only have one top-level decision as to whether I'm the root or not. Do this at the slight expense of repeating a little logic -- but it's better than a conditional branch in every loop iteration. */ /********************************************************************* * Root *********************************************************************/ if (root == rank) { /* The root needs a send convertor to pack from the user's buffer to shared memory */ if (OMPI_SUCCESS != (ret = opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &(datatype->super), count, buff, 0, &convertor))) { return ret; } opal_convertor_get_packed_size(&convertor, &total_size); /* Main loop over sending fragments */ do { flag_num = (data->mcb_operation_count++ % mca_coll_sm_component.sm_comm_num_in_use_flags); FLAG_SETUP(flag_num, flag, data); FLAG_WAIT_FOR_IDLE(flag, bcast_root_label); FLAG_RETAIN(flag, size - 1, data->mcb_operation_count - 1); /* Loop over all the segments in this set */ segment_num = flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag; max_segment_num = (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag; do { index = &(data->mcb_data_index[segment_num]); /* Copy the fragment from the user buffer to my fragment in the current segment */ max_data = mca_coll_sm_component.sm_fragment_size; COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data); bytes += max_data; /* Wait for the write to absolutely complete */ opal_atomic_wmb(); /* Tell my children that this fragment is ready */ PARENT_NOTIFY_CHILDREN(children, num_children, index, max_data); ++segment_num; } while (bytes < total_size && segment_num < max_segment_num); } while (bytes < total_size); } /********************************************************************* * Non-root *********************************************************************/ else { /* Non-root processes need a receive convertor to unpack from shared mmory to the user's buffer */ if (OMPI_SUCCESS != (ret = opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &(datatype->super), count, buff, 0, &convertor))) { return ret; } opal_convertor_get_packed_size(&convertor, &total_size); /* Loop over receiving (and possibly re-sending) the fragments */ do { flag_num = (data->mcb_operation_count % mca_coll_sm_component.sm_comm_num_in_use_flags); /* Wait for the root to mark this set of segments as ours */ FLAG_SETUP(flag_num, flag, data); FLAG_WAIT_FOR_OP(flag, data->mcb_operation_count, bcast_nonroot_label1); ++data->mcb_operation_count; /* Loop over all the segments in this set */ segment_num = flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag; max_segment_num = (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag; do { /* Pre-calculate some values */ parent_rank = (parent->mcstn_id + root) % size; index = &(data->mcb_data_index[segment_num]); /* Wait for my parent to tell me that the segment is ready */ CHILD_WAIT_FOR_NOTIFY(rank, index, max_data, bcast_nonroot_label2); /* If I have children, send the data to them */ if (num_children > 0) { /* Copy the fragment from the parent's portion in the segment to my portion in the segment. */ COPY_FRAGMENT_BETWEEN(parent_rank, rank, index, max_data); /* Wait for the write to absolutely complete */ opal_atomic_wmb(); /* Tell my children that this fragment is ready */ PARENT_NOTIFY_CHILDREN(children, num_children, index, max_data); /* Set the "copy from buffer" to be my local segment buffer so that we don't potentially incur a non-local memory copy from the parent's fan out data segment [again] when copying to the user's buffer */ src_rank = rank; } /* If I don't have any children, set the "copy from buffer" to be my parent's fan out segment to copy directly from my parent */ else { src_rank = parent_rank; } /* Copy to my output buffer */ COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data); bytes += max_data; ++segment_num; } while (bytes < total_size && segment_num < max_segment_num); /* Wait for all copy-out writes to complete before I say I'm done with the segments */ opal_atomic_wmb(); /* We're finished with this set of segments */ FLAG_RELEASE(flag); } while (bytes < total_size); } /* Kill the convertor */ OBJ_DESTRUCT(&convertor); /* All done */ return OMPI_SUCCESS; }
int ompi_coll_libnbc_igatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; MPI_Aint rcvext = 0; NBC_Schedule *schedule; char *rbuf, inplace; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; NBC_IN_PLACE(sendbuf, recvbuf, inplace); rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); if (rank == root) { res = MPI_Type_extent(recvtype, &rcvext); if (MPI_SUCCESS != res) { NBC_Error("MPI Error in MPI_Type_extent() (%i)", res); return res; } } schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { return OMPI_ERR_OUT_OF_RESOURCE; } /* send to root */ if (rank != root) { /* send msg to root */ res = NBC_Sched_send (sendbuf, false, sendcount, sendtype, root, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } else { for (int i = 0 ; i < p ; ++i) { rbuf = (char *) recvbuf + displs[i] * rcvext; if (i == root) { if (!inplace) { /* if I am the root - just copy the message */ res = NBC_Copy (sendbuf, sendcount, sendtype, rbuf, recvcounts[i], recvtype, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } } else { /* root receives message to the right buffer */ res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtype, i, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } } } res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } res = NBC_Init_handle (comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } res = NBC_Start (handle, schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } *request = (ompi_request_t *) handle; return OMPI_SUCCESS; }
struct mca_sharedfp_base_module_1_0_0_t * mca_sharedfp_lockedfile_component_file_query(mca_io_ompio_file_t *fh, int *priority) { struct flock lock; int fd, err; /*char *filename;*/ char filename[256]; int rank; bool has_file_lock_support=false; *priority = mca_sharedfp_lockedfile_priority; /*get the rank of this process*/ rank = ompi_comm_rank ( fh->f_comm); /*test, and update priority*/ /* * This test tests to see if fcntl returns success when asked to * establish a file lock. This test is intended for use on file systems * such as NFS that may not implement file locks. The locked file algorithm makes use * of file locks to implement the shared file pointer operations, and will not work * properly if file locks are not available. * * This is a simple test and has at least two limitations: * * 1. Some implementations of NFS are known to return success for * setting a file lock when in fact no lock has been set. This * test will not detect such erroneous implementations of NFS * * 2. Some implementations will hang (enter and wait indefinitately) * within the fcntl call. This test will also hang in that case. * Under normal conditions, this test should only take a few seconds to * run. * * The test prints a message showing the success or failure of * setting the file lock if PRINT_TAG is set to true in sharedfp.h. * It also sets the priority to a non-zero value on success and * returns NULL on failure. If there is a failure, the system routine * strerror is interogated in order to print the reason. */ /* Set the filename. */ /*data filename created by appending .locktest.$rank to the original filename*/ sprintf(filename,"%s%s%d",fh->f_filename,".locktest.",rank); lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 100; lock.l_pid = getpid(); fd = open(filename, O_RDWR | O_CREAT, 0644); if ( -1 == fd ){ opal_output(1,"mca_sharedfp_lockedfile_component_file_query: error opening file %s", filename); opal_output(1,"%s\n", strerror(errno)); has_file_lock_support=false; } else{ err = fcntl(fd, F_SETLKW, &lock); if ( mca_sharedfp_lockedfile_verbose ) { printf("mca_sharedfp_lockedfile_component_file_query: returned err=%d, for fd=%d\n",err,fd); } if (err) { opal_output(1, "mca_sharedfp_lockedfile_component_file_query: Failed to set a file lock on %s\n", filename ); opal_output(1, "err=%d, errno=%d, EOPNOTSUPP=%d, EINVAL=%d, ENOSYS=%d, EACCES=%d, EAGAIN=%d, EBADF=%d\n", err,errno,EOPNOTSUPP,EINVAL,ENOSYS,EACCES,EAGAIN,EBADF); opal_output(1,"%s\n", strerror(errno)); if (errno == EACCES || errno == EAGAIN) { opal_output(1,"errno=EACCES || EAGAIN, Already locked by another process\n"); } } else { if ( mca_sharedfp_lockedfile_verbose ) { printf( "mca_sharedfp_lockedfile_component_file_query: fcntl claims success in setting a file lock on %s\n", filename ); } has_file_lock_support=true; } /* printf("err = %d, errno = %d\n", err, errno); */ close(fd); unlink( filename ); } /**priority=100;*/ if(has_file_lock_support){ return &lockedfile; } *priority = 0; /*module can not run!, return NULL to indicate that we are unable to run*/ opal_output(1,"mca_sharedfp_lockedfile_component_file_query: Can not run!, file locking not supported\n"); return NULL; }
/* * Initialize module on the communicator */ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm) { mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module; hcoll_module->comm = comm; if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){ HCOL_ERROR("coll_hcol: __save_coll_handlers failed"); return OMPI_ERROR; } hcoll_set_runtime_tag_offset(-100,mca_pml.pml_max_tag); hcoll_set_rte_halt_flag_address(&ompi_mpi_finalized); hcoll_set_rte_halt_flag_size(sizeof(ompi_mpi_finalized)); hcoll_module->hcoll_context = hcoll_create_context((rte_grp_handle_t)comm); if (NULL == hcoll_module->hcoll_context){ HCOL_VERBOSE(1,"hcoll_create_context returned NULL"); return OMPI_ERROR; } #if 0 { printf("\033[33mrank %d: DOING EXTRA TEST\033[0m\n",ompi_comm_rank(comm)); fflush(stdout); sleep(1); rte_ec_handle_t handle; rte_grp_handle_t world_group = hcoll_rte_functions.rte_world_group_fn(); int peer; const int max_count = 10000000; const int step = max_count/100; int buf = 0; int i; rte_request_handle_t req; peer = (ompi_comm_rank(comm)+1)%2; hcoll_rte_functions.get_ec_handles_fn(1,&peer,world_group,&handle); for (i=1; i<max_count+1; i++){ if (0 == ompi_comm_rank(comm)){ if (i/step*step == i){ printf("%d %% done...\n",i/step);fflush(stdout); } buf = 1; hcoll_rte_functions.send_fn(DTE_INT32,1,&buf,handle,world_group,0,&req); } else { hcoll_rte_functions.recv_fn(DTE_INT32,1,&buf,handle,world_group,0,&req); } int completed = 0; hcoll_rte_functions.test_fn(&req,&completed); while(!completed){ hcoll_rte_functions.test_fn(&req,&completed); /*hcoll_rte_functions.rte_progress_fn();*/ opal_progress(); } } printf("\033[32mrank %d: EXTRA TEST PASS\033[0m\n",ompi_comm_rank(comm)); fflush(stdout); sleep(1); } #endif return OMPI_SUCCESS; }
/* * allgatherv_inter * * Function: - allgatherv using other MPI collectives * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_allgatherv_inter(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, size_local, total=0, err; int *count=NULL,*displace=NULL; char *ptmp=NULL; MPI_Aint incr; MPI_Aint extent; MPI_Aint lb; ompi_datatype_t *ndtype; ompi_request_t *req[2]; rank = ompi_comm_rank(comm); size_local = ompi_comm_size(comm->c_local_comm); size = ompi_comm_remote_size(comm); if (0 == rank) { count = (int *)malloc(sizeof(int) * size_local); if (NULL == count) { return OMPI_ERR_OUT_OF_RESOURCE; } displace = (int *)malloc(sizeof(int) * size_local); if (NULL == displace) { return OMPI_ERR_OUT_OF_RESOURCE; } } /* Local gather to get the scount of each process */ err = comm->c_local_comm->c_coll.coll_gather(&scount, 1, MPI_INT, count, 1, MPI_INT, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_gather_module); if (OMPI_SUCCESS != err) { return err; } if(0 == rank) { displace[0] = 0; for (i = 1; i < size_local; i++) { displace[i] = displace[i-1] + count[i-1]; } /* Perform the gatherv locally with the first process as root */ err = ompi_ddt_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } incr = 0; for (i = 0; i < size_local; i++) { incr = incr + extent*count[i]; } ptmp = (char*)malloc(incr); if (NULL == ptmp) { return OMPI_ERR_OUT_OF_RESOURCE; } } err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype, ptmp, count, displace, sdtype,0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_gatherv_module); if (OMPI_SUCCESS != err) { return err; } ompi_ddt_create_indexed(size,rcounts,disps,rdtype,&ndtype); ompi_ddt_commit(&ndtype); if (0 == rank) { for (i = 0; i < size_local; i++) { total = total + count[i]; } /* Exchange data between roots */ err = MCA_PML_CALL(irecv(rbuf, 1, ndtype, 0, MCA_COLL_BASE_TAG_ALLGATHERV, comm, &(req[0]))); if (OMPI_SUCCESS != err) { return err; } err = MCA_PML_CALL(isend(ptmp, total, sdtype, 0, MCA_COLL_BASE_TAG_ALLGATHERV, MCA_PML_BASE_SEND_STANDARD, comm, &(req[1]))); if (OMPI_SUCCESS != err) { return err; } err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { return err; } } /* bcast the message to all the local processes */ err = comm->c_local_comm->c_coll.coll_bcast(rbuf, 1, ndtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_bcast_module); if (OMPI_SUCCESS != err) { return err; } ompi_ddt_destroy(&ndtype); if (NULL != ptmp) { free(ptmp); } if (NULL != displace) { free(displace); } if (NULL != count) { free(count); } return err; }