Example #1
int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, int count,
                                        ompi_datatype_t* datatype,
                                        ompi_op_t* op, int root,
                                        ompi_communicator_t* comm,
                                        mca_coll_base_module_t *module,
                                        uint32_t segsize, int fanout,
                                        int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));

    COLL_BASE_UPDATE_CHAIN( comm, base_module, root, fanout );
    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           data->cached_chain,
                                           segcount, max_outstanding_reqs );
}
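Note: COLL_BASE_COMPUTED_SEGCOUNT above turns the byte segment size into an element count. A simplified stand-alone sketch of that computation (an illustration of the idea, not the actual OMPI macro; compute_segcount is a hypothetical helper):

#include <stddef.h>

/* Hypothetical helper, illustration only: derive how many datatype elements
 * of size typelng fit into one segment of segsize bytes; a segsize of 0 or
 * one larger than the whole message means "no segmentation". */
static int compute_segcount(size_t segsize, size_t typelng, int count)
{
    if (0 == segsize || segsize >= (size_t)count * typelng) {
        return count;                         /* one segment holds everything */
    }
    int segcount = (int)(segsize / typelng);
    return segcount > 0 ? segcount : 1;       /* at least one element/segment */
}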
Example #2
int mca_sharedfp_addproc_read_ordered (mca_io_ompio_file_t *fh,
                                       void *buf,
                                       int count,
                                       struct ompi_datatype_t *datatype,
                                       ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0, offsetReceived = 0;
    long sendBuff = 0;
    long *buff=NULL;
    long offsetBuff, bytesRequested = 0;
    size_t numofBytes;
    int rank, size, i;
    struct mca_sharedfp_base_data_t *sh = NULL;

    if(NULL == fh->f_sharedfp_data){
        opal_output(0, "sharedfp_addproc_read_ordered: shared file pointer "
                    "structure not initialized correctly\n");
        return OMPI_ERROR;
    }

    /* Retrieve the shared file pointer data structure */
    sh = fh->f_sharedfp_data;

    /* Calculate the number of bytes to read*/
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    sendBuff = count * numofBytes;

    /* Get the ranks in the communicator */
    rank = ompi_comm_rank ( sh->comm);
    size = ompi_comm_size ( sh->comm);

    if ( 0  == rank )   {
        buff = (long*)malloc(sizeof(OMPI_MPI_OFFSET_TYPE) * size);
        if ( NULL ==  buff )
            return OMPI_ERR_OUT_OF_RESOURCE;
    }

    ret = sh->comm->c_coll.coll_gather( &sendBuff, 1, OMPI_OFFSET_DATATYPE,
					buff, 1, OMPI_OFFSET_DATATYPE, 0, sh->comm,
					sh->comm->c_coll.coll_gather_module);
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }

    /* All the per-rank byte counts are now present in buff on the root.
       buff holds one entry per process in the communicator.
     */
    if ( 0 == rank ) {
        for (i = 0; i < size ; i ++) {
	    if ( mca_sharedfp_addproc_verbose ){
		opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_addproc_read_ordered: Buff is %ld\n",buff[i]);
	    }
            bytesRequested += buff[i];

	    if ( mca_sharedfp_addproc_verbose ){
		opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_addproc_read_ordered: Bytes requested are %ld\n",
                            bytesRequested);
	    }
	}

        /* Request the offset to read bytesRequested bytes
	** only the root process needs to do the request,
	** since the root process will then tell the other
	** processes at what offset they should read their
	** share of the data.
         */
        ret = mca_sharedfp_addproc_request_position(sh,bytesRequested,&offsetReceived);
        if( OMPI_SUCCESS != ret ){
	    goto exit;
        }
	if ( mca_sharedfp_addproc_verbose ){
	    opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_addproc_read_ordered: Offset received is %lld\n",
                        offsetReceived);
	}
        buff[0] += offsetReceived;


        for (i = 1 ; i < size; i++) {
            buff[i] += buff[i-1];
        }
    }

    /* Scatter the results to the other processes*/
    ret = sh->comm->c_coll.coll_scatter ( buff, 1, OMPI_OFFSET_DATATYPE, &offsetBuff,
					  1, OMPI_OFFSET_DATATYPE, 0, sh->comm,
					  sh->comm->c_coll.coll_scatter_module );
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }

    /* Each process now has its own end offset in offsetBuff */
    offset = offsetBuff - sendBuff;
    offset /= sh->sharedfh->f_etype_size;

    if ( mca_sharedfp_addproc_verbose ){
	opal_output(ompi_sharedfp_base_framework.framework_output,
                    "sharedfp_addproc_read_ordered: Offset returned is %lld\n",offset);
    }

    /* read from the file */
    ret = mca_common_ompio_file_read_at_all(sh->sharedfh,offset,buf,count,datatype,status);

exit:
    if ( NULL != buff ) {
	free ( buff );
    }

    return ret;
}
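The offset bookkeeping above is an exclusive prefix sum over the gathered byte counts: after the loop on rank 0, buff[i] is the end offset of rank i's region, and each rank recovers its start offset as offsetBuff - sendBuff. A minimal stand-alone model of that arithmetic (plain arrays, no MPI; the numbers are made up for illustration):

#include <stdio.h>

int main(void)
{
    long buff[4] = {100, 40, 0, 60};   /* per-rank byte counts (ranks 0..3)  */
    long counts[4];
    long base    = 1000;               /* offset granted for the whole read  */

    for (int i = 0; i < 4; i++) counts[i] = buff[i];

    /* same computation as in read_ordered: running end offsets */
    buff[0] += base;
    for (int i = 1; i < 4; i++) buff[i] += buff[i - 1];

    for (int i = 0; i < 4; i++)
        printf("rank %d reads [%ld, %ld)\n", i, buff[i] - counts[i], buff[i]);
    return 0;
}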
Example #3
int mca_sharedfp_individual_write_ordered_begin(mca_io_ompio_file_t *fh,
                                                const void *buf,
                                                int count,
                                                struct ompi_datatype_t *datatype)
{
    int ret = OMPI_SUCCESS;
    int size = 0, rank = 0;
    int i = 0;
    size_t numofbytes = 0;
    size_t totalbytes = 0;
    OMPI_MPI_OFFSET_TYPE *offbuff=NULL;
    OMPI_MPI_OFFSET_TYPE global_offset = 0;
    OMPI_MPI_OFFSET_TYPE prev_offset = 0;
    OMPI_MPI_OFFSET_TYPE temp = 0, offset = 0;
    mca_sharedfp_individual_header_record *headnode = NULL;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    if(fh->f_sharedfp_data==NULL){
	if ( mca_sharedfp_individual_verbose ) {
	    opal_output(ompi_sharedfp_base_framework.framework_output,
			"sharedfp_individual_write_ordered_begin - opening the shared file pointer\n");
	}
        shared_fp_base_module = fh->f_sharedfp;

        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm,
                                                        fh->f_filename,
                                                        fh->f_amode,
                                                        fh->f_info,
                                                        fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_individual_write_ordered_begin - error opening the shared file pointer\n");
            return ret;
        }
    }

    if ( true == fh->f_split_coll_in_use ) {
        opal_output(0, "Only one split collective I/O operation allowed per file handle at any given point in time!\n");
        return MPI_ERR_REQUEST;
    }

    /*Retrieve the sharedfp data structures*/
    sh = fh->f_sharedfp_data;
    rank = ompi_comm_rank ( sh->comm );
    size = ompi_comm_size ( sh->comm );

    /* Calculate the number of bytes of data that needs to be written*/
    opal_datatype_type_size ( &datatype->super, &numofbytes);
    totalbytes = count * numofbytes;

    headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
    if ( NULL == headnode)  {
	opal_output (0, "sharedfp_individual_write_ordered_begin: headnode is NULL but file is open\n");
	return OMPI_ERROR;
    }

    /* Data from all the metadata is combined and written to the main file */
    ret  = mca_sharedfp_individual_collaborate_data ( sh );
    if ( OMPI_SUCCESS != ret)  {
	return ret;
    }

    if ( 0 == rank )  {
	offbuff = (OMPI_MPI_OFFSET_TYPE *)malloc ( sizeof(OMPI_MPI_OFFSET_TYPE) * size);
	if (NULL == offbuff ) {
	    return OMPI_ERR_OUT_OF_RESOURCE;
	}
    }

    /*collect the total bytes to be written*/
    sh->comm->c_coll.coll_gather ( &totalbytes, 1, OMPI_OFFSET_DATATYPE,
				   offbuff, 1, OMPI_OFFSET_DATATYPE, 0,
				   sh->comm, sh->comm->c_coll.coll_gather_module );

    if ( 0 == rank ) {
        prev_offset = offbuff[0];
        offbuff[0]   = sh->global_offset;

        for (i = 1; i < size ; i++){
            temp = offbuff[i];
            offbuff[i] = offbuff[i - 1] + prev_offset;
            prev_offset = temp;
        }

        /* new global offset: end of the last process' region */
        global_offset = offbuff[size - 1] + prev_offset;
    }


    /* Scatter the results to the other processes */
    ret = sh->comm->c_coll.coll_scatter ( offbuff, 1, OMPI_OFFSET_DATATYPE,
					  &offset, 1, OMPI_OFFSET_DATATYPE, 0,
					  sh->comm, sh->comm->c_coll.coll_scatter_module );
    if ( OMPI_SUCCESS != ret )  {
	opal_output(0,"sharedfp_individual_write_ordered_begin: Error in scattering offsets \n");
	goto exit;
    }

    ret = sh->comm->c_coll.coll_bcast ( &global_offset, 1, OMPI_OFFSET_DATATYPE,
                                        0, sh->comm, sh->comm->c_coll.coll_bcast_module );
    if ( OMPI_SUCCESS != ret )  {
	opal_output(0,"sharedfp_individual_write_ordered_begin: Error while bcasting global offset \n");
	goto exit;
    }

    sh->global_offset = global_offset;

    /* use the split-collective iwrite_at_all to ensure ordering */
    ret = mca_common_ompio_file_iwrite_at_all(sh->sharedfh,offset, buf,count,datatype,
                                              &fh->f_split_coll_req);
    fh->f_split_coll_in_use = true;
    if ( OMPI_SUCCESS != ret )  {
	opal_output(0,"sharedfp_individual_write_ordered_begin: Error while writing the datafile \n");
    }

exit:
    if ( NULL != offbuff ) {
	free ( offbuff);
    }

    return ret;
}
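Here rank 0 rotates the gathered byte counts into start offsets: offbuff[i] ends up as the offset at which rank i writes, and global_offset advances past all of the data. A small stand-alone model of that loop (no MPI, invented numbers):

#include <stdio.h>

int main(void)
{
    long long offbuff[4] = {100, 40, 0, 60};  /* gathered totalbytes per rank */
    long long global_offset = 1000;           /* current shared file offset   */
    long long prev = offbuff[0], temp;

    offbuff[0] = global_offset;
    for (int i = 1; i < 4; i++) {
        temp = offbuff[i];
        offbuff[i] = offbuff[i - 1] + prev;
        prev = temp;
    }
    global_offset = offbuff[3] + prev;        /* end of the last rank's region */

    for (int i = 0; i < 4; i++)
        printf("rank %d writes at offset %lld\n", i, offbuff[i]);
    printf("new global offset: %lld\n", global_offset);
    return 0;
}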
Example #4
/* linear iscan
 * working principle:
 * 1. each node (except node 0) receives from its left neighbor
 * 2. performs the op
 * 3. all but rank p-1 send to their right neighbor and exit
 */
static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                         struct ompi_communicator_t *comm, ompi_request_t ** request,
                         struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  int rank, p, res;
  ptrdiff_t gap, span;
  NBC_Schedule *schedule;
  void *tmpbuf = NULL;
  char inplace;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

#ifdef NBC_CACHE_SCHEDULE
  NBC_Scan_args *args, *found, search;

  /* search schedule in communicator specific tree */
  search.sendbuf = sendbuf;
  search.recvbuf = recvbuf;
  search.count = count;
  search.datatype = datatype;
  search.op = op;
  found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!inplace) {
      /* copy data to recvbuf */
      res = NBC_Sched_copy ((void *)sendbuf, false, count, datatype,
                            recvbuf, false, count, datatype, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    }

    if(rank != 0) {
      span = opal_datatype_span(&datatype->super, count, &gap);
      tmpbuf = malloc (span);
      if (NULL == tmpbuf) {
        OBJ_RELEASE(schedule);
        return OMPI_ERR_OUT_OF_RESOURCE;
      }

      /* we have to wait until we have the data */
      res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
      }

      /* perform the reduce in my local buffer */
      /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
      res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule,
                          true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
      }
    }

    if (rank != p-1) {
      res = NBC_Sched_send (recvbuf, false, count, datatype, rank+1, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
      }
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      free(tmpbuf);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree */
    args = (NBC_Scan_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->sendbuf = sendbuf;
      args->recvbuf = recvbuf;
      args->count = count;
      args->datatype = datatype;
      args->op = op;
      args->schedule = schedule;
      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], args, args, 0);
      if (0 == res) {
        OBJ_RETAIN(schedule);

        /* increase the number of cached SCAN schedules */
        if (++libnbc_module->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN],
                                   &libnbc_module->NBC_Dict_size[NBC_SCAN]);
        }
      } else {
        NBC_Error("error in dict_insert() (%i)", res);
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    free(tmpbuf);
    return res;
  }

  return OMPI_SUCCESS;
}
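The working principle listed at the top of this example is easiest to see serially: every rank copies its input to recvbuf, folds in the partial result from rank-1, and hands its own partial result to rank+1. A tiny sequential model of that chain (illustration only, integer addition standing in for the MPI_Op):

#include <stdio.h>

int main(void)
{
    enum { P = 5 };
    int sendbuf[P] = {1, 2, 3, 4, 5};
    int recvbuf[P];

    for (int rank = 0; rank < P; rank++) {
        recvbuf[rank] = sendbuf[rank];                  /* copy to recvbuf */
        if (rank != 0)
            recvbuf[rank] += recvbuf[rank - 1];         /* recv + op       */
        /* all but rank P-1 would now send recvbuf[rank] to rank+1 */
        printf("rank %d scan result: %d\n", rank, recvbuf[rank]);
    }
    return 0;
}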
Example #5
/*  
 * Linear functions are copied from the basic coll module.  For
 * some small number of nodes and/or small data sizes they are just as
 * fast as tuned/tree based segmenting operations and as such may be
 * selected by the decision functions.  These are copied into this module
 * due to the way we select modules in V1. i.e. in V2 we will handle this
 * differently and so will not have to duplicate code.  
 * GEF Oct05 after asking Jeff.  
 */
int
ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
                                            struct ompi_datatype_t *sdtype,
                                            void *rbuf, int *rcounts, int *rdisps,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int i, size, rank, err, nreqs;
    char *psnd, *prcv;
    ptrdiff_t sext, rext;
    MPI_Request *preq;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    if (MPI_IN_PLACE == sbuf) {
        return  mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
                                                              rdtype, comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoallv_intra_basic_linear rank %d", rank));

    ompi_datatype_type_extent(sdtype, &sext);
    ompi_datatype_type_extent(rdtype, &rext);

    /* Simple optimization - handle send to self first */
    psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext;
    prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext;
    if (0 != scounts[rank]) {
        err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
                              prcv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Now, initiate all send/recv to/from others. */
    nreqs = 0;
    preq = data->mcct_reqs;

    /* Post all receives first */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == rcounts[i]) {
            continue;
        }

        prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext;
        err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype,
                                      i, MCA_COLL_BASE_TAG_ALLTOALLV, comm,
                                      preq++));
        ++nreqs;
        if (MPI_SUCCESS != err) {
            ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
            return err;
        }
    }

    /* Now post all sends */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == scounts[i]) {
            continue;
        }

        psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext;
        err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype,
                                      i, MCA_COLL_BASE_TAG_ALLTOALLV,
                                      MCA_PML_BASE_SEND_STANDARD, comm,
                                      preq++));
        ++nreqs;
        if (MPI_SUCCESS != err) {
            ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
            return err;
        }
    }

    /* Start your engines.  This will never return an error. */
    MCA_PML_CALL(start(nreqs, data->mcct_reqs));

    /* Wait for them all.  If there's an error, note that we don't care
     * what the error was -- just that there *was* an error.  The PML
     * will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return the
     * error after we free everything. */
    err = ompi_request_wait_all(nreqs, data->mcct_reqs,
                                MPI_STATUSES_IGNORE);

    /* Free the requests. */
    ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);

    return err;
}
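All the buffer arithmetic in this routine reduces to base + displacement * extent. A small stand-alone sketch of that addressing, with sizeof(int) standing in for the datatype extent (illustration only; the counts and displacements are made up):

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    int sbuf[]    = {10, 11, 20, 30, 31, 32};
    int scounts[] = {2, 1, 3};     /* elements destined for peers 0..2     */
    int sdispls[] = {0, 2, 3};     /* element offsets of each peer's block */
    size_t sext   = sizeof(int);   /* datatype extent                      */

    for (int i = 0; i < 3; i++) {
        const char *psnd = (const char *)sbuf + (ptrdiff_t)sdispls[i] * sext;
        printf("to peer %d: %d ints starting at element %d (first value %d)\n",
               i, scounts[i], sdispls[i], *(const int *)psnd);
    }
    return 0;
}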
Example #6
/* simple linear Alltoallw */
int ompi_coll_libnbc_ialltoallw(void* sendbuf, int *sendcounts, int *sdispls,
                                MPI_Datatype sendtypes[], void* recvbuf, int *recvcounts, int *rdispls,
                                MPI_Datatype recvtypes[], struct ompi_communicator_t *comm, ompi_request_t ** request,
				struct mca_coll_base_module_2_1_0_t *module)
{
  int rank, p, res;
  NBC_Schedule *schedule;
  char *rbuf, *sbuf, inplace;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  /* copy data to the receive buffer */
  if ((sendcounts[rank] != 0) && !inplace) {
    rbuf = (char *) recvbuf + rdispls[rank];
    sbuf = (char *) sendbuf + sdispls[rank];
    res = NBC_Copy(sbuf, sendcounts[rank], sendtypes[rank], rbuf, recvcounts[rank], recvtypes[rank], comm);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  for (int i = 0; i < p; i++) {
    if (i == rank) {
      continue;
    }

    /* post all sends */
    if (sendcounts[i] != 0) {
      sbuf = (char *) sendbuf + sdispls[i];
      res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtypes[i], i, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    }
    /* post all receives */
    if (recvcounts[i] != 0) {
      rbuf = (char *) recvbuf + rdispls[i];
      res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtypes[i], i, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Init_handle (comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Start (handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}
Example #7
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line, mask, remote;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_recursivedoubling rank %d",
                 rank));

    /* compute adjsize = largest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* if size is not exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* send message to lower ranked node */
            remote = rank - adjsize;
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                                remote, MCA_COLL_BASE_TAG_BARRIER,
                                                comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}

        } else if (rank < (size - adjsize)) {

            /* receive message from the higher-ranked process */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* exchange messages */
    if ( rank < adjsize ) {
        mask = 0x1;
        while ( mask < adjsize ) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* post receive from the remote node */
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                                remote, MCA_COLL_BASE_TAG_BARRIER,
                                                comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* non-power of 2 case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send enter message to higher ranked node */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
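The pairing pattern of the exchange loop (remote = rank ^ mask, doubling mask each round) is easier to see printed out. A stand-alone sketch for a 6-process communicator (illustration only; adjsize is the largest power of two not greater than size, as computed above):

#include <stdio.h>

int main(void)
{
    int size = 6, adjsize = 1;
    while (adjsize * 2 <= size) adjsize *= 2;          /* adjsize = 4 */

    for (int mask = 1; mask < adjsize; mask <<= 1) {
        printf("round mask=%d:", mask);
        for (int rank = 0; rank < adjsize; rank++) {
            int remote = rank ^ mask;
            if (remote < adjsize && rank < remote)
                printf("  %d<->%d", rank, remote);
        }
        printf("\n");
    }
    /* ranks 4 and 5 are folded onto ranks 0 and 1 by the extra pre/post steps */
    return 0;
}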
Example #8
/*
 *  reduce_scatter_intra_basic_recursivehalving
 *
 *  Function:   - reduce scatter implementation using recursive-halving
 *                algorithm
 *  Accepts:    - same as MPI_Reduce_scatter()
 *  Returns:    - MPI_SUCCESS or error code
 *  Limitation: - Works only for commutative operations.
 */
int
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
                                                            void *rbuf,
                                                            int *rcounts,
                                                            struct ompi_datatype_t *dtype,
                                                            struct ompi_op_t *op,
                                                            struct ompi_communicator_t *comm,
                                                            mca_coll_base_module_t *module)
{
    int i, rank, size, count, err = OMPI_SUCCESS;
    int tmp_size, remain = 0, tmp_rank, *disps = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));

    /* Find displacements and the like */
    disps = (int*) malloc(sizeof(int) * size);
    if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
    }
    count = disps[size - 1] + rcounts[size - 1];

    /* short cut the trivial case */
    if (0 == count) {
        free(disps);
        return OMPI_SUCCESS;
    }

    /* get datatype information */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (ptrdiff_t)(count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* Allocate temporary receive buffer. */
    recv_buf_free = (char*) malloc(buf_size);
    if (NULL == recv_buf_free) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    recv_buf = recv_buf_free - true_lb;

    /* allocate temporary buffer for results */
    result_buf_free = (char*) malloc(buf_size);
    if (NULL == result_buf_free) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    result_buf = result_buf_free - true_lb;

    /* copy local buffer into the temporary results */
    err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
    if (OMPI_SUCCESS != err) goto cleanup;

    /* figure out power of two mapping: grow until larger than
       comm size, then go back one, to get the largest power of
       two less than comm size */
    tmp_size = opal_next_poweroftwo (size);
    tmp_size >>= 1;
    remain = size - tmp_size;

    /* If comm size is not a power of two, have the first "remain"
       procs with an even rank send to rank + 1, leaving a power of
       two procs to do the rest of the algorithm */
    if (rank < 2 * remain) {
        if ((rank & 1) == 0) {
            err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm));
            if (OMPI_SUCCESS != err) goto cleanup;

            /* we don't participate from here on out */
            tmp_rank = -1;
        } else {
            err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                    comm, MPI_STATUS_IGNORE));

            /* integrate their results into our temp results */
            ompi_op_reduce(op, recv_buf, result_buf, count, dtype);

            /* adjust rank to be the bottom "remain" ranks */
            tmp_rank = rank / 2;
        }
    } else {
        /* just need to adjust rank to show that the bottom "even
           remain" ranks dropped out */
        tmp_rank = rank - remain;
    }

    /* For ranks not kicked out by the above code, perform the
       recursive halving */
    if (tmp_rank >= 0) {
        int *tmp_disps = NULL, *tmp_rcounts = NULL;
        int mask, send_index, recv_index, last_index;

        /* recalculate disps and rcounts to account for the
           special "remainder" processes that are no longer doing
           anything */
        tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
        if (NULL == tmp_rcounts) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        tmp_disps = (int*) malloc(tmp_size * sizeof(int));
        if (NULL == tmp_disps) {
            free(tmp_rcounts);
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        for (i = 0 ; i < tmp_size ; ++i) {
            if (i < remain) {
                /* need to include old neighbor as well */
                tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
            } else {
                tmp_rcounts[i] = rcounts[i + remain];
            }
        }

        tmp_disps[0] = 0;
        for (i = 0; i < tmp_size - 1; ++i) {
            tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
        }

        /* do the recursive halving communication.  Don't use the
           dimension information on the communicator because I
           think the information is invalidated by our "shrinking"
           of the communicator */
        mask = tmp_size >> 1;
        send_index = recv_index = 0;
        last_index = tmp_size;
        while (mask > 0) {
            int tmp_peer, peer, send_count, recv_count;
            struct ompi_request_t *request;

            tmp_peer = tmp_rank ^ mask;
            peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

            /* figure out if we're sending, receiving, or both */
            send_count = recv_count = 0;
            if (tmp_rank < tmp_peer) {
                send_index = recv_index + mask;
                for (i = send_index ; i < last_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < send_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            } else {
                recv_index = send_index + mask;
                for (i = send_index ; i < recv_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < last_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            }

            /* actual data transfer.  Send from result_buf,
               receive into recv_buf */
            if (recv_count > 0) {
                err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                                         recv_count, dtype, peer,
                                         MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                         comm, &request));
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }
            }
            if (send_count > 0) {
                err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
                                        send_count, dtype, peer,
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm));
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }
            }

            /* if we received something on this step, push it into
               the results buffer */
            if (recv_count > 0) {
                err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }

                ompi_op_reduce(op,
                               recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                               result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                               recv_count, dtype);
            }

            /* update for next iteration */
            send_index = recv_index;
            last_index = recv_index + mask;
            mask >>= 1;
        }

        /* copy local results from results buffer into real receive buffer */
        if (0 != rcounts[rank]) {
            err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
                                       rcounts[rank], dtype,
                                       rbuf, rcounts[rank], dtype);
            if (OMPI_SUCCESS != err) {
                free(tmp_rcounts);
                free(tmp_disps);
                goto cleanup;
            }
        }

        free(tmp_rcounts);
        free(tmp_disps);
    }
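This listing is cut off before the function's epilogue; the goto cleanup statements above refer to a label that frees the buffers allocated earlier. A hedged reconstruction of what that missing tail has to look like, based only on the allocations visible above (not copied from the original source):

 cleanup:
    /* release the scratch buffers allocated at the top of the function */
    if (NULL != disps) free(disps);
    if (NULL != recv_buf_free) free(recv_buf_free);
    if (NULL != result_buf_free) free(result_buf_free);

    return err;
}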
Example #9
/*******************************************************************************
 * ompi_coll_base_reduce_scatter_intra_nonoverlapping
 *
 * This function just calls a reduce to rank 0, followed by an
 * appropriate scatterv call.
 */
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
                                                        int *rcounts,
                                                        struct ompi_datatype_t *dtype,
                                                        struct ompi_op_t *op,
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module)
{
    int err, i, rank, size, total_count, *displs = NULL;
    const int root = 0;
    char *tmprbuf = NULL, *tmprbuf_free = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank));

    for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; }

    /* Reduce to rank 0 (root) and scatterv */
    tmprbuf = (char*) rbuf;
    if (MPI_IN_PLACE == sbuf) {
        /* rbuf on root (0) is big enough to hold whole data */
        if (root == rank) {
            err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
                                            dtype, op, root, comm, comm->c_coll.coll_reduce_module);
        } else {
            err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
                                           dtype, op, root, comm, comm->c_coll.coll_reduce_module);
        }
    } else {
        if (root == rank) {
            /* We must allocate temporary receive buffer on root to ensure that
               rbuf is big enough */
            ptrdiff_t lb, extent, tlb, textent;

            ompi_datatype_get_extent(dtype, &lb, &extent);
            ompi_datatype_get_true_extent(dtype, &tlb, &textent);

            tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
            tmprbuf = tmprbuf_free - lb;
        }
        err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
                                        dtype, op, root, comm, comm->c_coll.coll_reduce_module);
    }
    if (MPI_SUCCESS != err) {
        if (NULL != tmprbuf_free) free(tmprbuf_free);
        return err;
    }

    displs = (int*) malloc(size * sizeof(int));
    if (NULL == displs) {
        if (NULL != tmprbuf_free) free(tmprbuf_free);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) {
        displs[i] = displs[i-1] + rcounts[i-1];
    }
    err =  comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype,
                                       rbuf, rcounts[rank], dtype,
                                       root, comm, comm->c_coll.coll_scatterv_module);
    free(displs);
    if (NULL != tmprbuf_free) free(tmprbuf_free);

    return err;
}
Example #10
/*
 *	allgatherv_intra
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, int *rcounts, int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, size, rank ;
    int err;
    MPI_Aint extent;
    MPI_Aint lb;
    char *send_buf = NULL;
    struct ompi_datatype_t *newtype, *send_type;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    /*
     * We don't have a root process defined. Arbitrarily assign root
     * to process with rank 0 (OMPI convention)
     */

    if (MPI_IN_PLACE == sbuf) {
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        send_type = rdtype;
        send_buf = (char*)rbuf;
        for (i = 0; i < rank; ++i) {
            send_buf += (rcounts[i] * extent);
        }
    } else {
        send_buf = (char*)sbuf;
        send_type = sdtype;
    }

    err = comm->c_coll.coll_gatherv(send_buf,
                                    rcounts[rank], send_type,rbuf,
                                    rcounts, disps, rdtype, 0,
                                    comm, comm->c_coll.coll_gatherv_module);

    if (MPI_SUCCESS != err) {
        return err;
    }
    /*
     * We now have all the data in the root's rbuf. Need to
     * broadcast the data out to the other processes.
     *
     * Need to define a datatype that captures the different vectors
     * from each process. MPI_TYPE_INDEXED with params
     *                    size, rcounts, disps, rdtype, newtype
     * should do the trick.
     * Use the underlying ddt functions to create and commit the
     * new datatype on each process, then broadcast and destroy the
     * datatype.
     */

    err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&newtype);
    if (MPI_SUCCESS != err) {
        return err;
    }
    
    err = ompi_datatype_commit(&newtype);
    if(MPI_SUCCESS != err) {
       return err;
    }

    err = comm->c_coll.coll_bcast( rbuf, 1 ,newtype,0,comm,
                                   comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&newtype);

    return err;
}
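The same indexed-datatype trick can be written at the plain MPI level: describe every rank's region of rbuf with one indexed datatype and broadcast it in a single call. A hedged stand-alone sketch of that idea (bcast_gathered_regions is a hypothetical helper, not part of the coll/basic module):

#include <mpi.h>

/* Hypothetical helper, illustration only: broadcast the gathered, scattered
 * regions of rbuf from root 0 using one MPI_Type_indexed datatype. */
static int bcast_gathered_regions(void *rbuf, int size, const int *rcounts,
                                  const int *disps, MPI_Datatype rdtype,
                                  MPI_Comm comm)
{
    MPI_Datatype newtype;
    int err = MPI_Type_indexed(size, rcounts, disps, rdtype, &newtype);
    if (MPI_SUCCESS != err) return err;

    MPI_Type_commit(&newtype);
    err = MPI_Bcast(rbuf, 1, newtype, 0, comm);   /* root 0, as in the code above */
    MPI_Type_free(&newtype);
    return err;
}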
Example #11
int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
				       const char* filename,
				       int amode,
				       struct opal_info_t *info,
				       mca_io_ompio_file_t *fh)
{
    int err = MPI_SUCCESS;
    char * lockedfilename;
    int handle, rank;
    struct mca_sharedfp_lockedfile_data * module_data = NULL;
    struct mca_sharedfp_base_data_t* sh;
    mca_io_ompio_file_t * shfileHandle, *ompio_fh;
    mca_io_ompio_data_t *data;

    /*------------------------------------------------------------*/
    /*Open the same file again without shared file pointer support*/
    /*------------------------------------------------------------*/
    shfileHandle =  (mca_io_ompio_file_t *)malloc(sizeof(mca_io_ompio_file_t));
    err = mca_common_ompio_file_open(comm,filename,amode,info,shfileHandle,false);
    if ( OMPI_SUCCESS != err)  {
        opal_output(0, "mca_sharedfp_lockedfile_file_open: Error during file open\n");
        return err;
    }
    shfileHandle->f_fh = fh->f_fh;
    data = (mca_io_ompio_data_t *) fh->f_fh->f_io_selected_data;
    ompio_fh = &data->ompio_fh;

    err = mca_common_ompio_set_view (shfileHandle,
                                     ompio_fh->f_disp,
                                     ompio_fh->f_etype,
                                     ompio_fh->f_orig_filetype,
                                     ompio_fh->f_datarep,
                                     &(MPI_INFO_NULL->super));
    if ( OMPI_SUCCESS != err ) {
        opal_output(0, "mca_sharedfp_lockedfile_file_open: Error during set_view\n");
        free ( shfileHandle );
        return err;
    }

    /*Memory is allocated here for the sh structure*/
    sh = (struct mca_sharedfp_base_data_t*)malloc(sizeof(struct mca_sharedfp_base_data_t));
    if ( NULL == sh){
        opal_output(0, "mca_sharedfp_lockedfile_file_open: Error, unable to malloc f_sharedfp_ptr struct\n");
	free ( shfileHandle);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /*Populate the sh file structure based on the implementation*/
    sh->sharedfh      = shfileHandle;			/* Shared file pointer*/
    sh->global_offset = 0;				/* Global Offset*/
    sh->comm          = comm; 				/* Communicator*/
    sh->selected_module_data = NULL;

    rank = ompi_comm_rank ( sh->comm);

    /*Open a new file which will maintain the pointer for this file open*/
    if ( mca_sharedfp_lockedfile_verbose ) {
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "mca_sharedfp_lockedfile_file_open: open locked file.\n");
    }


    module_data = (struct mca_sharedfp_lockedfile_data*)malloc(sizeof(struct mca_sharedfp_lockedfile_data));
    if ( NULL == module_data ) {
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "mca_sharedfp_lockedfile_file_open: Error, unable to malloc lockedfile_data struct\n");
	free (shfileHandle);
	free (sh);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    opal_jobid_t masterjobid;
    if ( 0 == comm->c_my_rank  ) {
        ompi_proc_t *masterproc = ompi_group_peer_lookup(comm->c_local_group, 0 );
        masterjobid = OMPI_CAST_RTE_NAME(&masterproc->super.proc_name)->jobid;
    }
    comm->c_coll->coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm, 
                               comm->c_coll->coll_bcast_module );
 
    size_t filenamelen = strlen(filename) + 16;
    lockedfilename = (char*)malloc(sizeof(char) * filenamelen);
    if ( NULL == lockedfilename ) {
	free (shfileHandle);
	free (sh);
        free (module_data);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    snprintf(lockedfilename, filenamelen, "%s-%u%s",filename,masterjobid,".lock");
    module_data->filename = lockedfilename;

    /*-------------------------------------------------*/
    /*Open the lockedfile without shared file pointer  */
    /*-------------------------------------------------*/
    if ( 0 == rank ) {
	OMPI_MPI_OFFSET_TYPE position=0;
	/*only let main process initialize file pointer,
	 *therefore there is no need to lock the file
	 */
	handle = open ( lockedfilename, O_RDWR | O_CREAT, 0644 );
	write ( handle, &position, sizeof(OMPI_MPI_OFFSET_TYPE) );
	close ( handle );
    }
    comm->c_coll->coll_barrier ( comm, comm->c_coll->coll_barrier_module );

    handle = open ( lockedfilename, O_RDWR, 0644  );
    if ( -1 == handle ) {
        opal_output(0, "[%d]mca_sharedfp_lockedfile_file_open: Error during file open\n", rank);
	free (shfileHandle);
	free (sh);
	free(module_data);
        return OMPI_ERROR;
    }

    /*Store the new file handle*/
    module_data->handle = handle;
    /* Assign the lockedfile_data to sh->handle*/
    sh->selected_module_data   = module_data;
    /*remember the shared file handle*/
    fh->f_sharedfp_data = sh;

    comm->c_coll->coll_barrier ( comm, comm->c_coll->coll_barrier_module );

    return err;
}
Example #12
/*
 *  reduce_lin_intra
 *
 *  Function:   - reduction using O(N) algorithm
 *  Accepts:    - same as MPI_Reduce()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count,
                                         struct ompi_datatype_t *dtype,
                                         struct ompi_op_t *op,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, rank, err, size;
    ptrdiff_t extent, dsize, gap;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inplace_temp = NULL;
    char *inbuf;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If not root, send data to the root. */

    if (rank != root) {
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        return err;
    }

    dsize = opal_datatype_span(&dtype->super, count, &gap);
    ompi_datatype_type_extent(dtype, &extent);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = (char*)malloc(dsize);
        if (NULL == inplace_temp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp - gap;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(dsize);
        if (NULL == free_buffer) {
            if (NULL != inplace_temp) {
                free(inplace_temp);
            }
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - gap;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */

    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }

            inbuf = pml_buffer;
        }

        /* Perform the reduction */

        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */

    return MPI_SUCCESS;
}
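A recurring idiom in these reductions is allocating scratch space with opal_datatype_span() and then using allocation - gap as the working pointer, so that element 0 of the datatype lands inside the allocated region even when the type has a non-trivial lower bound. A minimal hedged sketch of that pattern (alloc_reduce_scratch is a hypothetical helper; span and gap are whatever opal_datatype_span returned):

#include <stddef.h>
#include <stdlib.h>

/* Hypothetical helper, illustration only: *free_ptr is what must eventually
 * be passed to free(); the returned pointer is what the reduction code uses
 * as its buffer base. */
static char *alloc_reduce_scratch(ptrdiff_t span, ptrdiff_t gap, char **free_ptr)
{
    *free_ptr = (char *) malloc(span);
    if (NULL == *free_ptr) {
        return NULL;
    }
    return *free_ptr - gap;
}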
Example #13
/*
 * reduce_intra_in_order_binary
 *
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recvbuf,
                                                  int count,
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm,
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs  )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = NULL;
    void *use_this_recvbuf = NULL;
    size_t typelng;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize));

    COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = (void *)sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t dsize, gap;
        char *tmpbuf = NULL;

        dsize = opal_datatype_span(&datatype->super, count, &gap);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmpbuf = (char *) malloc(dsize);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count,
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            tmpbuf = (char *) malloc(dsize);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, module,
                                          data->cached_in_order_bintree,
                                          segcount, max_outstanding_reqs );
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { return ret; }
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }

        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { return ret; }
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
Example #14
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the datatype.
 * For the last communication it will update the count in order to limit
 * the number of elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int original_count,
                                    ompi_datatype_t* datatype, ompi_op_t* op,
                                    int root, ompi_communicator_t* comm,
                                    mca_coll_base_module_t *module,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, size, gap, segment_increment;
    ompi_request_t **sreq = NULL, *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_extent( datatype, &extent );
    num_segments = (int)(((size_t)original_count + (size_t)count_by_segment - (size_t)1) / (size_t)count_by_segment);
    segment_increment = (ptrdiff_t)count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf;
    if( sendbuf == MPI_IN_PLACE ) {
        sendtmpbuf = (char *)recvbuf;
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
                 original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment),
                 (unsigned long)segment_increment, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t real_segment_size;

        /* handle a non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            size = opal_datatype_span(&datatype->super, original_count, &gap);
            accumbuf_free = (char*)malloc(size);
            if (accumbuf_free == NULL) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            accumbuf = accumbuf_free - gap;
        }

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!ompi_op_is_commute(op)) {
            ompi_datatype_copy_content_same_ddt(datatype, original_count,
                                                (char*)accumbuf,
                                                (char*)sendtmpbuf);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap);
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) {
            line = __LINE__; ret = -1; goto error_hndl;
        }
        inbuf[0] = inbuf_free[0] - gap;
        /* if there is a chance to overlap communication,
           allocate a second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - gap;
        }

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step as
                         * ompi_op_reduce only has two buffer pointers,
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }

                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i],
                                             MCA_COLL_BASE_TAG_REDUCE, comm,
                                             &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;}
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1],
                                             MPI_STATUSES_IGNORE );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer,
                                   accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                   recvcount, datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
                                   datatype );

                    /* all reduces on the data available in this step (i) are
                     * complete; pass the result to the next process unless we
                     * are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL( send( accumulator, prevcount,
                                                  datatype, tree->tree_prev,
                                                  MCA_COLL_BASE_TAG_REDUCE,
                                                  MCA_PML_BASE_SEND_STANDARD,
                                                  comm) );
                        if (ret != MPI_SUCCESS) {
                            line = __LINE__; goto error_hndl;
                        }
                    }

                    /* we stop when segindex == number of segments
                       (i.e. we do num_segments+1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes
       Depending on the value of max_outstanding_reqs and
       the number of segments we have two options:
       - send all segments using blocking sends to the parent, or
       - avoid flooding the parent node by limiting the number of
       outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
       for the current communication, synchronization should be used only
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is smaller than the maximum number of
           outstanding requests, or there is no limit on the number of
           outstanding requests, we send the data to the parent using blocking sends */
        if ((0 == max_outstanding_reqs) ||
            (num_segments <= max_outstanding_reqs)) {

            segindex = 0;
            while ( original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( send((char*)sendbuf +
                                         (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev,
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_STANDARD,
                                         comm) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous sends,
           - for each remaining segment:
             wait for one ssend to complete, and post the next one,
           - wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
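            /* creq cycles over the sreq array below: whenever one of the
               max_outstanding_reqs outstanding synchronous sends completes,
               its slot is reused for the next segment. */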

            sreq = coll_base_comm_get_reqs(module->base_data, max_outstanding_reqs);
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[segindex]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[creq]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait for the remaining requests to complete */
            ret = ompi_request_wait_all( max_outstanding_reqs, sreq,
                                         MPI_STATUSES_IGNORE );
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                   rank, __FILE__, line, ret ));
    (void)line;  // silence compiler warning
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    if( NULL != sreq ) {
        ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
    }
    return ret;
}
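A minimal illustrative sketch (not the actual COLL_BASE_COMPUTED_SEGCOUNT macro; the helper name is invented) of how the per-segment element count used by the reduce above can be derived from a byte-oriented segment size and the datatype size:

#include <stddef.h>

/* Illustration only: split `count` elements of a datatype of `typelng`
 * bytes into segments of at most `segsize` bytes; a segsize of 0 means
 * "do not segment". */
static inline int sketch_compute_segcount(size_t segsize, size_t typelng, int count)
{
    int segcount = count;                      /* default: one big segment */
    if (segsize > 0 && typelng > 0 && segsize >= typelng) {
        segcount = (int)(segsize / typelng);   /* whole elements per segment */
        if (segcount > count) {
            segcount = count;                  /* never exceed the message */
        }
    }
    return segcount;
}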
Example #15
/*
 *	file_open_plfs
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- Success if new file handle
 */
int
mca_fs_plfs_file_open (struct ompi_communicator_t *comm,
                     const char* filename,
                     int access_mode,
                     struct ompi_info_t *info,
                     mca_io_ompio_file_t *fh)
{
    int rank;
    int amode;
    int old_mask, perm;
    plfs_error_t plfs_ret;
    Plfs_fd *pfd = NULL;
    char wpath[1024];
    size_t len = sizeof(int);
    char key[] = "num_hostdirs";

    rank = ompi_comm_rank ( comm );

    getcwd( wpath, sizeof(wpath) );
    /* append the file name to the current working directory already in wpath */
    snprintf( wpath + strlen(wpath), sizeof(wpath) - strlen(wpath), "/%s", filename );

    if (OMPIO_PERM_NULL == fh->f_perm) {
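        /* query the process umask by setting it and immediately restoring it,
           then derive the default permission bits for the new file from it */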
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else {
        perm = fh->f_perm;
    }

    amode = 0;

    if (access_mode & MPI_MODE_RDONLY)
        amode = amode | O_RDONLY;
    if (access_mode & MPI_MODE_WRONLY)
        amode = amode | O_WRONLY;
    if (access_mode & MPI_MODE_RDWR)
        amode = amode | O_RDWR;
    if (access_mode & MPI_MODE_EXCL) {
        if( is_plfs_path(wpath) == 1 ) { //the file already exists
	    return OMPI_ERROR;
	}
    }

    if (0 == rank) {
        /* MODE_CREATE and MODE_EXCL can only be set by one process */
        if (access_mode & MPI_MODE_CREATE)
	    amode = amode | O_CREAT;

	plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
	fh->f_fs_ptr = pfd;
    }

    comm->c_coll.coll_bcast ( &plfs_ret, 1, MPI_INT, 0, comm, comm->c_coll.coll_bcast_module);
    if ( PLFS_SUCCESS != plfs_ret ) {
        return OMPI_ERROR;
    }

    if (0 != rank) {
        plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
	if (PLFS_SUCCESS != plfs_ret) {
	    opal_output(0, "fs_plfs_file_open: Error in plfs_open:\n%s\n", strplfserr(plfs_ret));
	    return OMPI_ERROR;
	}
	else {
	    fh->f_fs_ptr = pfd;
	}
    }

    if (mca_fs_plfs_num_hostdir > 0) {
        plfs_ret = plfs_setxattr( pfd, &mca_fs_plfs_num_hostdir, key, len );
        if (PLFS_SUCCESS != plfs_ret) {
	    opal_output(0, "fs_plfs_file_open: Error in plfs_setxattr:\n%s\n", strplfserr(plfs_ret));
	    return OMPI_ERROR;
	}
    }
    return OMPI_SUCCESS;
}
Example #16
static int
mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[],
                                       struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                       const int rdisps[], struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_cart_2_1_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    int rc = MPI_SUCCESS, dim, i, nreqs;
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs;

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);

    /* post receives first */
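    /* For a cartesian topology the neighbors are ordered by dimension: for
       each dimension the negative-direction neighbor (the source of a +1
       shift) comes first and the positive-direction neighbor (the shift
       destination) second, which is why the displacement/count index i
       advances by 2 per dimension. */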
    for (dim = 0, nreqs = 0, i = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            /* remove cast from const when the pml layer is updated to take a const for the send buffer */
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
Example #17
int
mca_sharedfp_sm_seek (mca_io_ompio_file_t *fh,
                      OMPI_MPI_OFFSET_TYPE offset, int whence)
{
    int rank, status=0;
    OMPI_MPI_OFFSET_TYPE end_position=0;
    int ret = OMPI_SUCCESS;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;
    struct mca_sharedfp_sm_data * sm_data = NULL;
    struct mca_sharedfp_sm_offset * sm_offset_ptr = NULL;

    if( NULL == fh->f_sharedfp_data ) {
        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_sm_seek: opening the shared file pointer\n");
        }
        shared_fp_base_module = fh->f_sharedfp;

        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm,
                                                        fh->f_filename,
                                                        fh->f_amode,
                                                        fh->f_info,
                                                        fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_sm_seek - error opening the shared file pointer\n");
            return ret;
        }
    }

    sh = fh->f_sharedfp_data;
    rank = ompi_comm_rank ( sh->comm );

    if( 0 == rank ){
        if ( MPI_SEEK_SET == whence){
            /* do nothing */
            if ( offset < 0){
                opal_output(0,"sharedfp_sm_seek - MPI_SEEK_SET, offset must not be negative, got offset=%lld.\n",offset);
                ret = -1;
            }
            if ( mca_sharedfp_sm_verbose ) {
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_sm_seek: MPI_SEEK_SET new_offset=%lld\n",offset);
            }
        }
        else if( MPI_SEEK_CUR == whence){
            OMPI_MPI_OFFSET_TYPE current_position;
            ret = mca_sharedfp_sm_get_position ( fh, &current_position);
            if ( mca_sharedfp_sm_verbose ) {
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_sm_seek: MPI_SEEK_CUR: curr=%lld, offset=%lld, call status=%d\n",
                            current_position,offset,ret);
            }
            offset = current_position + offset;
            if ( mca_sharedfp_sm_verbose ) {
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_sm_seek: MPI_SEEK_CUR: new_offset=%lld\n",offset);
            }
            if(offset < 0){
                opal_output(0,"sharedfp_sm_seek - MPI_SEEK_CUR, offset must not be negative, got offset=%lld.\n",offset);
                ret = -1;
            }
        }
        else if( MPI_SEEK_END == whence){
            end_position=0;
            mca_common_ompio_file_get_size(sh->sharedfh,&end_position);

            offset = end_position + offset;
            if ( mca_sharedfp_sm_verbose ) {
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_sm_seek: MPI_SEEK_END: file_get_size=%lld\n",end_position);
            }
            if(offset < 0){
                opal_output(0,"sharedfp_sm_seek - MPI_SEEK_END, offset must not be negative, got offset=%lld.\n",offset);
                ret = -1;
            }
        }
        else {
            opal_output(0,"sharedfp_sm_seek - whence=%i is not supported\n",whence);
            ret = -1;
        }

        /*-----------------------------------------------------*/
        /* Set Shared file pointer                             */
        /*-----------------------------------------------------*/
        sm_data = sh->selected_module_data;
        sm_offset_ptr = sm_data->sm_offset_ptr;

        /*-------------------*/
        /*lock the file  */
        /*--------------------*/
        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_sm_seek: Aquiring lock, rank=%d...",rank); fflush(stdout);
        }

        /* Acquire an exclusive lock */
        sm_offset_ptr = sm_data->sm_offset_ptr;

        sem_wait(sm_data->mutex);

        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_sm_seek: Success! Acquired sm lock.for rank=%d\n",rank);
        }
        sm_offset_ptr->offset=offset;
        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_sm_seek: Releasing sm lock...rank=%d",rank); fflush(stdout);
        }
        sem_post(sm_data->mutex);
    }

    /* since we only let process 0 update the shared file pointer,
     * all of the other processes need to wait before proceeding.
     */
    sh->comm->c_coll.coll_barrier ( sh->comm, sh->comm->c_coll.coll_barrier_module );

    return ret;
}
Example #18
static int my_rank (rte_grp_handle_t grp_h )
{
    return ompi_comm_rank((ompi_communicator_t *)grp_h);
}
Example #19
/*
 * Simple double ring version of barrier
 *
 * The synchronization guarantee comes from the last ring, whose sends are
 * synchronous.
 *
 */
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int rank, size, err = 0, line = 0, left, right;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));

    left = ((rank+size-1)%size);   /* wrap around without going negative for rank 0 */
    right = ((rank+1)%size);

    if (rank > 0) { /* receive message from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

    /* root needs to receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post Receive from left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* send message to the right one */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

    /* rank 0 post receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
Example #20
/*
 *	gather_intra_linear_sync
 *
 *	Function:	- synchronized gather operation with segmented messages
 *	Accepts:	- same arguments as MPI_Gather(), first segment size
 *	Returns:	- MPI_SUCCESS or error code
 */
int
ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void *rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module,
                                         int first_segment_size)
{
    int i, ret, line, rank, size, first_segment_count;
    ompi_request_t **reqs = NULL;
    MPI_Aint extent, lb;
    size_t typelng;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));

    if (rank != root) {
        /* Non-root processes:
           - receive zero byte message from the root,
           - send the first segment of the data synchronously,
           - send the second segment of the data.
        */

        ompi_datatype_type_size(sdtype, &typelng);
        ompi_datatype_get_extent(sdtype, &lb, &extent);
        first_segment_count = scount;
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
                                      first_segment_count );

        ret = MCA_PML_CALL(recv(rbuf, 0, MPI_BYTE, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                comm, MPI_STATUS_IGNORE));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
                                (scount - first_segment_count), sdtype,
                                root, MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

    } else {

        /* Root process,
           - For every non-root node:
           - post irecv for the first segment of the message
           - send zero byte message to signal node to send the message
           - post irecv for the second segment of the message
           - wait for the first segment to complete
           - Copy local data if necessary
           - Waitall for all the second segments to complete.
        */
        char *ptmp;
        ompi_request_t *first_segment_req;
        reqs = coll_base_comm_get_reqs(module->base_data, size);
        if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }

        ompi_datatype_type_size(rdtype, &typelng);
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        first_segment_count = rcount;
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        ptmp = (char *) rbuf;
        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
            ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i,
                                     MCA_COLL_BASE_TAG_GATHER, comm,
                                     &first_segment_req));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* send sync message */
            ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_GATHER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* irecv for the second segment */
            ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
            ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
                                     rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
                                     &reqs[i]));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* wait on the first segment to complete */
            ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                       (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
                                       rcount, rdtype);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* wait all second segments to complete */
        ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
    }

    /* All done */
    return MPI_SUCCESS;
 error_hndl:
    if (NULL != reqs) {
        ompi_coll_base_free_reqs(reqs, size);
    }
    OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                   rank, __FILE__, line, ret ));
    (void)line;  // silence compiler warning
    return ret;
}
Example #21
int mca_sharedfp_sm_read_ordered_begin(mca_io_ompio_file_t *fh,
                                       void *buf,
                                       int count,
                                       struct ompi_datatype_t *datatype)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0;
    long sendBuff = 0;
    long *buff=NULL;
    long offsetBuff;
    OMPI_MPI_OFFSET_TYPE offsetReceived = 0;
    long bytesRequested = 0;
    int recvcnt = 1, sendcnt = 1;
    size_t numofBytes;
    int rank, size, i;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    if ( NULL == fh->f_sharedfp_data){
 	if ( mca_sharedfp_sm_verbose ) {
	    opal_output(ompi_sharedfp_base_framework.framework_output,
			"sharedfp_sm_read_ordered_begin: opening the shared file pointer\n");
	}
        shared_fp_base_module = fh->f_sharedfp;

        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm,
                                                        fh->f_filename,
                                                        fh->f_amode,
                                                        fh->f_info,
                                                        fh);
        if (  OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_sm_read_ordered_begin - error opening the shared file pointer\n");
            return ret;
        }
    }
    if ( true == fh->f_split_coll_in_use ) {
        opal_output(0,"Only one split collective I/O operation allowed per file handle at any given point in time!\n");
        return MPI_ERR_REQUEST;
    }



    /*Retrieve the new communicator*/
    sh = fh->f_sharedfp_data;

    /* Calculate the number of bytes to read*/
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    sendBuff = count * numofBytes;

    /* Get the ranks in the communicator */
    rank = ompi_comm_rank ( sh->comm );
    size = ompi_comm_size ( sh->comm );

    if ( 0  == rank ) {
        buff = (long*)malloc(sizeof(long) * size);
        if (  NULL == buff )
            return OMPI_ERR_OUT_OF_RESOURCE;
    }

    ret = sh->comm->c_coll.coll_gather ( &sendBuff, sendcnt, OMPI_OFFSET_DATATYPE,
					 buff, recvcnt, OMPI_OFFSET_DATATYPE, 0,
					 sh->comm, sh->comm->c_coll.coll_gather_module );
    if( OMPI_SUCCESS != ret){
	goto exit;
    }

    /* All the counts are now present in buff on rank 0.
    ** buff has one entry per process in the communicator.
    */
    if (  0 == rank ) {
        for (i = 0; i < size ; i ++) {
	    bytesRequested += buff[i];
	    if ( mca_sharedfp_sm_verbose ) {
		opal_output(ompi_sharedfp_base_framework.framework_output,
			    "mca_sharedfp_sm_read_ordered_begin: Bytes requested are %ld\n",
			    bytesRequested);
	    }
        }

        /* Request the offset to read bytesRequested bytes
	** only the root process needs to do the request,
	** since the root process will then tell the other
	** processes at what offset they should read their
	** share of the data.
	*/
        ret = mca_sharedfp_sm_request_position(sh,bytesRequested,&offsetReceived);
        if( OMPI_SUCCESS != ret){
	    goto exit;
        }
	if ( mca_sharedfp_sm_verbose ) {
	    opal_output(ompi_sharedfp_base_framework.framework_output,
			"mca_sharedfp_sm_read_ordered_begin: Offset received is %lld\n",offsetReceived);
	}

        buff[0] += offsetReceived;
        for (i = 1 ; i < size; i++)  {
            buff[i] += buff[i-1];
        }
    }

    /* Scatter the results to the other processes*/
    ret = sh->comm->c_coll.coll_scatter ( buff, sendcnt, OMPI_OFFSET_DATATYPE,
					  &offsetBuff, recvcnt, OMPI_OFFSET_DATATYPE, 0,
					  sh->comm, sh->comm->c_coll.coll_scatter_module );
    if( OMPI_SUCCESS != ret){
	goto exit;
    }

    /* Each process now has its own individual offset in offsetBuff */
    offset = offsetBuff - sendBuff;
    offset /= sh->sharedfh->f_etype_size;

    if ( mca_sharedfp_sm_verbose ) {
	opal_output(ompi_sharedfp_base_framework.framework_output,
		    "mca_sharedfp_sm_read_ordered_begin: Offset returned is %lld\n",offset);
    }

    /* read the data from the file */
    ret = ompio_io_ompio_file_iread_at_all(sh->sharedfh,offset,buf,count,datatype,
					   &fh->f_split_coll_req);
    fh->f_split_coll_in_use = true;

exit:
    if ( NULL != buff ) {
	free ( buff );
    }

    return ret;
}
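A minimal illustrative sketch (not OMPI code; the helper name is invented) of the offset assignment performed above: rank r starts reading at the group's base offset plus the byte counts of ranks 0..r-1 (an exclusive prefix sum), which is what the inclusive prefix sum over buff followed by the offsetBuff - sendBuff subtraction computes in two steps.

/* Illustration only: given each rank's requested byte count and the base
 * offset handed out for the whole group, compute where every rank starts. */
static void sketch_assign_start_offsets(const long *counts, int nranks,
                                        long base, long *start)
{
    long running = base;              /* exclusive prefix sum */
    for (int r = 0; r < nranks; r++) {
        start[r] = running;           /* where rank r begins reading */
        running += counts[r];         /* skip past rank r's share */
    }
}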
Example #22
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
 * gather_intra_pipeline, segmentation? */
int
ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount,
                                      struct ompi_datatype_t *sdtype,
                                      void *rbuf, int rcount,
                                      struct ompi_datatype_t *rdtype,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_recv = 0, err;
    char *ptmp     = NULL, *tempbuf  = NULL;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, sgap, ssize;
    MPI_Aint rextent, rgap, rsize;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_binomial rank %d", rank));

    /* create the binomial tree */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
    bmtree = data->cached_in_order_bmtree;

    ompi_datatype_type_extent(sdtype, &sextent);
    ompi_datatype_type_extent(rdtype, &rextent);
    ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
    rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * size, &rgap);

    vrank = (rank - root + size) % size;

    if (rank == root) {
        if (0 == root){
            /* root on 0, just use the recv buffer */
            ptmp = (char *) rbuf;
            if (sbuf != MPI_IN_PLACE) {
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not on 0, allocate temp buffer for recv,
             * rotate data at the end */
            tempbuf = (char *) malloc(rsize);
            if (NULL == tempbuf) {
                err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }

            ptmp = tempbuf - rgap;
            if (sbuf != MPI_IN_PLACE) {
                /* copy from sbuf to temp buffer */
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            } else {
                /* copy from rbuf to temp buffer  */
                err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
                                                          (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes, allocate temp buffer for data received from
         * children, the most we need is half of the total data elements due
         * to the property of the binomial tree */
        tempbuf = (char *) malloc(ssize);
        if (NULL == tempbuf) {
            err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        ptmp = tempbuf - sgap;
        /* local copy to tempbuf */
        err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                   ptmp, scount, sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        /* use sdtype,scount as rdtype,rcount since they are ignored on
         * non-root procs */
        rdtype = sdtype;
        rcount = scount;
        rextent = sextent;
        total_recv = rcount;
    } else {
        /* leaf nodes, no temp buffer needed, use sdtype,scount as
         * rdtype,rcount since they are ignored on non-root procs */
        ptmp = (char *) sbuf;
        total_recv = scount;
    }

    if (!(vrank % 2)) {
        /* all non-leaf nodes recv from children */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            int mycount = 0, vkid;
            /* figure out how much data I will receive from this child */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            if (mycount > (size - vkid))
                mycount = size - vkid;
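            /* vkid - vrank is the number of processes in this child's
               binomial subtree; the clamp by size - vkid keeps the last,
               possibly incomplete, subtree from being over-counted. */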
            mycount *= rcount;

            OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                         "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
                         rank, bmtree->tree_next[i], mycount));

            err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
                                    bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
                                    comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            total_recv += mycount;
        }
    }

    if (rank != root) {
        /* all nodes except root send to parents */
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
                     rank, bmtree->tree_prev, total_recv));

        err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
                                bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    if (rank == root) {
        if (root != 0) {
            /* rotate received data on root if root != 0 */
            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
                                                      (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }


            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
                                                      (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            free(tempbuf);
        }
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes */
        free(tempbuf);
    }
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}
Example #23
/*
 *	scan
 *
 *	Function:	- basic scan operation
 *	Accepts:	- same arguments as MPI_Scan()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If I'm rank 0, just copy into the receive buffer */

    if (0 == rank) {
        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Otherwise receive previous buffer and reduce. */

    else {
        /* Allocate a temporary buffer.  Rationale for this size is
         * listed in coll_basic_reduce.c.  Use this temporary buffer to
         * receive into, later. */

        ompi_datatype_get_extent(dtype, &lb, &extent);
        ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

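        /* true_extent + (count - 1) * extent bytes spans `count` elements of
           this datatype; the pointer is shifted by the lower bound below so
           that it can be used directly with this datatype. */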
        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;

        /* Copy the send buffer into the receive buffer. */

        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }
        }

        /* Receive the prior answer */

        err = MCA_PML_CALL(recv(pml_buffer, count, dtype,
                                rank - 1, MCA_COLL_BASE_TAG_SCAN, comm,
                                MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            if (NULL != free_buffer) {
                free(free_buffer);
            }
            return err;
        }

        /* Perform the operation */

        ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);

        /* All done */

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    }

    /* Send result to next process. */

    if (rank < (size - 1)) {
        return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_SCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* All done */

    return MPI_SUCCESS;
}
int mca_common_ompio_file_open (ompi_communicator_t *comm,
                              const char *filename,
                              int amode,
                              ompi_info_t *info,
                              mca_io_ompio_file_t *ompio_fh, bool use_sharedfp)
{
    int ret = OMPI_SUCCESS;
    int remote_arch;


    ompio_fh->f_iov_type = MPI_DATATYPE_NULL;
    ompio_fh->f_comm     = MPI_COMM_NULL;

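    /* exactly one of MPI_MODE_RDONLY, MPI_MODE_RDWR and MPI_MODE_WRONLY
       must be given */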
    if ( ((amode&MPI_MODE_RDONLY)?1:0) + ((amode&MPI_MODE_RDWR)?1:0) +
	 ((amode&MPI_MODE_WRONLY)?1:0) != 1 ) {
	return MPI_ERR_AMODE;
    }

    if ((amode & MPI_MODE_RDONLY) &&
        ((amode & MPI_MODE_CREATE) || (amode & MPI_MODE_EXCL))) {
	return  MPI_ERR_AMODE;
    }

    if ((amode & MPI_MODE_RDWR) && (amode & MPI_MODE_SEQUENTIAL)) {
	return MPI_ERR_AMODE;
    }

    ompio_fh->f_rank     = ompi_comm_rank (comm);
    ompio_fh->f_size     = ompi_comm_size (comm);
    remote_arch = opal_local_arch;
    ompio_fh->f_convertor = opal_convertor_create (remote_arch, 0);

    if ( true == use_sharedfp ) {
	ret = ompi_comm_dup (comm, &ompio_fh->f_comm);
	if ( OMPI_SUCCESS != ret )  {
	    goto fn_fail;
	}
    }
    else {
	/* No need to duplicate the communicator if the file_open is called
	   from the sharedfp component, since the comm used as an input
	   is already a dup of the user level comm. */
	ompio_fh->f_flags |= OMPIO_SHAREDFP_IS_SET;
	ompio_fh->f_comm = comm;
    }

    ompio_fh->f_fstype = NONE;
    ompio_fh->f_amode  = amode;
    ompio_fh->f_info   = info;
    ompio_fh->f_atomicity = 0;

    ompi_io_ompio_set_file_defaults (ompio_fh);
    ompio_fh->f_filename = filename;

    ompio_fh->f_split_coll_req    = NULL;
    ompio_fh->f_split_coll_in_use = false;

    /* Initialize the print queues here! */
    mca_common_ompio_initialize_print_queue(&ompio_fh->f_coll_write_time);
    mca_common_ompio_initialize_print_queue(&ompio_fh->f_coll_read_time);

    /* set some function pointers required for fcoll, fbtls and sharedfp modules*/
    ompio_fh->f_decode_datatype=ompi_io_ompio_decode_datatype;
    ompio_fh->f_generate_current_file_view=ompi_io_ompio_generate_current_file_view;

    ompio_fh->f_sort=ompi_io_ompio_sort;
    ompio_fh->f_sort_iovec=ompi_io_ompio_sort_iovec;

    ompio_fh->f_get_num_aggregators=mca_io_ompio_get_num_aggregators;
    ompio_fh->f_get_bytes_per_agg=mca_io_ompio_get_bytes_per_agg;
    ompio_fh->f_set_aggregator_props=mca_io_ompio_set_aggregator_props;

    /* This fix is needed for data sieving to work with
       two-phase collective I/O */
     if ((amode & MPI_MODE_WRONLY)){
       amode -= MPI_MODE_WRONLY;
       amode += MPI_MODE_RDWR;
     }
     /*--------------------------------------------------*/


    if (OMPI_SUCCESS != (ret = mca_fs_base_file_select (ompio_fh,
                                                        NULL))) {
        opal_output(1, "mca_fs_base_file_select() failed\n");
        goto fn_fail;
    }
    if (OMPI_SUCCESS != (ret = mca_fbtl_base_file_select (ompio_fh,
                                                          NULL))) {
        opal_output(1, "mca_fbtl_base_file_select() failed\n");
        goto fn_fail;
    }

    if (OMPI_SUCCESS != (ret = mca_fcoll_base_file_select (ompio_fh,
                                                           NULL))) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        goto fn_fail;
    }

    ompio_fh->f_sharedfp_component = NULL; /*component*/
    ompio_fh->f_sharedfp           = NULL; /*module*/
    ompio_fh->f_sharedfp_data      = NULL; /*data*/

    if ( true == use_sharedfp ) {
	if (OMPI_SUCCESS != (ret = mca_sharedfp_base_file_select (ompio_fh, NULL))) {
	    opal_output ( ompi_io_base_framework.framework_output,
			  "mca_sharedfp_base_file_select() failed\n");
	    ompio_fh->f_sharedfp           = NULL; /*module*/
	    /* It's ok not to have a shared file pointer module as long as the shared file
	    ** pointer operations are not used. However, the first call to any file_read/write_shared
	    ** function will return an error code.
	    */
	}

	/* open the file once more for the shared file pointer if required.
	** Per default, the shared file pointer specific actions are however
	** only performed on first access of the shared file pointer, except
	** for the addproc sharedfp component.
	**
	** Lazy open does not work for the addproc sharedfp
	** component since it starts by spawning a process using MPI_Comm_spawn.
	** For this, the first operation has to be collective, which we cannot
	** guarantee outside of the MPI_File_open operation.
	*/
	if ( NULL != ompio_fh->f_sharedfp &&
	     true == use_sharedfp &&
	     (!mca_io_ompio_sharedfp_lazy_open ||
	      !strcmp (ompio_fh->f_sharedfp_component->mca_component_name,
		       "addproc")               )) {
	    ret = ompio_fh->f_sharedfp->sharedfp_file_open(comm,
							   filename,
							   amode,
							   info,
							   ompio_fh);

	    if ( OMPI_SUCCESS != ret ) {
		goto fn_fail;
	    }
	}
    }

     /*Determine topology information if set*/
    if (ompio_fh->f_comm->c_flags & OMPI_COMM_CART){
        ret = mca_io_ompio_cart_based_grouping(ompio_fh);
	if(OMPI_SUCCESS != ret ){
	    ret = MPI_ERR_FILE;
	}
    }

    ret = ompio_fh->f_fs->fs_file_open (comm,
					filename,
					amode,
					info,
					ompio_fh);




    if ( OMPI_SUCCESS != ret ) {
	ret = MPI_ERR_FILE;
        goto fn_fail;
    }


    /* If file has been opened in the append mode, move the internal
       file pointer of OMPIO to the very end of the file. */
    if ( ompio_fh->f_amode & MPI_MODE_APPEND ) {
        OMPI_MPI_OFFSET_TYPE current_size;

        ompio_fh->f_fs->fs_file_get_size( ompio_fh,
                                          &current_size);
        mca_common_ompio_set_explicit_offset (ompio_fh, current_size);
    }



    return OMPI_SUCCESS;

    fn_fail:
        /* no need to free resources here, since the destructor
	 * is calling mca_io_ompio_file_close, which actually gets
	 * rid of all allocated memory items */

    return ret;
}
Example #25
static int
mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    int i, j, size, rank, err=MPI_SUCCESS;
    MPI_Request *preq;
    char *tmp_buffer;
    size_t max_size, rdtype_size;
    ptrdiff_t ext;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(rdtype, &rdtype_size);

    /* If only one process, we're done. */
    if (1 == size || 0 == rdtype_size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    for (i = 0, max_size = 0 ; i < size ; ++i) {
        size_t cur_size = ext * rcounts[i];   /* avoid shadowing the communicator size */

        max_size = cur_size > max_size ? cur_size : max_size;
    }

    /* Allocate a temporary buffer */
    tmp_buffer = calloc (max_size, 1);
    if (NULL == tmp_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* in-place alltoallv slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            /* Initiate all send/recv to/from others. */
            preq = tuned_module->tuned_data->mcct_reqs;

            if (i == rank && rcounts[j]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
                                                           tmp_buffer, (char *) rbuf + rdisps[j] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else if (j == rank && rcounts[i]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
                                                           tmp_buffer, (char *) rbuf + rdisps[i] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { goto error_hndl; }

            /* Free the requests. */
            mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (tmp_buffer);

    /* All done */

    return err;
}
Example #26
/**
 * Shared memory broadcast.
 *
 * For the root, the general algorithm is to wait for a set of
 * segments to become available.  Once it is, the root claims the set
 * by writing the current operation number and the number of processes
 * using the set to the flag.  The root then loops over the set of
 * segments; for each segment, it copies a fragment of the user's
 * buffer into the shared data segment and then writes the data size
 * into its children's control buffers.  The process is repeated until
 * all fragments have been written.
 *
 * For non-roots, for each set of buffers, they wait until the current
 * operation number appears in the in-use flag (i.e., written by the
 * root).  Then for each segment, they wait for a nonzero to appear
 * in their control buffers.  If they have children, they copy the
 * data from their parent's shared data segment into their shared data
 * segment, and write the data size into each of their children's
 * control buffers.  They then copy the data from their shared [local]
 * data segment into the user's output buffer.  The process is
 * repeated until all fragments have been received.  If they do not
 * have children, they copy the data directly from the parent's shared
 * data segment into the user's output buffer.
 */
int mca_coll_sm_bcast_intra(void *buff, int count,
                            struct ompi_datatype_t *datatype, int root,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    struct iovec iov;
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
    mca_coll_sm_comm_t *data;
    int i, ret, rank, size, num_children, src_rank;
    int flag_num, segment_num, max_segment_num;
    int parent_rank;
    size_t total_size, max_data, bytes;
    mca_coll_sm_in_use_flag_t *flag;
    opal_convertor_t convertor;
    mca_coll_sm_tree_node_t *me, *parent, **children;
    mca_coll_sm_data_index_t *index;

    /* Lazily enable the module the first time we invoke a collective
       on it */
    if (!sm_module->enabled) {
        if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
            return ret;
        }
    }
    data = sm_module->sm_comm_data;

    /* Setup some identities */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OBJ_CONSTRUCT(&convertor, opal_convertor_t);
    iov.iov_len = mca_coll_sm_component.sm_fragment_size;
    bytes = 0;

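    /* data->mcb_tree is laid out as if rank 0 were the root; shifting the
       index by `root` rotates the tree so that the actual root of this
       broadcast sits at the top. */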
    me = &data->mcb_tree[(rank + size - root) % size];
    parent = me->mcstn_parent;
    children = me->mcstn_children;
    num_children = me->mcstn_num_children;

    /* Only have one top-level decision as to whether I'm the root or
       not.  Do this at the slight expense of repeating a little logic
       -- but it's better than a conditional branch in every loop
       iteration. */

    /*********************************************************************
     * Root
     *********************************************************************/

    if (root == rank) {

        /* The root needs a send convertor to pack from the user's
           buffer to shared memory */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Main loop over sending fragments */

        do {
            flag_num = (data->mcb_operation_count++ %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_IDLE(flag, bcast_root_label);
            FLAG_RETAIN(flag, size - 1, data->mcb_operation_count - 1);

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {
                index = &(data->mcb_data_index[segment_num]);

                /* Copy the fragment from the user buffer to my fragment
                   in the current segment */
                max_data = mca_coll_sm_component.sm_fragment_size;
                COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data);
                bytes += max_data;

                /* Wait for the write to absolutely complete */
                opal_atomic_wmb();

                /* Tell my children that this fragment is ready */
                PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                       max_data);

                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);
        } while (bytes < total_size);
    }

    /*********************************************************************
     * Non-root
     *********************************************************************/

    else {

        /* Non-root processes need a receive convertor to unpack from
           shared memory to the user's buffer */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Loop over receiving (and possibly re-sending) the
           fragments */

        do {
            flag_num = (data->mcb_operation_count %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            /* Wait for the root to mark this set of segments as
               ours */
            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_OP(flag, data->mcb_operation_count, bcast_nonroot_label1);
            ++data->mcb_operation_count;

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {

                /* Pre-calculate some values */
                parent_rank = (parent->mcstn_id + root) % size;
                index = &(data->mcb_data_index[segment_num]);

                /* Wait for my parent to tell me that the segment is ready */
                CHILD_WAIT_FOR_NOTIFY(rank, index, max_data, bcast_nonroot_label2);

                /* If I have children, send the data to them */
                if (num_children > 0) {
                    /* Copy the fragment from the parent's portion in
                       the segment to my portion in the segment. */
                    COPY_FRAGMENT_BETWEEN(parent_rank, rank, index, max_data);

                    /* Wait for the write to absolutely complete */
                    opal_atomic_wmb();

                    /* Tell my children that this fragment is ready */
                    PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                           max_data);

                    /* Set the "copy from buffer" to be my local
                       segment buffer so that we don't potentially
                       incur a non-local memory copy from the parent's
                       fan out data segment [again] when copying to
                       the user's buffer */
                    src_rank = rank;
                }

                /* If I don't have any children, set the "copy from
                   buffer" to be my parent's fan out segment to copy
                   directly from my parent */

                else {
                    src_rank = parent_rank;
                }

                /* Copy to my output buffer */
                COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);

                bytes += max_data;
                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);

            /* Wait for all copy-out writes to complete before I say
               I'm done with the segments */
            opal_atomic_wmb();

            /* We're finished with this set of segments */
            FLAG_RELEASE(flag);
        } while (bytes < total_size);
    }

    /* Kill the convertor */

    OBJ_DESTRUCT(&convertor);

    /* All done */

    return OMPI_SUCCESS;
}
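The broadcast above pipelines the message through a small pool of shared-memory segments guarded by in-use flags. Below is a minimal, self-contained sketch of just the flag/segment arithmetic, with hypothetical constants standing in for the mca_coll_sm_component fields; it is an illustration of the indexing, not the real component API.

#include <stdio.h>

int main(void)
{
    const size_t fragment_size    = 8192;   /* stands in for sm_fragment_size         */
    const int    num_in_use_flags = 2;      /* stands in for sm_comm_num_in_use_flags */
    const int    segs_per_flag    = 8;      /* stands in for sm_segs_per_inuse_flag   */
    const size_t total_size       = 100000; /* packed message size (assumed)          */

    size_t bytes = 0;
    unsigned int operation_count = 0;

    while (bytes < total_size) {
        /* Each operation claims one in-use flag and the block of
           segments that belongs to that flag. */
        int flag_num        = operation_count++ % num_in_use_flags;
        int segment_num     = flag_num * segs_per_flag;
        int max_segment_num = (flag_num + 1) * segs_per_flag;

        do {
            /* Walk the message in fragment-sized chunks. */
            size_t chunk = total_size - bytes;
            if (chunk > fragment_size) {
                chunk = fragment_size;
            }
            printf("op %u: flag %d, segment %d, %zu bytes\n",
                   operation_count - 1, flag_num, segment_num, chunk);
            bytes += chunk;
            ++segment_num;
        } while (bytes < total_size && segment_num < max_segment_num);
    }
    return 0;
}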
Example #27
int ompi_coll_libnbc_igatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype,
                              void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype,
                              int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
                              struct mca_coll_base_module_2_1_0_t *module) {
  int rank, p, res;
  MPI_Aint rcvext = 0;
  NBC_Schedule *schedule;
  char *rbuf, inplace;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  if (rank == root) {
    res = MPI_Type_extent(recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
      NBC_Error("MPI Error in MPI_Type_extent() (%i)", res);
      return res;
    }
  }

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* send to root */
  if (rank != root) {
    /* send msg to root */
    res = NBC_Sched_send (sendbuf, false, sendcount, sendtype, root, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }
  } else {
    for (int i = 0 ; i < p ; ++i) {
      rbuf = (char *) recvbuf + displs[i] * rcvext;
      if (i == root) {
        if (!inplace) {
          /* if I am the root - just copy the message */
          res = NBC_Copy (sendbuf, sendcount, sendtype, rbuf, recvcounts[i], recvtype,
                          comm);
          if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            OBJ_RELEASE(schedule);
            return res;
          }
        }
      } else {
        /* root receives message to the right buffer */
        res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtype, i, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          OBJ_RELEASE(schedule);
          return res;
        }
      }
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Init_handle (comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Start (handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}
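The routine above is the libnbc back end for MPI_Igatherv. For context, here is a minimal sketch of the calling side using only the public MPI interface; the variable-sized contributions (rank i sends i+1 integers) are an illustrative assumption.

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Each rank contributes (rank + 1) integers. */
    int sendcount = rank + 1;
    int *sendbuf = malloc(sendcount * sizeof(int));
    for (int i = 0; i < sendcount; ++i) {
        sendbuf[i] = rank;
    }

    /* Only the root needs the receive layout. */
    int *recvcounts = NULL, *displs = NULL, *recvbuf = NULL;
    if (0 == rank) {
        recvcounts = malloc(size * sizeof(int));
        displs     = malloc(size * sizeof(int));
        int total = 0;
        for (int i = 0; i < size; ++i) {
            recvcounts[i] = i + 1;
            displs[i]     = total;
            total        += recvcounts[i];
        }
        recvbuf = malloc(total * sizeof(int));
    }

    MPI_Request req;
    MPI_Igatherv(sendbuf, sendcount, MPI_INT,
                 recvbuf, recvcounts, displs, MPI_INT,
                 0, MPI_COMM_WORLD, &req);

    /* Overlap other work here, then complete the collective. */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    free(sendbuf); free(recvcounts); free(displs); free(recvbuf);
    MPI_Finalize();
    return 0;
}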
struct mca_sharedfp_base_module_1_0_0_t *
mca_sharedfp_lockedfile_component_file_query(mca_io_ompio_file_t *fh,
                                             int *priority)
{
    struct flock lock;
    int fd, err;
    /*char *filename;*/
    char filename[256];
    int rank;
    bool has_file_lock_support=false;

    *priority = mca_sharedfp_lockedfile_priority;

    /*get the rank of this process*/
    rank = ompi_comm_rank ( fh->f_comm);

    /*test, and update priority*/
    /*
     * This test checks whether fcntl returns success when asked to
     * establish a file lock.  It is intended for use on file systems
     * such as NFS that may not implement file locks.  The locked file
     * algorithm uses file locks to implement the shared file pointer
     * operations, and will not work properly if file locks are not
     * available.
     *
     * This is a simple test and has at least two limitations:
     *
     * 1. Some implementations of NFS are known to return success for
     * setting a file lock when in fact no lock has been set.  This
     * test will not detect such erroneous implementations of NFS.
     *
     * 2. Some implementations will hang (enter and wait indefinitely)
     * within the fcntl call.  This test will also hang in that case.
     * Under normal conditions, this test should only take a few
     * seconds to run.
     *
     * The test prints a message showing the success or failure of
     * setting the file lock if PRINT_TAG is set to true in sharedfp.h.
     * It also sets the priority to a non-zero value on success and
     * returns NULL on failure.  If there is a failure, the system
     * routine strerror is interrogated in order to print the reason.
     */

    /* Set the test filename: the original filename with .locktest.$rank appended. */
    snprintf(filename, sizeof(filename), "%s.locktest.%d", fh->f_filename, rank);

    lock.l_type   = F_WRLCK;
    lock.l_start  = 0;
    lock.l_whence = SEEK_SET;
    lock.l_len    = 100;
    lock.l_pid    = getpid();

    fd = open(filename, O_RDWR | O_CREAT, 0644);

    if ( -1 == fd ){
        opal_output(1,"mca_sharedfp_lockedfile_component_file_query: error opening file %s", filename);
        opal_output(1,"%s\n", strerror(errno));
        has_file_lock_support=false;
    }
    else{
        err = fcntl(fd, F_SETLKW, &lock);
        if ( mca_sharedfp_lockedfile_verbose ) {
            printf("mca_sharedfp_lockedfile_component_file_query: returned err=%d, for fd=%d\n",err,fd);
        }

        if (err) {
            opal_output(1, "mca_sharedfp_lockedfile_component_file_query: Failed to set a file lock on %s\n", filename );
            opal_output(1, "err=%d, errno=%d, EOPNOTSUPP=%d, EINVAL=%d, ENOSYS=%d, EACCES=%d, EAGAIN=%d, EBADF=%d\n",
                        err,errno,EOPNOTSUPP,EINVAL,ENOSYS,EACCES,EAGAIN,EBADF);
            opal_output(1,"%s\n", strerror(errno));

            if (errno == EACCES || errno == EAGAIN) {
                opal_output(1,"errno=EACCES || EAGAIN, Already locked by another process\n");
            }

        }
        else {
            if ( mca_sharedfp_lockedfile_verbose ) {
                printf( "mca_sharedfp_lockedfile_component_file_query: fcntl claims success in setting a file lock on %s\n", filename );
            }

            has_file_lock_support=true;
        }
        /* printf("err = %d, errno = %d\n", err, errno); */
        close(fd);
        unlink( filename );
    }
    /* *priority = 100; */
    if(has_file_lock_support){
        return &lockedfile;
    }

    *priority = 0;
    /* The module cannot run; return NULL to indicate that we are unable to run. */

    opal_output(1,"mca_sharedfp_lockedfile_component_file_query: cannot run, file locking not supported\n");
    return NULL;
}
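For reference, the same fcntl(F_SETLKW) probe can be exercised outside of Open MPI. The sketch below uses a hypothetical temporary filename instead of one derived from the MPI file handle, and involves no MPI at all; it only demonstrates the lock test described in the comment above.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Hypothetical test file; the component derives it from the MPI file name. */
    const char *filename = "/tmp/locktest.0";
    struct flock lock;

    lock.l_type   = F_WRLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start  = 0;
    lock.l_len    = 100;
    lock.l_pid    = getpid();

    int fd = open(filename, O_RDWR | O_CREAT, 0644);
    if (-1 == fd) {
        fprintf(stderr, "open(%s) failed: %s\n", filename, strerror(errno));
        return 1;
    }

    /* Like the component, treat fcntl success as "file locks work here". */
    if (0 != fcntl(fd, F_SETLKW, &lock)) {
        fprintf(stderr, "fcntl(F_SETLKW) failed: %s\n", strerror(errno));
        close(fd);
        unlink(filename);
        return 1;
    }

    printf("file locking appears to be supported on this filesystem\n");
    close(fd);
    unlink(filename);
    return 0;
}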
Example #29
/*
 * Initialize module on the communicator
 */
static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
                                        struct ompi_communicator_t *comm)
{
    mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module;
    hcoll_module->comm = comm;

    if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
        HCOL_ERROR("coll_hcol: __save_coll_handlers failed");
        return OMPI_ERROR;
    }

    hcoll_set_runtime_tag_offset(-100,mca_pml.pml_max_tag);
    hcoll_set_rte_halt_flag_address(&ompi_mpi_finalized);
    hcoll_set_rte_halt_flag_size(sizeof(ompi_mpi_finalized));

    hcoll_module->hcoll_context =
        hcoll_create_context((rte_grp_handle_t)comm);
    if (NULL == hcoll_module->hcoll_context){
        HCOL_VERBOSE(1,"hcoll_create_context returned NULL");
        return OMPI_ERROR;
    }


#if 0
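    /* The disabled block below is a ping-pong self-test of the hcoll RTE
       send/recv/test callbacks between ranks 0 and 1; it is only useful
       when debugging the RTE glue, hence compiled out with #if 0. */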
    {
        printf("\033[33mrank %d: DOING EXTRA TEST\033[0m\n",ompi_comm_rank(comm));
        fflush(stdout);
        sleep(1);
        rte_ec_handle_t handle;
        rte_grp_handle_t world_group = hcoll_rte_functions.rte_world_group_fn();
        int peer;
        const int max_count = 10000000;
        const int step = max_count/100;
        int buf = 0;
        int i;
        rte_request_handle_t req;
        peer = (ompi_comm_rank(comm)+1)%2;
        hcoll_rte_functions.get_ec_handles_fn(1,&peer,world_group,&handle);

        for (i=1; i<max_count+1; i++){
            if (0 == ompi_comm_rank(comm)){
                if (i/step*step == i){
                    printf("%d %% done...\n",i/step);fflush(stdout);
                }
                buf = 1;
                hcoll_rte_functions.send_fn(DTE_INT32,1,&buf,handle,world_group,0,&req);
            } else {
                hcoll_rte_functions.recv_fn(DTE_INT32,1,&buf,handle,world_group,0,&req);
            }
            int completed = 0;
            hcoll_rte_functions.test_fn(&req,&completed);
            while(!completed){
                hcoll_rte_functions.test_fn(&req,&completed);
                /*hcoll_rte_functions.rte_progress_fn();*/
                opal_progress();
            }
        }
        printf("\033[32mrank %d: EXTRA TEST PASS\033[0m\n",ompi_comm_rank(comm));
        fflush(stdout);
        sleep(1);
    }
#endif
    return OMPI_SUCCESS;
}
Example #30
/*
 *	allgatherv_inter
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_allgatherv_inter(void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, int *rcounts, int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL,*displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;
    ompi_request_t *req[2];

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        if (NULL == count) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        displace = (int *)malloc(sizeof(int) * size_local);
        if (NULL == displace) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }
    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll.coll_gather(&scount, 1, MPI_INT,
            count, 1, MPI_INT,
            0, comm->c_local_comm,
            comm->c_local_comm->c_coll.coll_gather_module);
    if (OMPI_SUCCESS != err) {
        return err;
    }
    if(0 == rank) {
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }
        /* Perform the gatherv locally with the first process as root */
        err = ompi_ddt_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            return OMPI_ERROR;
        }
        incr = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent*count[i];
        }
        ptmp = (char*)malloc(incr);
        if (NULL == ptmp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }
    err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype,
            ptmp, count, displace,
            sdtype,0, comm->c_local_comm,
            comm->c_local_comm->c_coll.coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        return err;
    }

    ompi_ddt_create_indexed(size,rcounts,disps,rdtype,&ndtype);
    ompi_ddt_commit(&ndtype);

    if (0 == rank) {
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }
        /* Exchange data between roots */
        err = MCA_PML_CALL(irecv(rbuf, 1, ndtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHERV, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(isend(ptmp, total, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHERV,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll.coll_bcast(rbuf, 1, ndtype,
            0, comm->c_local_comm,
            comm->c_local_comm->c_coll.coll_bcast_module);
    if (OMPI_SUCCESS != err) {
        return err;
    }

    ompi_ddt_destroy(&ndtype);
    if (NULL != ptmp) {
        free(ptmp);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    return err;

}
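The key step above is packing the per-rank receive layout (rcounts/disps) into a single indexed datatype, so the roots can exchange everything as "1 x ndtype" and then broadcast it locally. Here is a minimal sketch of that trick using the public MPI API; the per-rank counts are an illustrative assumption, and the sketch only builds and inspects the datatype.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size > 64) {
        size = 64;  /* keep the illustrative static arrays in bounds */
    }

    /* Hypothetical layout: rank i contributes i+1 ints, packed back to back. */
    int rcounts[64], disps[64], total = 0;
    for (int i = 0; i < size; ++i) {
        rcounts[i] = i + 1;
        disps[i]   = total;      /* displacement in units of the old type */
        total     += rcounts[i];
    }

    /* One indexed datatype describes the whole receive-buffer layout, so a
       single receive of "1 x ndtype" scatters each contribution to the right
       offset -- the same idea as the ompi_ddt_create_indexed() call above. */
    MPI_Datatype ndtype;
    MPI_Type_indexed(size, rcounts, disps, MPI_INT, &ndtype);
    MPI_Type_commit(&ndtype);

    int type_size;
    MPI_Type_size(ndtype, &type_size);
    printf("layout covers %d ints (%d bytes)\n", total, type_size);

    MPI_Type_free(&ndtype);
    MPI_Finalize();
    return 0;
}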