int
MPIDO_Reduce_binom(void * sendbuf,
		   void * recvbuf,
		   int count,
		   DCMF_Dt dcmf_dt,
		   DCMF_Op dcmf_op,
		   MPI_Datatype data_type,
		   int root,
		   MPID_Comm * comm)
{
  int rc, hw_root = comm->vcr[root];
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { reduce_cb_done, (void *) &active };
  DCMF_Geometry_t * geometry = &(comm->dcmf.geometry);

  rc = DCMF_Reduce(&MPIDI_CollectiveProtocols.binomial_reduce,
                   &request,
                   callback,
                   DCMF_MATCH_CONSISTENCY,
                   geometry,
                   hw_root,
                   sendbuf,
                   recvbuf,
                   count,
                   dcmf_dt,
                   dcmf_op);

  MPID_PROGRESS_WAIT_WHILE(active);
  return rc;
}
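This and the other DCMF-based examples in this collection share one completion idiom: a volatile active flag is set to 1, its address is packed into a DCMF_Callback_t, and MPID_PROGRESS_WAIT_WHILE(active) drives the progress engine until the callback clears the flag. The callbacks themselves (reduce_cb_done, barrier_cb_done, alltoallw_cb_done) are defined elsewhere in the source and are not part of these listings; the sketch below only illustrates what such a callback has to do, and its prototype is an assumption since the exact DCMF callback signature (with or without a DCMF_Error_t argument) differs between DCMF releases.

/* Illustrative sketch only -- not the actual reduce_cb_done from the source.
 * The essential work is clearing the flag the caller spins on in
 * MPID_PROGRESS_WAIT_WHILE(); the prototype shown here is assumed. */
static void reduce_cb_done(void *clientdata, DCMF_Error_t *error)
{
  volatile unsigned *active = (volatile unsigned *) clientdata;
  *active = 0;   /* releases MPID_PROGRESS_WAIT_WHILE(active) in the caller */
}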
int
MPIDO_Reduce_global_tree(void * sendbuf,
			 void * recvbuf,
			 int count,
			 DCMF_Dt dcmf_dt,
			 DCMF_Op dcmf_op,
			 MPI_Datatype data_type,
			 int root,
			 MPID_Comm * comm)
{
  int rc, hw_root = comm->vcr[root];
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { reduce_cb_done, (void *) &active };
  rc = DCMF_GlobalAllreduce(&MPIDI_Protocols.globalallreduce,
                            (DCMF_Request_t *)&request,
                            callback,
                            DCMF_MATCH_CONSISTENCY,
                            hw_root,
                            sendbuf,
                            recvbuf,
                            count,
                            dcmf_dt,
                            dcmf_op);
  MPID_PROGRESS_WAIT_WHILE(active);

  return rc;
}
int
MPIDO_Barrier_gi(MPID_Comm * comm)
{
  int rc;
  MPID_Comm * comm_world;
  MPID_Comm_get_ptr(MPI_COMM_WORLD, comm_world);
  DCMF_Callback_t callback = { barrier_cb_done,
			       (void *) &mpid_globalbarrier_active };

  /* initialize global active field */
  mpid_globalbarrier_active = 1;

  if (mpid_globalbarrier_restart)
    rc = DCMF_Restart (&mpid_globalbarrier_request);
  else
  {
    mpid_globalbarrier_restart = 1;
    rc = DCMF_GlobalBarrier(&MPIDI_Protocols.globalbarrier,
                            &mpid_globalbarrier_request, callback);
  }

  if (rc == DCMF_SUCCESS)
    MPID_PROGRESS_WAIT_WHILE(* (int *) callback.clientdata);

  return rc;
}
Example #4
int
MPID_Win_fence(int       assert,
               MPID_Win *win)
{
  int mpi_errno = MPI_SUCCESS;

  struct MPIDI_Win_sync* sync = &win->mpid.sync;
  MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
  sync->total    = 0;
  sync->started  = 0;
  sync->complete = 0;

  mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &mpi_errno);
  return mpi_errno;
}
Example #5
int
MPID_Probe(int source,
           int tag,
           MPID_Comm * comm,
           int context_offset,
           MPI_Status * status)
{
  const int context = comm->recvcontext_id + context_offset;

  if (source == MPI_PROC_NULL)
    {
        MPIR_Status_set_procnull(status);
        return MPI_SUCCESS;
    }
  MPID_PROGRESS_WAIT_WHILE(!MPIDI_Recvq_FU_r(source, tag, context, status));
  return MPI_SUCCESS;
}
int MPIDO_Alltoallw_torus(void *sendbuf,
			  int * sendcounts,
			  int * senddispls,
			  MPI_Datatype * sendtypes,
			  void * recvbuf,
			  int * recvcounts,
			  int * recvdispls,
			  MPI_Datatype *recvtypes,
			  MPID_Comm * comm)
{
  int rc;
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { alltoallw_cb_done, (void *) &active };
  DCMF_Geometry_t * geometry = &(comm->dcmf.geometry);   

  /* ignore some of the args passed in; use the ones set up in the comm ptr */
  unsigned * sndlen = comm->dcmf.sndlen;
  unsigned * sdispls = comm->dcmf.sdispls;
  unsigned * rcvlen = comm->dcmf.rcvlen;
  unsigned * rdispls = comm->dcmf.rdispls;
  unsigned * sndcounters = comm->dcmf.sndcounters;
  unsigned * rcvcounters = comm->dcmf.rcvcounters;

  rc = DCMF_Alltoallv(&MPIDI_CollectiveProtocols.torus_alltoallv,
                      &request,
                      callback,
                      DCMF_MATCH_CONSISTENCY,
                      geometry,
                      sendbuf,
                      sndlen,
                      sdispls,
                      recvbuf,
                      rcvlen,
                      rdispls,
                      sndcounters,
                      rcvcounters);

  MPID_PROGRESS_WAIT_WHILE(active);
  return rc;
}
int
MPIDO_Barrier_dcmf(MPID_Comm * comm)
{
  int rc;

  /* use local (thread safe) active field */
  volatile unsigned active;

  DCMF_Callback_t callback = { barrier_cb_done, (void *) &active};

  /* initialize local (thread safe) active field */
  active = 1;

  /* geometry sets up proper barrier for the geometry at init time */
  rc = DCMF_Barrier(&comm->dcmf.geometry, callback, DCMF_MATCH_CONSISTENCY);

  if (rc == DCMF_SUCCESS)
    MPID_PROGRESS_WAIT_WHILE(* (int *) callback.clientdata);

  return rc;
}
Example #8
int MPIDO_Gatherv(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv\n");
   int i;
   int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
   int pamidt = 1;
   MPID_Datatype *dt_ptr = NULL;
   MPI_Aint send_true_lb, recv_true_lb;
   char *sbuf, *rbuf;
   pami_type_t stype, rtype;
   int tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT];

   /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */
   /* MPI_IN_PLACE is a nonlocal decision. We will need a preallreduce if we ever have
    * multiple "good" gathervs that work on different counts for example */
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
      pamidt = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
      pamidt = 0;

   if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH gatherv algorithm\n");
      TRACE_ERR("GATHERV using MPICH\n");
      MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
       if(is_send_dev_buf)
       {
         scbuf = MPL_malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = (char *) sendbuf;
       size_t rtotal_buf;
       if(is_recv_dev_buf)
       {
         //Since displs can be non-contiguous, we need to calculate the max buffer size
         int highest_displs = displs[size - 1];
         int highest_recvcount = recvcounts[size - 1];
         for(i = 0; i < size; i++)
         {
           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
           {
             highest_displs = displs[i];
             highest_recvcount = recvcounts[i];
           }
         }
         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
         rcbuf = MPL_malloc(rtotal_buf);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rtotal_buf);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPL_free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPL_free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
      return MPIR_Gatherv(sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
               root, comm_ptr, mpierrno);
   }

   MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb);
   rbuf = (char *)recvbuf + recv_true_lb;
   sbuf = (void *) sendbuf;

   pami_xfer_t gatherv;

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs;

   gatherv.cmd.xfer_gatherv_int.sndbuf = NULL;
   gatherv.cmd.xfer_gatherv_int.stype = stype;
   gatherv.cmd.xfer_gatherv_int.stypecount = sendcount;

   if(rank == root)
   {
      if(sendbuf == MPI_IN_PLACE) 
      {
         if(unlikely(verbose))
            fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n");
         sbuf = PAMI_IN_PLACE;
         gatherv.cmd.xfer_gatherv_int.stype = rtype;
         gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank];
      }
      else
      {
         MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb);
         sbuf = (char *)sbuf + send_true_lb;
      }
   }
   gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;

   pami_algorithm_t my_gatherv;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized gatherv %s was selected\n",
         mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name);
      my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0];
      queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0];
   }
   else
   {
      TRACE_ERR("Optimized gatherv %s was set by user\n",
         mpid->user_metadata[PAMI_XFER_GATHERV_INT].name);
         my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT];
         my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT];
         queryreq = selected_type;
   }

   gatherv.algorithm = my_gatherv;


   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || 
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying gatherv protocol %s, type was %d\n", 
         my_md->name, queryreq);
      if(my_md->check_fn == NULL)
      {
         /* process metadata bits */
         if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
            result.check.unspecified = 1;
/* Can't check ranges like this.  Non-local.  Comment out for now.
         if(my_md->check_correct.values.rangeminmax)
         {
            MPI_Aint data_true_lb;
            MPID_Datatype *data_ptr;
            int data_size, data_contig;
            MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); 
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; 
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
 */
      }
      else /* calling the check fn is sufficient */
         result = my_md->check_fn(&gatherv);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name);
         MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
         return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                             recvbuf, recvcounts, displs, recvtype,
                             root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }
   
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPL_thread_id_t tid;
      MPL_thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Post_coll_t gatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&gatherv);
   
   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   TRACE_ERR("Leaving MPIDO_Gatherv\n");
   return 0;
}
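The completion callback cb_gatherv used above is defined elsewhere in the pamid source. Assuming the standard PAMI event callback signature (a pami_event_function receives the context, the cookie, and a result code), a minimal sketch consistent with how the cookie is set up in MPIDO_Gatherv would be:

/* Hedged sketch: assumes the standard pami_event_function signature.
 * The cookie is the address of the volatile gatherv_active counter;
 * decrementing it to zero lets MPID_PROGRESS_WAIT_WHILE() return. */
static void cb_gatherv(pami_context_t context, void *cookie, pami_result_t result)
{
  volatile unsigned *active = (volatile unsigned *) cookie;
  (*active)--;
}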
Example #9
int MPIDO_Gatherv_simple(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
   int snd_contig = 1, rcv_contig = 1;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   void *sbuf = NULL, *rbuf = NULL;
   int  *rcounts = NULL;
   int  *rdispls = NULL;
   int send_size = 0;
   int recv_size = 0;
   int rcvlen    = 0;
  int totalrecvcount  = 0;
   pami_type_t rtype = PAMI_TYPE_NULL;
   MPID_Segment segment;
   MPID_Datatype *data_ptr = NULL;
   int send_true_lb, recv_true_lb = 0;
   int i, tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int recvok=PAMI_SUCCESS, recvcontinuous=0;

   if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig,
                            send_size, data_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }

    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_contig)
    {
      snd_noncontig_buff = MPL_malloc(send_size);
      sbuf = snd_noncontig_buff;
      if(snd_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal:  Cannot allocate pack buffer");
      }
      DLOOP_Offset last = send_size;
      MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
    }
  }
  else
  {
    MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                            rcvlen, data_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

   pami_xfer_t gatherv;
   rbuf = (char *)recvbuf + recv_true_lb;
   rcounts = (int*)recvcounts;
   rdispls = (int*)displs;
   if(rank == root)
   {
    if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
      {
        MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                                rcvlen, data_ptr, recv_true_lb);
      totalrecvcount = recvcounts[0];
      recvcontinuous = displs[0] == 0 ? 1 : 0;
      rcounts = (int *) MPL_malloc(size * sizeof(int));
      rdispls = (int *) MPL_malloc(size * sizeof(int));
      rdispls[0] = 0;
      rcounts[0] = rcvlen * recvcounts[0];
      for(i = 1; i < size; i++)
      {
        rdispls[i] = rcvlen * totalrecvcount;
        totalrecvcount += recvcounts[i];
        if(displs[i] != (displs[i-1] + recvcounts[i-1]))
          recvcontinuous = 0;
        rcounts[i] = rcvlen * recvcounts[i];
      }
      recv_size = rcvlen * totalrecvcount;

      rcv_noncontig_buff = MPL_malloc(recv_size);
      rbuf = rcv_noncontig_buff;
      rtype = PAMI_TYPE_BYTE;
      if(rcv_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal:  Cannot allocate pack buffer");
      }
      if(sendbuf == MPI_IN_PLACE)
      {
        size_t extent;
        MPID_Datatype_get_extent_macro(recvtype,extent);
        MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                       (char *)rcv_noncontig_buff + rdispls[rank], rcounts[rank], MPI_CHAR);
      }
    }
    if(sendbuf == MPI_IN_PLACE)
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
    }
    else
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    }
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;

  }
  else
  {
    gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;     
  }


  gatherv.cb_done = cb_gatherv;
  gatherv.cookie = (void *)&gatherv_active;
  gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
  gatherv.cmd.xfer_gatherv_int.rtype = rtype;
  gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
  gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;


  const pami_metadata_t *my_gatherv_md;

  gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
  my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

  MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

  MPIDI_Post_coll_t gatherv_post;
  TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&gatherv);
  TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

  TRACE_ERR("Waiting on active %d\n", gatherv_active);
  MPID_PROGRESS_WAIT_WHILE(gatherv_active);

  if(!rcv_contig || recvok != PAMI_SUCCESS)
  {
    if(recvcontinuous)
    {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                     recvbuf,   totalrecvcount,     recvtype);
    }
    else
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      for(i=0; i<size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)rdispls[i],(size_t)i,(size_t)rcounts[i],(size_t)rdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
      }

    }
      MPL_free(rcv_noncontig_buff);
      if(rank == root)
      {
         MPL_free(rcounts);
         MPL_free(rdispls);
      }
   }
   if(!snd_contig)  MPL_free(snd_noncontig_buff);


   TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
   return MPI_SUCCESS;
}
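As an illustration of the root's fallback path above, taken when the receive datatype cannot be converted to a native PAMI type (numbers invented for this example): with size = 2 ranks, recvcounts = {2, 3}, displs = {0, 2} and an 8-byte recvtype (rcvlen = 8), MPIDO_Gatherv_simple builds rcounts = {16, 24} and rdispls = {0, 16}, leaves recvcontinuous set because displs[1] equals displs[0] + recvcounts[0], gathers into the byte-typed pack buffer of recv_size = 40 bytes, and after the progress wait copies the packed bytes back into recvbuf with a single MPIR_Localcopy.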
Example #10
int MPIDO_Scatterv_simple(const void *sendbuf,
                   const int *sendcounts,
                   const int *displs,
                   MPI_Datatype sendtype,
                   void *recvbuf,
                   int recvcount,
                   MPI_Datatype recvtype,
                   int root,
                   MPID_Comm *comm_ptr,
                   int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  int snd_contig = 1;
  int rcv_contig = 1;
  int send_size = 0, recv_size = 0;
  int ssize = 0;
  MPID_Datatype *dt_ptr = NULL;
  MPI_Aint send_true_lb=0, recv_true_lb=0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  void *sbuf = NULL, *rbuf = NULL;
  int *sdispls = NULL, *scounts = NULL;
  int sndcount  = 0;
  MPID_Segment segment;
  int tmp, i;
  pami_type_t stype = PAMI_TYPE_NULL;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

  if (rank == root && sendtype != MPI_DATATYPE_NULL && sendcounts[0] >= 0)
  {
    MPIDI_Datatype_get_info(1, sendtype, snd_contig, ssize, dt_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                             recvbuf, recvcount, recvtype,
                             root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }

      }
    }
  }

  if (recvtype != MPI_DATATYPE_NULL && recvcount >= 0)
  {
    MPIDI_Datatype_get_info(recvcount, recvtype, rcv_contig,
                            recv_size, dt_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                             recvbuf, recvcount, recvtype,
                             root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }

      }
    }
  }

   pami_xfer_t scatterv;
   const pami_metadata_t *my_scatterv_md;
   volatile unsigned scatterv_active = 1;

   sbuf = (char *)sendbuf + send_true_lb;
   rbuf = (char *)recvbuf + recv_true_lb;
   scounts = (int*)sendcounts;
   sdispls = (int*)displs;
   if(rank == root)
   {
     if(MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)
     {
       if (!snd_contig)
       {
          scounts = (int*)MPIU_Malloc(size * sizeof(int));
          sdispls = (int*)MPIU_Malloc(size * sizeof(int));
          for(i = 0; i < size; i++)
          {
            scounts[i] = ssize * sendcounts[i];
            sdispls[i] = ssize * displs[i];
            send_size += scounts[i];
            sndcount  += sendcounts[i];
          }
          snd_noncontig_buff = MPIU_Malloc(send_size);
          sbuf = snd_noncontig_buff;
          stype = PAMI_TYPE_BYTE;
          if(snd_noncontig_buff == NULL)
          {
             MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                "Fatal:  Cannot allocate pack buffer");
          }
          DLOOP_Offset last = send_size;
          MPID_Segment_init(sendbuf, sndcount, sendtype, &segment, 0);
          MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
       }
     }
     if(recvbuf == MPI_IN_PLACE)
     {
       rbuf = PAMI_IN_PLACE;
     }
   }

   if(recvbuf != MPI_IN_PLACE)
   {
     if (!rcv_contig)
     {
       rcv_noncontig_buff = MPIU_Malloc(recv_size);
       rbuf = rcv_noncontig_buff;
       if(rcv_noncontig_buff == NULL)
       {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
             "Fatal:  Cannot allocate pack buffer");
       }
     }
   }

   scatterv.cb_done = cb_scatterv;
   scatterv.cookie = (void *)&scatterv_active;
   scatterv.cmd.xfer_scatterv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);

   scatterv.algorithm = mpid->coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0];
   my_scatterv_md = &mpid->coll_metadata[PAMI_XFER_SCATTERV_INT][0][0];
   
   scatterv.cmd.xfer_scatterv_int.rcvbuf = rbuf;
   scatterv.cmd.xfer_scatterv_int.sndbuf = sbuf;
   scatterv.cmd.xfer_scatterv_int.stype = stype;
   scatterv.cmd.xfer_scatterv_int.rtype = PAMI_TYPE_BYTE;/* rtype is ignored when rcvbuf == PAMI_IN_PLACE */
   scatterv.cmd.xfer_scatterv_int.stypecounts = (int *) scounts;
   scatterv.cmd.xfer_scatterv_int.rtypecount = recv_size;
   scatterv.cmd.xfer_scatterv_int.sdispls = (int *) sdispls;


   MPIDI_Update_last_algorithm(comm_ptr, my_scatterv_md->name);


   MPIDI_Post_coll_t scatterv_post;
   TRACE_ERR("%s scatterv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
   MPIDI_Context_post(MPIDI_Context[0], &scatterv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&scatterv);

   TRACE_ERR("Waiting on active %d\n", scatterv_active);
   MPID_PROGRESS_WAIT_WHILE(scatterv_active);

   if(!rcv_contig)
   {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                        recvbuf,         recvcount,     recvtype);
      MPIU_Free(rcv_noncontig_buff);
   }
   if(!snd_contig) 
   {
     MPIU_Free(snd_noncontig_buff);
     MPIU_Free(scounts);
     MPIU_Free(sdispls);
   }

   TRACE_ERR("Leaving MPIDO_Scatterv_optimized\n");
   return MPI_SUCCESS;
}
Example #11
int MPIDO_Barrier(MPID_Comm *comm_ptr, int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Barrier\n");
   volatile unsigned active=1;
   MPIDI_Post_coll_t barrier_post;
   pami_xfer_t barrier;
   pami_algorithm_t my_barrier;
   pami_metadata_t *my_barrier_md;
   int queryreq = 0;

   if(comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER] == MPID_COLL_USE_MPICH)
   {
     if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
       fprintf(stderr,"Using MPICH barrier\n");
      TRACE_ERR("Using MPICH Barrier\n");
      return MPIR_Barrier(comm_ptr, mpierrno);
   }

   barrier.cb_done = cb_barrier;
   barrier.cookie = (void *)&active;
   if(comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER] == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized barrier (%s) was pre-selected\n", comm_ptr->mpid.opt_protocol_md[PAMI_XFER_BARRIER][0].name);
      my_barrier = comm_ptr->mpid.opt_protocol[PAMI_XFER_BARRIER][0];
      my_barrier_md = &comm_ptr->mpid.opt_protocol_md[PAMI_XFER_BARRIER][0];
      queryreq = comm_ptr->mpid.must_query[PAMI_XFER_BARRIER][0];
   }
   else
   {
      TRACE_ERR("Barrier (%s) was specified by user\n", comm_ptr->mpid.user_metadata[PAMI_XFER_BARRIER].name);
      my_barrier = comm_ptr->mpid.user_selected[PAMI_XFER_BARRIER];
      my_barrier_md = &comm_ptr->mpid.user_metadata[PAMI_XFER_BARRIER];
      queryreq = comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER];
   }

   barrier.algorithm = my_barrier;
   /* There is no support for query-required barrier protocols here */
   MPID_assert_always(queryreq != MPID_COLL_ALWAYS_QUERY);
   MPID_assert_always(queryreq != MPID_COLL_CHECK_FN_REQUIRED);

   /* TODO: Name needs to be fixed somehow */
   MPIDI_Update_last_algorithm(comm_ptr, my_barrier_md->name);
   if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
     fprintf(stderr,"<%llx> Using protocol %s for barrier on %u\n", 
             threadID,
             my_barrier_md->name,
/*             comm_ptr->rank,comm_ptr->local_size,comm_ptr->remote_size,*/
            (unsigned) comm_ptr->context_id);
   }
   TRACE_ERR("%s barrier\n", MPIDI_Process.context_post.active>0?"posting":"invoking");
   MPIDI_Context_post(MPIDI_Context[0], &barrier_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&barrier);
   TRACE_ERR("barrier %s rc: %d\n", MPIDI_Process.context_post.active>0?"posted":"invoked", rc);

   TRACE_ERR("advance spinning\n");
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("exiting mpido_barrier\n");
   return 0;
}
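None of the PAMI examples here call PAMI_Collective directly: the prepared pami_xfer_t is handed to MPIDI_Context_post together with MPIDI_Pami_post_wrapper, and the TRACE_ERR strings distinguish whether the work is posted to the context or invoked inline based on MPIDI_Process.context_post.active. The wrapper is defined elsewhere in the pamid source; a plausible minimal sketch, assuming it does nothing more than forward the cookie to PAMI_Collective, is:

/* Hedged sketch: assumes the post wrapper simply forwards the prepared
 * transfer descriptor to PAMI_Collective on the context it runs on. */
static pami_result_t MPIDI_Pami_post_wrapper(pami_context_t context, void *cookie)
{
  pami_xfer_t *xfer = (pami_xfer_t *) cookie;
  return PAMI_Collective(context, xfer);
}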
Example #12
int
MPIDO_Allgatherv_simple(const void *sendbuf,
		 int sendcount,
		 MPI_Datatype sendtype,
		 void *recvbuf,
		 const int *recvcounts,
		 const int *displs,
		 MPI_Datatype recvtype,
		 MPID_Comm * comm_ptr,
                 int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Allgatherv_optimized\n");
  /* function pointer to be used to point to the appropriate algorithm */
  /* Check the nature of the buffers */
  MPID_Datatype *dt_null = NULL;
  MPI_Aint send_true_lb  = 0;
  MPI_Aint recv_true_lb  = 0;
  size_t   send_size     = 0;
  size_t   recv_size     = 0;
  size_t   rcvtypelen    = 0;
  int snd_data_contig = 0, rcv_data_contig = 0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  int scount=sendcount;

  char *sbuf, *rbuf;
  pami_type_t stype = NULL, rtype;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

  int recvcontinuous=0;
  size_t totalrecvcount=0;
  int *lrecvdispls = NULL; /* possible local displs calculated for noncontiguous */
  int *lrecvcounts  = NULL;/* possible local counts calculated for noncontiguous */
  const int *precvdispls = displs; /* pointer to displs to use as the pami param */
  const int *precvcounts = recvcounts; /* pointer to counts to use as the pami param */
  int inplace = sendbuf == MPI_IN_PLACE? 1 : 0;


  volatile unsigned allgatherv_active = 1;
  int recvok=PAMI_SUCCESS, sendok=PAMI_SUCCESS;
  int tmp;
  const pami_metadata_t *my_md;


   MPIDI_Datatype_get_info(1,
                          recvtype,
                          rcv_data_contig,
                          rcvtypelen,
                          dt_null,
                          recv_true_lb);

  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHERV_INT, rcvtypelen * recvcounts[0], advisor_algorithms, 1);
     if(num_algorithms)
     {
       if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
       {
         return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
       }
       else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
       {
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
           fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
       }
     }
   }


  if(!inplace)
  {
    sendok = MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp);
    MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig, send_size, dt_null, send_true_lb);
    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_data_contig || (sendok != PAMI_SUCCESS))
    {
      stype  = PAMI_TYPE_UNSIGNED_CHAR;
      scount = send_size;
      if(!snd_data_contig)
      {
        snd_noncontig_buff = MPIU_Malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
        {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                     "Fatal:  Cannot allocate pack buffer");
        }
        MPIR_Localcopy(sendbuf, sendcount, sendtype,
                       snd_noncontig_buff, send_size, MPI_CHAR);
      }
    }
  }
  }
  else
    sbuf = PAMI_IN_PLACE;

  recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp);
   rbuf = (char *)recvbuf+recv_true_lb;
  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    rtype = PAMI_TYPE_UNSIGNED_CHAR;
    totalrecvcount = recvcounts[0];
    recvcontinuous = displs[0] == 0? 1 : 0 ;
    int i;
    precvdispls = lrecvdispls = MPIU_Malloc(size*sizeof(int));
    precvcounts = lrecvcounts = MPIU_Malloc(size*sizeof(int));
    lrecvdispls[0]= 0;
    lrecvcounts[0]= rcvtypelen * recvcounts[0];
    for(i=1; i<size; ++i)
    {
      lrecvdispls[i]= rcvtypelen * totalrecvcount;
      totalrecvcount += recvcounts[i];
      if(displs[i] != (displs[i-1] + recvcounts[i-1]))
        recvcontinuous = 0;
      lrecvcounts[i]= rcvtypelen * recvcounts[i];
    }
    recv_size = rcvtypelen * totalrecvcount;
    TRACE_ERR("Pack receive rcv_contig %zu, recvok %zd, totalrecvcount %zu, recvcontinuous %zu, rcvtypelen %zu, recv_size %zu\n",
              (size_t)rcv_data_contig, (size_t)recvok, (size_t)totalrecvcount, (size_t)recvcontinuous,(size_t)rcvtypelen, (size_t)recv_size);

    rcv_noncontig_buff = MPIU_Malloc(recv_size);
    rbuf = rcv_noncontig_buff;
    if(rcv_noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                 "Fatal:  Cannot allocate pack buffer");
    }
    if(inplace)
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                     (char *)rcv_noncontig_buff + precvdispls[rank], precvcounts[rank], MPI_CHAR);
      scount = precvcounts[rank];
      stype   = PAMI_TYPE_UNSIGNED_CHAR;
      sbuf    = PAMI_IN_PLACE;
    }
   }


   pami_xfer_t allgatherv;
   allgatherv.cb_done = allgatherv_cb_done;
   allgatherv.cookie = (void *)&allgatherv_active;
   allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf;
   allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf;
   allgatherv.cmd.xfer_allgatherv_int.stype = stype;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
   allgatherv.cmd.xfer_allgatherv_int.rtype = rtype;
   allgatherv.cmd.xfer_allgatherv_int.stypecount = scount;
  allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) precvcounts;
  allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) precvdispls;
   allgatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHERV_INT][0][0];
   my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHERV_INT][0][0];

   TRACE_ERR("Calling allgatherv via %s()\n", MPIDI_Process.context_post.active>0?"PAMI_Collective":"PAMI_Context_post");
   MPIDI_Post_coll_t allgatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&allgatherv);

   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active);
   MPID_PROGRESS_WAIT_WHILE(allgatherv_active);

  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    if(recvcontinuous)
    {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size,MPI_CHAR,
                     recvbuf, totalrecvcount, recvtype);
    }
    else
    {
      size_t extent;
      int i;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      for(i=0; i<size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff+ precvdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, precvcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)precvdispls[i],(size_t)i,(size_t)precvcounts[i],(size_t)precvdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
      }
    }
    MPIU_Free(rcv_noncontig_buff);
  }
  if(!snd_data_contig)  MPIU_Free(snd_noncontig_buff);
  if(lrecvdispls) MPIU_Free(lrecvdispls);
  if(lrecvcounts) MPIU_Free(lrecvcounts);

   return MPI_SUCCESS;
}
Example #13
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   const size_t BCAST_LIMIT =      0x40000000;
   int data_contig, rc;
   void *data_buffer    = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case its total size exceeds the integer range */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
			   data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
     fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
	     count,data_size_one, (size_t)data_size_one*(size_t)count, root,buffer);
   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPID_assert(new_count > 0);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
         if ((rc = MPIDO_Bcast(new_buffer,
                               new_count,
                               datatype,
                               root,
                               comm_ptr,
                               mpierrno)) != MPI_SUCCESS)
            return rc;
         new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer,
                         new_count,
                         datatype,
                         root,
                         comm_ptr,
                         mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were 
      handled above so this cast to int should be fine here.  
   */
   const int data_size = (int)data_size_sz;

   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
            "Fatal:  Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
        if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
        {
          my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
          my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
          queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
        }
        else
        {
          return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
        }
      }

      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
         mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast =  mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      }
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range if restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer,         count,     datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("leaving bcast\n");
   return 0;
}
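As a worked example of the BCAST_LIMIT chunking at the top of MPIDO_Bcast (numbers chosen purely for illustration): with a 4-byte datatype and count = 2^29 + 3 elements, data_size_sz is just over 2 GiB and exceeds BCAST_LIMIT (1 GiB), so new_count becomes 0x40000000 / 4 = 2^28 elements per chunk; the loop issues two full-chunk broadcasts, advancing new_buffer by 1 GiB after each, and the final recursive call handles the remaining count % new_count = 3 elements.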
Example #14
int MPIDO_Bcast_simple(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

   int data_contig;
   void *data_buffer    = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;

   /* Must calculate data_size based on count=1 in case its total size exceeds the integer range */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
			   data_contig, data_size_one, data_ptr, data_true_lb);
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
     advisor_algorithm_t advisor_algorithms[1];
     int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_BROADCAST, data_size_one * count, advisor_algorithms, 1);
     if(num_algorithms)
     {
       if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
       {
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
       }
     }
   }

   const int data_size = data_size_one*(size_t)count;

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
            "Fatal:  Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   const pami_metadata_t *my_bcast_md;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;
   my_bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_bcast_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer,         count,     datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
   return 0;
}
Example #15
int
MPIDO_Allgather(const void *sendbuf,
                int sendcount,
                MPI_Datatype sendtype,
                void *recvbuf,
                int recvcount,
                MPI_Datatype recvtype,
                MPID_Comm * comm_ptr,
                int *mpierrno)
{
  /* *********************************
   * Check the nature of the buffers
   * *********************************
   */
/*  MPIDO_Coll_config config = {1,1,1,1,1,1};*/
   int config[6], i;
   MPID_Datatype * dt_null = NULL;
   MPI_Aint send_true_lb = 0;
   MPI_Aint recv_true_lb = 0;
   int rc, comm_size = comm_ptr->local_size;
   size_t send_size = 0;
   size_t recv_size = 0;
   volatile unsigned allred_active = 1;
   volatile unsigned allgather_active = 1;
   pami_xfer_t allred;
   for (i=0;i<6;i++) config[i] = 1;
   pami_metadata_t *my_md;


   allred.cb_done = allred_cb_done;
   allred.cookie = (void *)&allred_active;
   /* Pick an algorithm that is guaranteed to work for the pre-allreduce */
   /* TODO: This needs selection for fast(er|est) allreduce protocol */
   allred.algorithm = comm_ptr->mpid.coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; 
   allred.cmd.xfer_allreduce.sndbuf = (void *)config;
   allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
   allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
   allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
   allred.cmd.xfer_allreduce.stypecount = 6;
   allred.cmd.xfer_allreduce.rtypecount = 6;
   allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *rbuf = NULL, *sbuf = NULL;

   use_alltoall = comm_ptr->mpid.allgathers[2];
   use_tree_reduce = comm_ptr->mpid.allgathers[0];
   use_bcast = comm_ptr->mpid.allgathers[1];
   use_pami = 
      (comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_USE_MPICH) ? 0 : 1;
/*   if(sendbuf == MPI_IN_PLACE) use_pami = 0;*/
   use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;


   TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami);
   if(!use_opt)
   {
      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
         fprintf(stderr,"Using MPICH allgather algorithm\n");
      TRACE_ERR("No options set/available; using MPICH for allgather\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
      return MPIR_Allgather(sendbuf, sendcount, sendtype,
                            recvbuf, recvcount, recvtype,
                            comm_ptr, mpierrno);
   }
   if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
      return MPI_SUCCESS;

   /* Gather datatype information */
   MPIDI_Datatype_get_info(recvcount,
			  recvtype,
        config[MPID_RECV_CONTIG],
			  recv_size,
			  dt_null,
			  recv_true_lb);

   send_size = recv_size;
   rbuf = (char *)recvbuf+recv_true_lb;

   if(sendbuf != MPI_IN_PLACE)
   {
      MPIDI_Datatype_get_info(sendcount,
                              sendtype,
                              config[MPID_SEND_CONTIG],
                              send_size,
                              dt_null,
                              send_true_lb);
      sbuf = (char *)sendbuf+send_true_lb;
   }
   else
   {
      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL))
         fprintf(stderr,"allgather MPI_IN_PLACE buffering\n");
      sbuf = (char *)recvbuf+recv_size*comm_ptr->rank;
   }
/*   fprintf(stderr,"sendount: %d, recvcount: %d send_size: %zd
     recv_size: %zd\n", sendcount, recvcount, send_size,
     recv_size);*/

  /* verify everyone's datatype contiguity */
  /* Check buffer alignment now, since we're pre-allreducing anyway */
  /* Only do this if one of the glue protocols is likely to be used */
  if(use_alltoall || use_tree_reduce || use_bcast)
  {
     config[MPID_ALIGNEDBUFFER] =
               !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F);

      /* #warning need to determine best allreduce for short messages */
      if(comm_ptr->mpid.preallreduces[MPID_ALLGATHER_PREALLREDUCE])
      {
         TRACE_ERR("Preallreducing in allgather\n");
         MPIDI_Post_coll_t allred_post;
         MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                            MPIDI_Pami_post_wrapper, (void *)&allred);

         MPID_PROGRESS_WAIT_WHILE(allred_active);
     }


       use_alltoall = comm_ptr->mpid.allgathers[2] &&
            config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];

      /* Note: some of the glue protocols use recv_size*comm_size rather than 
       * recv_size so we use that for comparison here, plus we pass that in
       * to those protocols. */
       use_tree_reduce = comm_ptr->mpid.allgathers[0] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
         config[MPID_RECV_CONTINUOUS] && (recv_size*comm_size % sizeof(int) == 0);

       use_bcast = comm_ptr->mpid.allgathers[1];

       TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami);
   }
   if(use_pami)
   {
      TRACE_ERR("Using PAMI-level allgather protocol\n");
      pami_xfer_t allgather;
      allgather.cb_done = allgather_cb_done;
      allgather.cookie = (void *)&allgather_active;
      allgather.cmd.xfer_allgather.rcvbuf = rbuf;
      allgather.cmd.xfer_allgather.sndbuf = sbuf;
      allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;
      allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
      allgather.cmd.xfer_allgather.stypecount = send_size;
      allgather.cmd.xfer_allgather.rtypecount = recv_size;
      if(comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_OPTIMIZED)
      {
         allgather.algorithm = comm_ptr->mpid.opt_protocol[PAMI_XFER_ALLGATHER][0];
         my_md = &comm_ptr->mpid.opt_protocol_md[PAMI_XFER_ALLGATHER][0];
      }
      else
      {
         allgather.algorithm = comm_ptr->mpid.user_selected[PAMI_XFER_ALLGATHER];
         my_md = &comm_ptr->mpid.user_metadata[PAMI_XFER_ALLGATHER];
      }

      if(unlikely( comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_ALWAYS_QUERY ||
                   comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_CHECK_FN_REQUIRED))
      {
         metadata_result_t result = {0};
         TRACE_ERR("Querying allgather protocol %s, type was: %d\n",
            my_md->name,
            comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER]);
         result = my_md->check_fn(&allgather);
         TRACE_ERR("bitmask: %#X\n", result.bitmask);
         if(!result.bitmask)
         {
            fprintf(stderr,"Query failed for %s.\n",
               my_md->name);
         }
      }

      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
      {
         unsigned long long int threadID;
         MPIU_Thread_id_t tid;
         MPIU_Thread_self(&tid);
         threadID = (unsigned long long int)tid;
         fprintf(stderr,"<%llx> Using protocol %s for allgather on %u\n", 
                 threadID,
                 my_md->name,
              (unsigned) comm_ptr->context_id);
      }
      TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
      MPIDI_Post_coll_t allgather_post;
      MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather);
      TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

      MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

      MPID_PROGRESS_WAIT_WHILE(allgather_active);
      TRACE_ERR("Allgather done\n");
      return PAMI_SUCCESS;
   }

   if(use_tree_reduce)
   {
      TRACE_ERR("Using allgather via allreduce\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE");
     rc = MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_size, recv_size*comm_size, comm_ptr, mpierrno);
      return rc;
   }
   if(use_alltoall)
   {
      TRACE_ERR("Using allgather via alltoall\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL");
     rc = MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_size, recv_size*comm_size, comm_ptr, mpierrno);
      return rc;
   }

   if(use_bcast)
   {
      TRACE_ERR("Using allgather via bcast\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST");
     rc = MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_size, recv_size*comm_size, comm_ptr, mpierrno);
      return rc;
   }
   
   /* Nothing used yet; dump to MPICH */
   if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
      fprintf(stderr,"Using MPICH allgather algorithm\n");
   TRACE_ERR("Using allgather via mpich\n");
   MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
   return MPIR_Allgather(sendbuf, sendcount, sendtype,
                         recvbuf, recvcount, recvtype,
                         comm_ptr, mpierrno);
}
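The pre-allreduce in the function above is worth calling out: every rank contributes a small vector of 0/1 flags (datatype contiguity, buffer alignment, receive-layout continuity) and a bitwise-AND reduction leaves a flag set only if the property holds on every rank, which is what makes the later glue-protocol choices safe. Below is a minimal, self-contained sketch of the same consensus idea expressed with plain MPI; the helper name and the flag indices are illustrative, not taken from the source (PAMI_DATA_BAND corresponds to MPI_BAND here).

#include <mpi.h>
#include <stdint.h>

/* Illustrative sketch of the pre-allreduce consensus step (hypothetical
 * helper name and flag indices). */
static void preallreduce_flags_sketch(MPI_Comm comm,
                                      int send_contig, int recv_contig,
                                      const void *sendbuf, const void *recvbuf)
{
  int config[6];
  int i;
  for (i = 0; i < 6; i++)
    config[i] = 1;

  config[0] = send_contig;                       /* send datatype is contiguous  */
  config[1] = recv_contig;                       /* recv datatype is contiguous  */
  config[2] = !((uintptr_t)sendbuf & 0x0F) &&
              !((uintptr_t)recvbuf & 0x0F);      /* both buffers 16-byte aligned */

  /* Bitwise-AND allreduce: a flag survives only if it is 1 on every rank. */
  MPI_Allreduce(MPI_IN_PLACE, config, 6, MPI_INT, MPI_BAND, comm);
}

After the reduction, the glue-protocol flags (use_tree_reduce, use_alltoall) are re-evaluated against the agreed-upon values, exactly as the code above does.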
Example #16
void MPIDI_Coll_comm_create(MPID_Comm *comm)
{
   volatile int geom_init = 1;
   int i;
   MPIDI_Post_geom_create_t geom_post;

  TRACE_ERR("MPIDI_Coll_comm_create enter\n");
  if (!MPIDI_Process.optimized.collectives)
    return;

  if(comm->comm_kind != MPID_INTRACOMM) return;
  /* Create a geometry */

  comm->coll_fns = MPIU_Calloc0(1, MPID_Collops);
  MPID_assert(comm->coll_fns != NULL);

   if(comm->mpid.geometry != MPIDI_Process.world_geometry)
   {
      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
         fprintf(stderr,"world geom: %p parent geom: %p\n", MPIDI_Process.world_geometry, comm->mpid.parent);
      TRACE_ERR("Creating subgeom\n");
      /* Change to this at some point */

      comm->mpid.tasks = NULL;
      for(i=1;i<comm->local_size;i++)
      {
        /* Use a (single) range only if the tasks are sequential;
           multiple or reordered ranges are inefficient */
        if(MPID_VCR_GET_LPID(comm->vcr, i) != (MPID_VCR_GET_LPID(comm->vcr, i-1) + 1)) {
        /* not sequential, use tasklist */
          MPID_VCR_GET_LPIDS(comm, comm->mpid.tasks);
          break;
        }
      }
      /* Should we use a range? (no task list set) */
      if(comm->mpid.tasks == NULL)
      {
         /* one range, {first rank ... last rank} */
         comm->mpid.range.lo = MPID_VCR_GET_LPID(comm->vcr, 0);
         comm->mpid.range.hi = MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1);
      }

      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
         fprintf(stderr,"create geometry tasks %p {%u..%u}\n", comm->mpid.tasks, MPID_VCR_GET_LPID(comm->vcr, 0),MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1));

      pami_configuration_t config[3];
      size_t numconfigs = 0;
#ifdef HAVE_PAMI_GEOMETRY_NONCONTIG
      config[0].name = PAMI_GEOMETRY_NONCONTIG;
      if(MPIDI_Process.optimized.memory & MPID_OPT_LVL_NONCONTIG) 
         config[0].value.intval = 0; // Disable non-contig; pamid doesn't use PAMI for non-contig data collectives
      else
         config[0].value.intval = 1; // Enable non-contig: even though pamid doesn't use PAMI for non-contig data
                                     // collectives, we may still want those collectives for other reasons.
      ++numconfigs;
#endif
      if(MPIDI_Process.optimized.subcomms)
      {
         config[numconfigs].name = PAMI_GEOMETRY_OPTIMIZE;
         config[numconfigs].value.intval = 1; 
         ++numconfigs;
      }
#ifdef HAVE_PAMI_GEOMETRY_MEMORY_OPTIMIZE
      if(MPIDI_Process.optimized.memory) 
      {
         config[numconfigs].name = PAMI_GEOMETRY_MEMORY_OPTIMIZE;
         config[numconfigs].value.intval = MPIDI_Process.optimized.memory; /* level of optimization */
         ++numconfigs;
      }
#endif

      if((MPIDI_Process.optimized.memory  & MPID_OPT_LVL_IRREG) && (comm->local_size & (comm->local_size-1)))
      {
         /* Don't create irregular geometries.  Fallback to MPICH only collectives */
         geom_init = 0;
         comm->mpid.geometry = PAMI_GEOMETRY_NULL;
      }
      else if(comm->mpid.tasks == NULL)
      {   
         geom_post.client = MPIDI_Client;
         geom_post.configs = config;
         geom_post.context_offset = 0; /* TODO BES investigate */
         geom_post.num_configs = numconfigs;
         geom_post.newgeom = &comm->mpid.geometry;
         geom_post.parent = PAMI_GEOMETRY_NULL;
         geom_post.id     = comm->context_id;
         geom_post.ranges = &comm->mpid.range;
         geom_post.tasks = NULL;
         geom_post.count = (size_t)1;
         geom_post.fn = geom_create_cb_done;
         geom_post.cookie = (void*)&geom_init;

         TRACE_ERR("%s geom_rangelist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
         MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                            geom_rangelist_create_wrapper, (void *)&geom_post);
      }
      else
      {
         geom_post.client = MPIDI_Client;
         geom_post.configs = config;
         geom_post.context_offset = 0; /* TODO BES investigate */
         geom_post.num_configs = numconfigs;
         geom_post.newgeom = &comm->mpid.geometry;
         geom_post.parent = PAMI_GEOMETRY_NULL;
         geom_post.id     = comm->context_id;
         geom_post.ranges = NULL;
         geom_post.tasks = comm->mpid.tasks;
         geom_post.count = (size_t)comm->local_size;
         geom_post.fn = geom_create_cb_done;
         geom_post.cookie = (void*)&geom_init;

         TRACE_ERR("%s geom_tasklist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
         MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                            geom_tasklist_create_wrapper, (void *)&geom_post);
      }

      TRACE_ERR("Waiting for geom create to finish\n");
      MPID_PROGRESS_WAIT_WHILE(geom_init);

      if(comm->mpid.geometry == PAMI_GEOMETRY_NULL)
      {
         if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
            fprintf(stderr,"Created unoptimized communicator id=%u, size=%u\n", (unsigned) comm->context_id,comm->local_size);
         MPIU_TestFree(&comm->coll_fns);
         return;
      }
   }
   /* Initialize the async flow control in case it will be used. */
   comm->mpid.num_requests = MPIDI_Process.optimized.num_requests;

   TRACE_ERR("Querying protocols\n");
   /* Determine what protocols are available for this comm/geom */
   /* These two functions moved to mpid_collselect.c */
   MPIDI_Comm_coll_query(comm);
   MPIDI_Comm_coll_envvars(comm);
   if(MPIDI_Process.optimized.select_colls)
      MPIDI_Comm_coll_select(comm);
   TRACE_ERR("mpir barrier\n");
   int mpierrno = FALSE;
   /* Switch to comm->coll_fns->fn() */
   MPIDO_Barrier(comm, &mpierrno);


  TRACE_ERR("MPIDI_Coll_comm_create exit\n");
}
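One detail in the geometry creation above is the range-versus-tasklist decision: a single {lo..hi} range can describe the geometry only when the communicator's local process ids are strictly consecutive; otherwise a full task list is built with MPID_VCR_GET_LPIDS. A small standalone sketch of that test follows (hypothetical helper name, for illustration only):

/* Returns 1 if the ids are strictly consecutive (a single range suffices),
 * 0 if they are sparse or reordered (a task list is needed). */
static int lpids_are_sequential(const int *lpids, int n)
{
  int i;
  for (i = 1; i < n; i++)
    if (lpids[i] != lpids[i - 1] + 1)
      return 0;
  return 1;
}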
Example #17
void MPIDI_Coll_comm_destroy(MPID_Comm *comm)
{
  TRACE_ERR("MPIDI_Coll_comm_destroy enter\n");
  int i;
  volatile int geom_destroy = 1;
  MPIDI_Post_geom_destroy_t geom_destroy_post;
  if (!MPIDI_Process.optimized.collectives)
    return;

  if(comm->comm_kind != MPID_INTRACOMM)
    return;

  /* It's possible (MPIR_Setup_intercomm_localcomm) to have an intracomm
     without a geometry even when using optimized collectives */
  if(comm->mpid.geometry == PAMI_GEOMETRY_NULL)
    return;

   MPIU_TestFree(&comm->coll_fns);
   for(i=0;i<PAMI_XFER_COUNT;i++)
   {
     TRACE_ERR("Freeing algo/meta %d\n", i);
     /* When allocating comm->mpid.coll_algorithm, we skip allocations for
        AM collectives, and comm->mpid.coll_algorithm is never explicitly
        initialized to NULLs, which could make MPIU_TestFree misbehave when
        freeing. So we skip the AM collectives here, just as we skip
        allocating them in MPIDI_Comm_coll_query. */
     if(i == PAMI_XFER_AMBROADCAST || i == PAMI_XFER_AMSCATTER ||
         i == PAMI_XFER_AMGATHER || i == PAMI_XFER_AMREDUCE)
         continue;

     MPIU_TestFree(&comm->mpid.coll_algorithm[i][0]);
     MPIU_TestFree(&comm->mpid.coll_algorithm[i][1]);
     MPIU_TestFree(&comm->mpid.coll_metadata[i][0]);
     MPIU_TestFree(&comm->mpid.coll_metadata[i][1]);
   }


   if(MPIDI_Process.optimized.auto_select_colls != MPID_AUTO_SELECT_COLLS_NONE && MPIDI_Process.optimized.auto_select_colls != MPID_AUTO_SELECT_COLLS_TUNE && comm->local_size > 1)
   {
     /* Destroy the fast query object. */
     pami_extension_collsel_query_destroy pamix_collsel_query_destroy =
      (pami_extension_collsel_query_destroy) PAMI_Extension_symbol(MPIDI_Collsel_extension,
                                                                        "Collsel_query_destroy");
     if(pamix_collsel_query_destroy != NULL)
     {
       pamix_collsel_query_destroy(&(comm->mpid.collsel_fast_query));
     }
   }

   TRACE_ERR("Destroying geometry\n");

   geom_destroy_post.client = MPIDI_Client;
   geom_destroy_post.geom = &comm->mpid.geometry;
   geom_destroy_post.fn = geom_destroy_cb_done;
   geom_destroy_post.cookie = (void *)&geom_destroy;
   TRACE_ERR("%s geom_destroy\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
   MPIDI_Context_post(MPIDI_Context[0], &geom_destroy_post.state,
                      geom_destroy_wrapper, (void *)&geom_destroy_post);

   TRACE_ERR("Waiting for geom destroy to finish\n");
   MPID_PROGRESS_WAIT_WHILE(geom_destroy);
   MPID_VCR_FREE_LPIDS(comm->mpid.tasks);
/*   TRACE_ERR("Freeing geometry ranges\n");
   MPIU_TestFree(&comm->mpid.tasks_descriptor.ranges);
*/
   TRACE_ERR("MPIDI_Coll_comm_destroy exit\n");
}
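Both the create and destroy paths rely on the same completion idiom: a volatile flag is set before posting the operation, the done-callback clears it, and the caller spins on the progress engine via MPID_PROGRESS_WAIT_WHILE until the callback has fired. A self-contained toy version of that pattern is sketched below; every name here is a hypothetical stand-in, not the pamid or PAMI API.

#include <stdio.h>

static volatile unsigned op_active;

/* Plays the role of geom_create_cb_done / geom_destroy_cb_done. */
static void sketch_cb_done(void *clientdata)
{
  *(volatile unsigned *)clientdata = 0;
}

/* Stands in for advancing the progress engine; here it simply "completes"
 * the outstanding operation by invoking the callback. */
static void sketch_advance_progress(void)
{
  sketch_cb_done((void *)&op_active);
}

int main(void)
{
  op_active = 1;                 /* operation is now in flight                   */
  while (op_active)              /* the MPID_PROGRESS_WAIT_WHILE(op_active) idea */
    sketch_advance_progress();
  printf("operation complete\n");
  return 0;
}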
Example #18
int
MPIDO_Allgather(const void *sendbuf,
                int sendcount,
                MPI_Datatype sendtype,
                void *recvbuf,
                int recvcount,
                MPI_Datatype recvtype,
                MPID_Comm * comm_ptr,
                int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  /* *********************************
   * Check the nature of the buffers
   * *********************************
   */
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   int config[6], i;
   MPID_Datatype * dt_null = NULL;
   MPI_Aint send_true_lb = 0;
   MPI_Aint recv_true_lb = 0;
   int comm_size = comm_ptr->local_size;
   size_t send_bytes = 0;
   size_t recv_bytes = 0;
   volatile unsigned allred_active = 1;
   volatile unsigned allgather_active = 1;
   pami_xfer_t allred;
   const int rank = comm_ptr->rank;
   int queryreq = 0;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHER];

   for (i=0;i<6;i++) config[i] = 1;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;


   allred.cb_done = allred_cb_done;
   allred.cookie = (void *)&allred_active;
   /* Pick an algorithm that is guaranteed to work for the pre-allreduce */
   /* TODO: This needs selection for fast(er|est) allreduce protocol */
   allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; 
   allred.cmd.xfer_allreduce.sndbuf = (void *)config;
   allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
   allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
   allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
   allred.cmd.xfer_allreduce.stypecount = 6;
   allred.cmd.xfer_allreduce.rtypecount = 6;
   allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *rbuf = NULL, *sbuf = NULL;

   const char * const allgathers = mpid->allgathers;
   use_alltoall = allgathers[2];
   use_tree_reduce = allgathers[0];
   use_bcast = allgathers[1];
   use_pami = (selected_type == MPID_COLL_USE_MPICH) ? 0 : 1;
   use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;


   TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami);
   if(!use_opt)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH allgather algorithm\n");
      TRACE_ERR("No options set/available; using MPICH for allgather\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
       if(is_send_dev_buf)
       {
         scbuf = MPL_malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = sendbuf;
       if(is_recv_dev_buf)
       {
         rcbuf = MPL_malloc(rdt_extent * recvcount);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rdt_extent * recvcount);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Allgather(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPL_free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPL_free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
      return MPIR_Allgather(sendbuf, sendcount, sendtype,
                            recvbuf, recvcount, recvtype,
                            comm_ptr, mpierrno);
   }
   if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
      return MPI_SUCCESS;

   /* Gather datatype information */
   MPIDI_Datatype_get_info(recvcount,
			  recvtype,
        config[MPID_RECV_CONTIG],
			  recv_bytes,
			  dt_null,
			  recv_true_lb);

   send_bytes = recv_bytes;
   rbuf = (char *)recvbuf+recv_true_lb;

   sbuf = PAMI_IN_PLACE;
   if(sendbuf != MPI_IN_PLACE)
   {
      MPIDI_Datatype_get_info(sendcount,
                            sendtype,
                            config[MPID_SEND_CONTIG],
                            send_bytes,
                            dt_null,
                            send_true_lb);
      sbuf = (char *)sendbuf+send_true_lb;
   }
   else
     if(unlikely(verbose))
       fprintf(stderr,"allgather MPI_IN_PLACE buffering\n");


  /* verify everyone's datatype contiguity */
  /* Check buffer alignment now, since we're pre-allreducing anyway */
  /* Only do this if one of the glue protocols is likely to be used */
  if(use_alltoall || use_tree_reduce || use_bcast)
  {
     config[MPID_ALIGNEDBUFFER] =
               !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F);

      /* #warning need to determine best allreduce for short messages */
      if(mpid->preallreduces[MPID_ALLGATHER_PREALLREDUCE])
      {
         TRACE_ERR("Preallreducing in allgather\n");
         MPIDI_Post_coll_t allred_post;
         MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                            MPIDI_Pami_post_wrapper, (void *)&allred);

         MPID_PROGRESS_WAIT_WHILE(allred_active);
     }


       use_alltoall = allgathers[2] &&
            config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];

      /* Note: some of the glue protocols use recv_bytes*comm_size rather than 
       * recv_bytes so we use that for comparison here, plus we pass that in
       * to those protocols. */
       use_tree_reduce =  allgathers[0] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
         config[MPID_RECV_CONTINUOUS] && (recv_bytes*comm_size%sizeof(unsigned)) == 0;

       use_bcast = allgathers[1];

       TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami);
   }
   if(use_pami)
   {
      TRACE_ERR("Using PAMI-level allgather protocol\n");
      pami_xfer_t allgather;
      allgather.cb_done = allgather_cb_done;
      allgather.cookie = (void *)&allgather_active;
      allgather.cmd.xfer_allgather.rcvbuf = rbuf;
      allgather.cmd.xfer_allgather.sndbuf = sbuf;
      allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;
      allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
      allgather.cmd.xfer_allgather.stypecount = send_bytes;
      allgather.cmd.xfer_allgather.rtypecount = recv_bytes;
      if(selected_type == MPID_COLL_OPTIMIZED)
      {
        if((mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] == 0) || 
           (mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] >= send_bytes))
        {
           allgather.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHER][0];
           my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHER][0];
           queryreq     = mpid->must_query[PAMI_XFER_ALLGATHER][0];
        }
        else
        {
           return MPIR_Allgather(sendbuf, sendcount, sendtype,
                       recvbuf, recvcount, recvtype,
                       comm_ptr, mpierrno);
        }
      }
      else
      {
         allgather.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHER];
         my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHER];
         queryreq     = selected_type;
      }

      if(unlikely( queryreq == MPID_COLL_ALWAYS_QUERY ||
                   queryreq == MPID_COLL_CHECK_FN_REQUIRED))
      {
         metadata_result_t result = {0};
         TRACE_ERR("Querying allgather protocol %s, type was: %d\n",
            my_md->name,
            selected_type);
         if(my_md->check_fn == NULL)
         {
           /* process metadata bits */
           if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
              result.check.unspecified = 1;
           if(my_md->check_correct.values.rangeminmax)
           {
             if((my_md->range_lo <= recv_bytes) &&
                (my_md->range_hi >= recv_bytes))
                ; /* ok, algorithm selected */
             else
             {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                 fprintf(stderr,"message size (%zu) outside range (%zu<->%zu) for %s.\n",
                         recv_bytes,
                         my_md->range_lo,
                         my_md->range_hi,
                         my_md->name);
               }
             }
           }
         }
         else /* calling the check fn is sufficient */
           result = my_md->check_fn(&allgather);
         TRACE_ERR("bitmask: %#X\n", result.bitmask);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
         if(result.bitmask)
         {
           if(unlikely(verbose))
             fprintf(stderr,"Query failed for %s.  Using MPICH allgather\n",
                     my_md->name);
           MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
           return MPIR_Allgather(sendbuf, sendcount, sendtype,
                       recvbuf, recvcount, recvtype,
                       comm_ptr, mpierrno);
         }
         if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
         { 
           comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
           int tmpmpierrno;   
           if(unlikely(verbose))
             fprintf(stderr,"Query barrier required for %s\n", my_md->name);
           MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
      }

      if(unlikely(verbose))
      {
         unsigned long long int threadID;
         MPL_thread_id_t tid;
         MPL_thread_self(&tid);
         threadID = (unsigned long long int)tid;
         fprintf(stderr,"<%llx> Using protocol %s for allgather on %u\n", 
                 threadID,
                 my_md->name,
              (unsigned) comm_ptr->context_id);
      }
      TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
      MPIDI_Post_coll_t allgather_post;
      MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather);

      MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
      MPID_PROGRESS_WAIT_WHILE(allgather_active);
      TRACE_ERR("Allgather done\n");
      return PAMI_SUCCESS;
   }

   if(use_tree_reduce)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using protocol GLUE_ALLREDUCE for allgather\n");
      TRACE_ERR("Using allgather via allreduce\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE");
     return MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno);
   }
   if(use_alltoall)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using protocol GLUE_BCAST for allgather\n");
      TRACE_ERR("Using allgather via alltoall\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL");
     return MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno);
   }

   if(use_bcast)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using protocol GLUE_ALLTOALL for allgather\n");
      TRACE_ERR("Using allgather via bcast\n");
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST");
     return MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno);
   }
   
   /* Nothing used yet; dump to MPICH */
   if(unlikely(verbose))
      fprintf(stderr,"Using MPICH allgather algorithm\n");
   TRACE_ERR("Using allgather via mpich\n");
   MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
   return MPIR_Allgather(sendbuf, sendcount, sendtype,
                         recvbuf, recvcount, recvtype,
                         comm_ptr, mpierrno);
}
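The GLUE_ALLREDUCE path above (MPIDO_Allgather_allreduce) builds an allgather out of an allreduce, which is why it insists on contiguous, "continuous" buffers whose total size is a multiple of sizeof(unsigned). The sketch below shows the underlying trick with plain MPI; the helper name and the use of a bitwise-OR reduction are assumptions for illustration, not lifted from the source.

#include <mpi.h>
#include <string.h>

/* Each rank zeroes the receive buffer, drops its own contribution at its
 * rank offset, and an in-place OR allreduce assembles the full result. */
static int allgather_via_allreduce_sketch(const void *sendbuf, size_t send_bytes,
                                          void *recvbuf, size_t total_bytes,
                                          MPI_Comm comm)
{
  int rank;
  MPI_Comm_rank(comm, &rank);

  memset(recvbuf, 0, total_bytes);
  memcpy((char *)recvbuf + (size_t)rank * send_bytes, sendbuf, send_bytes);

  /* total_bytes must be a multiple of sizeof(unsigned), mirroring the
   * (recv_bytes*comm_size % sizeof(unsigned)) == 0 check above. */
  return MPI_Allreduce(MPI_IN_PLACE, recvbuf,
                       (int)(total_bytes / sizeof(unsigned)),
                       MPI_UNSIGNED, MPI_BOR, comm);
}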
Example #19
int
MPIDO_Allgather_simple(const void *sendbuf,
                int sendcount,
                MPI_Datatype sendtype,
                void *recvbuf,
                int recvcount,
                MPI_Datatype recvtype,
                MPID_Comm * comm_ptr,
                int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
     /* *********************************
   * Check the nature of the buffers
   * *********************************
   */
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   MPID_Datatype * dt_null = NULL;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   MPI_Aint send_true_lb = 0;
   MPI_Aint recv_true_lb = 0;
   int snd_data_contig = 1, rcv_data_contig = 1;
   size_t send_size = 0;
   size_t recv_size = 0;
   MPID_Segment segment;
   volatile unsigned allgather_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

   const pami_metadata_t *my_md;

   char *rbuf = NULL, *sbuf = NULL;


   if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
      return MPI_SUCCESS;

   /* Gather datatype information */
   MPIDI_Datatype_get_info(recvcount,
			  recvtype,
			  rcv_data_contig,
			  recv_size,
			  dt_null,
			  recv_true_lb);

   send_size = recv_size;

  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHER, send_size, advisor_algorithms, 1);
    if(num_algorithms)
    {
      if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
      {
        return MPIR_Allgather(sendbuf, sendcount, sendtype,
                              recvbuf, recvcount, recvtype,
                              comm_ptr, mpierrno); 
      }
      else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
      {
        comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
        int tmpmpierrno;
        if(unlikely(verbose))
          fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
        MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
    }
  }

   rbuf = (char *)recvbuf+recv_true_lb;

  if(!rcv_data_contig)
  {
    rcv_noncontig_buff = MPL_malloc(recv_size * size);
    rbuf = rcv_noncontig_buff;
    if(rcv_noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                 "Fatal:  Cannot allocate pack buffer");
    }
    if(sendbuf == MPI_IN_PLACE)
    {
      sbuf = PAMI_IN_PLACE;
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      MPIR_Localcopy(recvbuf + (rank*recvcount*extent), recvcount, recvtype,
                       rcv_noncontig_buff + (rank*recv_size), recv_size,MPI_CHAR);
    }
  }

  if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount,
                           sendtype,
                           snd_data_contig,
                           send_size,
                           dt_null,
                           send_true_lb);

     sbuf = (char *)sendbuf+send_true_lb;

     if(!snd_data_contig)
     {
        snd_noncontig_buff = MPL_malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
        {
           MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
              "Fatal:  Cannot allocate pack buffer");
        }
        DLOOP_Offset last = send_size;
        MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
        MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
     }
  }
  else
    sbuf = PAMI_IN_PLACE;

   TRACE_ERR("Using PAMI-level allgather protocol\n");
   pami_xfer_t allgather;
   allgather.cb_done = allgather_cb_done;
   allgather.cookie = (void *)&allgather_active;
   allgather.cmd.xfer_allgather.rcvbuf = rbuf;
   allgather.cmd.xfer_allgather.sndbuf = sbuf;
   allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
   allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
   allgather.cmd.xfer_allgather.stypecount = send_size;
   allgather.cmd.xfer_allgather.rtypecount = recv_size;
   allgather.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHER][0][0];
   my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHER][0][0];

   TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
   MPIDI_Post_coll_t allgather_post;
   MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather);
   TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(allgather_active);
   if(!rcv_data_contig)
   {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size * size, MPI_CHAR,
                        recvbuf,         recvcount * size, recvtype);
      MPL_free(rcv_noncontig_buff);
   }
   if(!snd_data_contig)  MPL_free(snd_noncontig_buff);
   TRACE_ERR("Allgather done\n");
   return MPI_SUCCESS;
}
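MPIDO_Allgather_simple handles non-contiguous datatypes by packing into scratch buffers with MPID_Segment before the byte-oriented PAMI collective and unpacking afterwards with MPIR_Localcopy. Below is a minimal sketch of the equivalent handling at the MPI level, using MPI_Pack/MPI_Unpack; the helper name is hypothetical, MPI_IN_PLACE and error handling are omitted, and packed_size is assumed equal across ranks since allgather requires matching type signatures.

#include <mpi.h>
#include <stdlib.h>

static int allgather_packed_sketch(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int size, packed_size, pos = 0, i, rc;
  MPI_Aint lb, extent;

  MPI_Comm_size(comm, &size);
  MPI_Pack_size(sendcount, sendtype, comm, &packed_size);   /* upper bound on packed bytes */
  MPI_Type_get_extent(recvtype, &lb, &extent);

  char *sbuf = malloc((size_t)packed_size);                 /* packed local contribution   */
  char *rbuf = malloc((size_t)packed_size * size);          /* packed gathered data        */

  MPI_Pack(sendbuf, sendcount, sendtype, sbuf, packed_size, &pos, comm);
  rc = MPI_Allgather(sbuf, packed_size, MPI_PACKED,
                     rbuf, packed_size, MPI_PACKED, comm);

  /* Unpack each rank's block back into the caller's receive datatype. */
  for (i = 0; i < size; i++)
  {
    pos = 0;
    MPI_Unpack(rbuf + (size_t)i * packed_size, packed_size, &pos,
               (char *)recvbuf + (size_t)i * recvcount * extent,
               recvcount, recvtype, comm);
  }

  free(sbuf);
  free(rbuf);
  return rc;
}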
Example #20
int
MPIDO_Allgatherv(const void *sendbuf,
		 int sendcount,
		 MPI_Datatype sendtype,
		 void *recvbuf,
		 const int *recvcounts,
		 const int *displs,
		 MPI_Datatype recvtype,
		 MPID_Comm * comm_ptr,
                 int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Allgatherv\n");
  /* function pointer to be used to point to the appropriate algorithm */

  /* Check the nature of the buffers */
  MPID_Datatype *dt_null = NULL;
  MPI_Aint send_true_lb  = 0;
  MPI_Aint recv_true_lb  = 0;
  size_t   send_size     = 0;
  size_t   recv_size     = 0;
  int config[6];
  int scount=sendcount;

  int i, rc, buffer_sum = 0;
  const int size = comm_ptr->local_size;
  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *sbuf, *rbuf;
  const int rank = comm_ptr->rank;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int queryreq = 0;

#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHERV_INT];

  pami_xfer_t allred;
  volatile unsigned allred_active = 1;
  volatile unsigned allgatherv_active = 1;
  pami_type_t stype, rtype;
  int tmp;
  const pami_metadata_t *my_md = (pami_metadata_t *)NULL;

  for(i=0;i<6;i++) config[i] = 1;

  allred.cb_done = allred_cb_done;
  allred.cookie = (void *)&allred_active;
  allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0];
  allred.cmd.xfer_allreduce.sndbuf = (void *)config;
  allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
  allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.stypecount = 6;
  allred.cmd.xfer_allreduce.rtypecount = 6;
  allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  use_alltoall = mpid->allgathervs[2];
  use_tree_reduce = mpid->allgathervs[0];
  use_bcast = mpid->allgathervs[1];
  use_pami = selected_type != MPID_COLL_USE_MPICH;
	 
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
     use_pami = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
     use_pami = 0;

   use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;

   if(!use_opt) /* back to MPICH */
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH allgatherv type %u.\n",
             selected_type);
     TRACE_ERR("Using MPICH Allgatherv\n");
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
       if(is_send_dev_buf)
       {
         scbuf = MPIU_Malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = sendbuf;
       size_t rtotal_buf;
       if(is_recv_dev_buf)
       {
         // Since displs can be non-contiguous, we need to calculate the max buffer size
         int highest_displs = displs[size - 1];
         int highest_recvcount = recvcounts[size - 1];
         for(i = 0; i < size; i++)
         {
           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
           {
             highest_displs = displs[i];
             highest_recvcount = recvcounts[i];
           }
         }
         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
         rcbuf = MPIU_Malloc(rtotal_buf);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rtotal_buf);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Allgatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPIU_Free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPIU_Free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
     return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
			   recvbuf, recvcounts, displs, recvtype,
                          comm_ptr, mpierrno);
   }

   MPIDI_Datatype_get_info(1,
			  recvtype,
			  config[MPID_RECV_CONTIG],
			  recv_size,
			  dt_null,
			  recv_true_lb);

   if(sendbuf == MPI_IN_PLACE)
   {
     sbuf = PAMI_IN_PLACE;
     if(unlikely(verbose))
       fprintf(stderr,"allgatherv MPI_IN_PLACE buffering\n");
     stype = rtype;
     scount = recvcounts[rank];
     send_size = recv_size * scount; 
   }
   else
   {
      MPIDI_Datatype_get_info(sendcount,
                              sendtype,
                              config[MPID_SEND_CONTIG],
                              send_size,
                              dt_null,
                              send_true_lb);
       sbuf = (char *)sendbuf+send_true_lb;
   }

   rbuf = (char *)recvbuf+recv_true_lb;

   if(use_alltoall || use_bcast || use_tree_reduce)
   {
      if (displs[0])
       config[MPID_RECV_CONTINUOUS] = 0;

      for (i = 1; i < size; i++)
      {
        buffer_sum += recvcounts[i - 1];
        if (buffer_sum != displs[i])
        {
          config[MPID_RECV_CONTINUOUS] = 0;
          break;
        }
      }

      buffer_sum += recvcounts[size - 1];

      buffer_sum *= recv_size;

      /* disable with "safe allgatherv" env var */
      if(mpid->preallreduces[MPID_ALLGATHERV_PREALLREDUCE])
      {
         MPIDI_Post_coll_t allred_post;
         MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                            MPIDI_Pami_post_wrapper, (void *)&allred);

         MPID_PROGRESS_WAIT_WHILE(allred_active);
      }

      use_tree_reduce = mpid->allgathervs[0] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
         config[MPID_RECV_CONTINUOUS] && buffer_sum % sizeof(unsigned) == 0;

      use_alltoall = mpid->allgathervs[2] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];

      use_bcast = mpid->allgathervs[1];
   }

   if(use_pami)
   {
      pami_xfer_t allgatherv;
      allgatherv.cb_done = allgatherv_cb_done;
      allgatherv.cookie = (void *)&allgatherv_active;
      if(selected_type == MPID_COLL_OPTIMIZED)
      {
        if((mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] == 0) || 
           (mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] >= send_size))
        {		
          allgatherv.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHERV_INT][0];
          my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHERV_INT][0];
          queryreq     = mpid->must_query[PAMI_XFER_ALLGATHERV_INT][0];
        }
        else
          return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
      }
      else
      {  
        allgatherv.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHERV_INT];
        my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHERV_INT];
        queryreq     = selected_type;
      }
      
      allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf;
      allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf;

      allgatherv.cmd.xfer_allgatherv_int.stype = stype;
      allgatherv.cmd.xfer_allgatherv_int.rtype = rtype;
      allgatherv.cmd.xfer_allgatherv_int.stypecount = scount;
      allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) recvcounts;
      allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) displs;

      if(unlikely (queryreq == MPID_COLL_ALWAYS_QUERY ||
                   queryreq == MPID_COLL_CHECK_FN_REQUIRED))
      {
         metadata_result_t result = {0};
         TRACE_ERR("Querying allgatherv_int protocol %s, type was %d\n", my_md->name,
            selected_type);
         if(my_md->check_fn == NULL)
         {
           /* process metadata bits */
           if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
              result.check.unspecified = 1;
/* Can't check ranges like this.  Non-local.  Comment out for now.
          if(my_md->check_correct.values.rangeminmax)
           {
             MPI_Aint data_true_lb;
             MPID_Datatype *data_ptr;
             int data_size, data_contig;
             MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); 
             if((my_md->range_lo <= data_size) &&
                (my_md->range_hi >= data_size))
                ; 
             else
             {
                result.check.range = 1;
                if(unlikely(verbose))
                {   
                   fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                           data_size,
                           my_md->range_lo,
                           my_md->range_hi,
                           my_md->name);
                }
             }
           }
 */
         }
         else /* calling the check fn is sufficient */
           result = my_md->check_fn(&allgatherv);
         TRACE_ERR("Allgatherv bitmask: %#X\n", result.bitmask);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
         if(result.bitmask)
         {
           if(unlikely(verbose))
             fprintf(stderr,"Query failed for %s. Using MPICH allgatherv.\n", my_md->name);
           MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
           return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                                  recvbuf, recvcounts, displs, recvtype,
                                  comm_ptr, mpierrno);
         }
         if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
         { 
           comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
           int tmpmpierrno;   
           if(unlikely(verbose))
             fprintf(stderr,"Query barrier required for %s\n", my_md->name);
           MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
      }

      if(unlikely(verbose))
      {
         unsigned long long int threadID;
         MPIU_Thread_id_t tid;
         MPIU_Thread_self(&tid);
         threadID = (unsigned long long int)tid;
         fprintf(stderr,"<%llx> Using protocol %s for allgatherv on %u\n", 
                 threadID,
                 my_md->name,
              (unsigned) comm_ptr->context_id);
      }
      MPIDI_Post_coll_t allgatherv_post;
      MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state,
                         MPIDI_Pami_post_wrapper, (void *)&allgatherv);

      MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

      TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active);
      MPID_PROGRESS_WAIT_WHILE(allgatherv_active);

      return PAMI_SUCCESS;
   }

   /* TODO These need to be ordered by speed */
   if(use_tree_reduce)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using tree reduce allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_allreduce(sendbuf, sendcount, sendtype,
             recvbuf, recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLREDUCE");
     return rc;
   }

   if(use_bcast)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using bcast allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_bcast(sendbuf, sendcount, sendtype,
             recvbuf, recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_BCAST");
     return rc;
   }

   if(use_alltoall)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using alltoall allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_alltoall(sendbuf, sendcount, sendtype,
             recvbuf, (int *)recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLTOALL");
     return rc;
   }

   if(unlikely(verbose))
      fprintf(stderr,"Using MPICH allgatherv type %u.\n",
            selected_type);
   TRACE_ERR("Using MPICH for Allgatherv\n");
   MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
   return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
}
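The MPID_RECV_CONTINUOUS test above decides whether the gathered data is laid out back-to-back in the receive buffer, which the allreduce and alltoall glue paths require. As a small standalone sketch (hypothetical helper name, same logic as the displs loop above), the check reduces to:

/* Returns 1 when displs[0] is 0 and every displacement equals the running
 * sum of the preceding recvcounts, i.e. the receive layout has no gaps and
 * no reordering; returns 0 otherwise. */
static int recv_layout_is_continuous(const int *recvcounts, const int *displs, int size)
{
  int i, sum = 0;
  if (displs[0] != 0)
    return 0;
  for (i = 1; i < size; i++)
  {
    sum += recvcounts[i - 1];
    if (displs[i] != sum)
      return 0;
  }
  return 1;
}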