예제 #1
0
int MPIDO_Gatherv(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv\n");
   int i;
   int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
   int pamidt = 1;
   MPID_Datatype *dt_ptr = NULL;
   MPI_Aint send_true_lb, recv_true_lb;
   char *sbuf, *rbuf;
   pami_type_t stype, rtype;
   int tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT];

   /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */
   /* MPI_IN_PLACE is a nonlocal decision. We will need a preallreduce if we ever have
    * multiple "good" gathervs that work on different counts for example */
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
      pamidt = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
      pamidt = 0;

   if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH gatherv algorithm\n");
      TRACE_ERR("GATHERV using MPICH\n");
      MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
       if(is_send_dev_buf)
       {
         scbuf = MPIU_Malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = sendbuf;
       size_t rtotal_buf;
       if(is_recv_dev_buf)
       {
         //Since displs can be non-continous, we need to calculate max buffer size 
         int highest_displs = displs[size - 1];
         int highest_recvcount = recvcounts[size - 1];
         for(i = 0; i < size; i++)
         {
           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
           {
             highest_displs = displs[i];
             highest_recvcount = recvcounts[i];
           }
         }
         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
         rcbuf = MPIU_Malloc(rtotal_buf);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rtotal_buf);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPIU_Free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPIU_Free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
      return MPIR_Gatherv(sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
               root, comm_ptr, mpierrno);
   }

   MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb);
   rbuf = (char *)recvbuf + recv_true_lb;
   sbuf = (void *) sendbuf;

   pami_xfer_t gatherv;

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs;

   gatherv.cmd.xfer_gatherv_int.sndbuf = NULL;
   gatherv.cmd.xfer_gatherv_int.stype = stype;
   gatherv.cmd.xfer_gatherv_int.stypecount = sendcount;

   if(rank == root)
   {
      if(sendbuf == MPI_IN_PLACE) 
      {
         if(unlikely(verbose))
            fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n");
         sbuf = PAMI_IN_PLACE;
         gatherv.cmd.xfer_gatherv_int.stype = rtype;
         gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank];
      }
      else
      {
         MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb);
         sbuf = (char *)sbuf + send_true_lb;
      }
   }
   gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;

   pami_algorithm_t my_gatherv;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized gatherv %s was selected\n",
         mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name);
      my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0];
      queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0];
   }
   else
   {
      TRACE_ERR("Optimized gatherv %s was set by user\n",
         mpid->user_metadata[PAMI_XFER_GATHERV_INT].name);
         my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT];
         my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT];
         queryreq = selected_type;
   }

   gatherv.algorithm = my_gatherv;


   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || 
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying gatherv protocol %s, type was %d\n", 
         my_md->name, queryreq);
      if(my_md->check_fn == NULL)
      {
         /* process metadata bits */
         if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
            result.check.unspecified = 1;
/* Can't check ranges like this.  Non-local.  Comment out for now.
         if(my_md->check_correct.values.rangeminmax)
         {
            MPI_Aint data_true_lb;
            MPID_Datatype *data_ptr;
            int data_size, data_contig;
            MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); 
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; 
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
 */
      }
      else /* calling the check fn is sufficient */
         result = my_md->check_fn(&gatherv);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name);
         MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
         return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                             recvbuf, recvcounts, displs, recvtype,
                             root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }
   
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Post_coll_t gatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&gatherv);
   
   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   TRACE_ERR("Leaving MPIDO_Gatherv\n");
   return 0;
}
예제 #2
0
파일: mpio_err.c 프로젝트: ICLDisco/ompi
int MPIO_Err_return_file(MPI_File mpi_fh, int error_code)
{
    MPI_Errhandler e;
    void (*c_errhandler)(MPI_File *, int *, ... );
    int  kind;   /* Error handler kind (see below) */
    char error_msg[4096];
    int len;

    /* If the file pointer is not valid, we use the handler on
       MPI_FILE_NULL (MPI-2, section 9.7).  For now, this code assumes that 
       MPI_FILE_NULL has the default handler (return).  FIXME.  See
       below - the set error handler uses ADIOI_DFLT_ERR_HANDLER; 
    */

    /* First, get the handler and the corresponding function */
    if (mpi_fh == MPI_FILE_NULL) {
	e = ADIOI_DFLT_ERR_HANDLER;
    }
    else {
	ADIO_File fh;

	fh = MPIO_File_resolve(mpi_fh);
	e = fh->err_handler;
    }

    /* Actually, e is just the value provide by the MPICH routines
       file_set_errhandler.  This is actually a *pointer* to the
       errhandler structure.  We don't know that, so we ask
       the MPICH code to translate this object into an error handler.
       kind = 0: errors are fatal
       kind = 1: errors return
       kind = 2: errors call function
    */
    if (e == MPI_ERRORS_RETURN || e == MPIR_ERRORS_THROW_EXCEPTIONS || !e) {
	/* FIXME: This is a hack in case no error handler was set */
	kind = 1;
	c_errhandler = 0;
    }
    else {
	MPIR_Get_file_error_routine( e, &c_errhandler, &kind );
    }

    /* --BEGIN ERROR HANDLING-- */
    if (MPIR_Err_is_fatal(error_code) || kind == 0) 
    {
	ADIOI_Snprintf(error_msg, 4096, "I/O error: ");
	len = (int)strlen(error_msg);
	MPIR_Err_get_string(error_code, &error_msg[len], 4096-len, NULL);
	MPID_Abort(NULL, MPI_SUCCESS, error_code, error_msg);
    }
    /* --END ERROR HANDLING-- */
    else if (kind == 2) {
	(*c_errhandler)( &mpi_fh, &error_code, 0 ); 
    }
    else if (kind == 3) {
	MPIR_File_call_cxx_errhandler( &mpi_fh, &error_code, c_errhandler );
    }

    /* kind == 1 just returns */
    return error_code;
}
예제 #3
0
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   const size_t BCAST_LIMIT =      0x40000000;
   int data_contig, rc;
   void *data_buffer    = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case it's total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
			   data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
     fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
	     count,data_size_one, (size_t)data_size_one*(size_t)count, root,buffer);
   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPID_assert(new_count > 0);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
        if ((rc = MPIDO_Bcast(new_buffer,
                        new_count,
                        datatype,
                        root,
                        comm_ptr,
                              mpierrno)) != MPI_SUCCESS)
         return rc;
	 new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer,
                         new_count,
                         datatype,
                         root,
                         comm_ptr,
                         mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were 
      handled above so this cast to int should be fine here.  
   */
   const int data_size = (int)data_size_sz;

   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
            "Fatal:  Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
         mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
        if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
        {
          my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
          my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
          queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
        }
        else
        {
          return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
        }
      }

      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
         mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast =  mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         metadata_result_t result = {0};
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      } 
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range if restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer,         count,     datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("leaving bcast\n");
   return 0;
}
예제 #4
0
int MPIDO_Gatherv_simple(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
   int snd_contig = 1, rcv_contig = 1;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   void *sbuf = NULL, *rbuf = NULL;
   int  *rcounts = NULL;
   int  *rdispls = NULL;
   int send_size = 0;
   int recv_size = 0;
   int rcvlen    = 0;
  int totalrecvcount  = 0;
   pami_type_t rtype = PAMI_TYPE_NULL;
   MPID_Segment segment;
   MPID_Datatype *data_ptr = NULL;
   int send_true_lb, recv_true_lb = 0;
   int i, tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int recvok=PAMI_SUCCESS, recvcontinuous=0;

   if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig,
                            send_size, data_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }

    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_contig)
    {
      snd_noncontig_buff = MPIU_Malloc(send_size);
      sbuf = snd_noncontig_buff;
      if(snd_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal:  Cannot allocate pack buffer");
      }
      DLOOP_Offset last = send_size;
      MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
    }
  }
  else
  {
    MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                            rcvlen, data_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

   pami_xfer_t gatherv;
   rbuf = (char *)recvbuf + recv_true_lb;
   rcounts = (int*)recvcounts;
   rdispls = (int*)displs;
   if(rank == root)
   {
    if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
      {
        MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                                rcvlen, data_ptr, recv_true_lb);
      totalrecvcount = recvcounts[0];
      recvcontinuous = displs[0] == 0? 1 : 0 ;
          rcounts = (int*)MPIU_Malloc(size);
          rdispls = (int*)MPIU_Malloc(size);
      rdispls[0] = 0;
      rcounts[0] = rcvlen * recvcounts[0];
      for(i = 1; i < size; i++)
      {
        rdispls[i]= rcvlen * totalrecvcount;
        totalrecvcount += recvcounts[i];
        if(displs[i] != (displs[i-1] + recvcounts[i-1]))
          recvcontinuous = 0;
            rcounts[i] = rcvlen * recvcounts[i];
          }
      recv_size = rcvlen * totalrecvcount;

          rcv_noncontig_buff = MPIU_Malloc(recv_size);
          rbuf = rcv_noncontig_buff;
          rtype = PAMI_TYPE_BYTE;
          if(rcv_noncontig_buff == NULL)
          {
             MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                "Fatal:  Cannot allocate pack buffer");
          }
      if(sendbuf == MPI_IN_PLACE)
      {
        size_t extent;
        MPID_Datatype_get_extent_macro(recvtype,extent);
        MPIR_Localcopy(recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                     rcv_noncontig_buff + rdispls[rank], rcounts[rank],MPI_CHAR);
      }
    }
    if(sendbuf == MPI_IN_PLACE)
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
    }
    else
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    }
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;

  }
  else
  {
    gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;     
  }


  gatherv.cb_done = cb_gatherv;
  gatherv.cookie = (void *)&gatherv_active;
  gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
  gatherv.cmd.xfer_gatherv_int.rtype = rtype;
  gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
  gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;


  const pami_metadata_t *my_gatherv_md;

  gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
  my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

  MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

  MPIDI_Post_coll_t gatherv_post;
  TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&gatherv);
  TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

  TRACE_ERR("Waiting on active %d\n", gatherv_active);
  MPID_PROGRESS_WAIT_WHILE(gatherv_active);

  if(!rcv_contig || recvok != PAMI_SUCCESS)
  {
    if(recvcontinuous)
   {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                     recvbuf,   totalrecvcount,     recvtype);
    }
    else
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      for(i=0; i<size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)precvdispls[i],(size_t)i,(size_t)precvcounts[i],(size_t)precvdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
      }

    }
      MPIU_Free(rcv_noncontig_buff);
      if(rank == root)
      {
         MPIU_Free(rcounts);
         MPIU_Free(rdispls);
      }
   }
   if(!snd_contig)  MPIU_Free(snd_noncontig_buff);


   TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
   return MPI_SUCCESS;
}
예제 #5
0
int MPIDO_Reduce(const void *sendbuf, 
                 void *recvbuf, 
                 int count, 
                 MPI_Datatype datatype,
                 MPI_Op op, 
                 int root, 
                 MPID_Comm *comm_ptr, 
                 int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   MPID_Datatype *dt_null = NULL;
   MPI_Aint true_lb = 0;
   int dt_contig ATTRIBUTE((unused)), tsize;
   int mu;
   char *sbuf, *rbuf;
   pami_data_function pop;
   pami_type_t pdt;
   int rc;
   int alg_selected = 0;
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int selected_type = mpid->user_selected_type[PAMI_XFER_REDUCE];

   rc = MPIDI_Datatype_to_pami(datatype, &pdt, op, &pop, &mu);
   if(unlikely(verbose))
      fprintf(stderr,"reduce - rc %u, root %u, count %d, dt: %p, op: %p, mu: %u, selectedvar %u != %u (MPICH) sendbuf %p, recvbuf %p\n",
	      rc, root, count, pdt, pop, mu, 
	      (unsigned)selected_type, MPID_COLL_USE_MPICH,sendbuf, recvbuf);

   pami_xfer_t reduce;
   pami_algorithm_t my_reduce=0;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;
   volatile unsigned reduce_active = 1;

   MPIDI_Datatype_get_info(count, datatype, dt_contig, tsize, dt_null, true_lb);
   rbuf = (char *)recvbuf + true_lb;
   sbuf = (char *)sendbuf + true_lb;
   if(sendbuf == MPI_IN_PLACE) 
   {
      if(unlikely(verbose))
	fprintf(stderr,"reduce MPI_IN_PLACE send buffering (%d,%d)\n",count,tsize);
      sbuf = PAMI_IN_PLACE;
   }

   reduce.cb_done = reduce_cb_done;
   reduce.cookie = (void *)&reduce_active;
   if(mpid->optreduce) /* GLUE_ALLREDUCE */
   {
      char* tbuf = NULL;
      if(unlikely(verbose))
         fprintf(stderr,"Using protocol GLUE_ALLREDUCE for reduce (%d,%d)\n",count,tsize);
      MPIDI_Update_last_algorithm(comm_ptr, "REDUCE_OPT_ALLREDUCE");
      void *destbuf = recvbuf;
      if(rank != root) /* temp buffer for non-root destbuf */
      {
         tbuf = destbuf = MPL_malloc(tsize);
      }
      /* Switch to comm->coll_fns->fn() */
      MPIDO_Allreduce(sendbuf,
                      destbuf,
                      count,
                      datatype,
                      op,
                      comm_ptr,
                      mpierrno);
      if(tbuf)
         MPL_free(tbuf);
      return 0;
   }
   if(selected_type == MPID_COLL_USE_MPICH || rc != MPI_SUCCESS)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH reduce algorithm\n");
#if CUDA_AWARE_SUPPORT
      if(MPIDI_Process.cuda_aware_support_on)
      {
         MPI_Aint dt_extent;
         MPID_Datatype_get_extent_macro(datatype, dt_extent);
         char *scbuf = NULL;
         char *rcbuf = NULL;
         int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
         int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
         if(is_send_dev_buf)
         {
           scbuf = MPL_malloc(dt_extent * count);
           cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr) 
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           scbuf = sendbuf;
         if(is_recv_dev_buf)
         {
           rcbuf = MPL_malloc(dt_extent * count);
           if(sendbuf == MPI_IN_PLACE)
           {
             cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
             if (cudaSuccess != cudaerr)
               fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           }
           else
             memset(rcbuf, 0, dt_extent * count);
         }
         else
           rcbuf = recvbuf;
         int cuda_res =  MPIR_Reduce(scbuf, rcbuf, count, datatype, op, root, comm_ptr, mpierrno);
         if(is_send_dev_buf)MPL_free(scbuf);
         if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPL_free(rcbuf);
         }
         return cuda_res;
      }
      else
#endif
      return MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno);
   }

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      if((mpid->cutoff_size[PAMI_XFER_REDUCE][0] == 0) || 
          (mpid->cutoff_size[PAMI_XFER_REDUCE][0] >= tsize && mpid->cutoff_size[PAMI_XFER_REDUCE][0] > 0))
      {
        TRACE_ERR("Optimized Reduce (%s) was pre-selected\n",
         mpid->opt_protocol_md[PAMI_XFER_REDUCE][0].name);
        my_reduce    = mpid->opt_protocol[PAMI_XFER_REDUCE][0];
        my_md = &mpid->opt_protocol_md[PAMI_XFER_REDUCE][0];
        queryreq     = mpid->must_query[PAMI_XFER_REDUCE][0];
      }

   }
   else
   {
      TRACE_ERR("Optimized reduce (%s) was specified by user\n",
      mpid->user_metadata[PAMI_XFER_REDUCE].name);
      my_reduce    =  mpid->user_selected[PAMI_XFER_REDUCE];
      my_md = &mpid->user_metadata[PAMI_XFER_REDUCE];
      queryreq     = selected_type;
   }
   reduce.algorithm = my_reduce;
   reduce.cmd.xfer_reduce.sndbuf = sbuf;
   reduce.cmd.xfer_reduce.rcvbuf = rbuf;
   reduce.cmd.xfer_reduce.stype = pdt;
   reduce.cmd.xfer_reduce.rtype = pdt;
   reduce.cmd.xfer_reduce.stypecount = count;
   reduce.cmd.xfer_reduce.rtypecount = count;
   reduce.cmd.xfer_reduce.op = pop;
   reduce.cmd.xfer_reduce.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);


   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || 
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("Querying reduce protocol %s, type was %d\n",
                my_md->name,
                queryreq);
      if(my_md->check_fn == NULL)
      {
         /* process metadata bits */
         if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
            result.check.unspecified = 1;
         if(my_md->check_correct.values.rangeminmax)
         {
            MPI_Aint data_true_lb ATTRIBUTE((unused));
            MPID_Datatype *data_ptr;
            int data_size, data_contig ATTRIBUTE((unused));
            MPIDI_Datatype_get_info(count, datatype, data_contig, data_size, data_ptr, data_true_lb); 
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
      }
예제 #6
0
int MPIDO_Bcast_simple(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

   int data_contig;
   void *data_buffer    = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;

   /* Must calculate data_size based on count=1 in case it's total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
			   data_contig, data_size_one, data_ptr, data_true_lb);
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
     advisor_algorithm_t advisor_algorithms[1];
     int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_BROADCAST, data_size_one * count, advisor_algorithms, 1);
     if(num_algorithms)
     {
       if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
       {
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
       }
     }
   }

   const int data_size = data_size_one*(size_t)count;

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
            "Fatal:  Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   const pami_metadata_t *my_bcast_md;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;
   my_bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_bcast_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer,         count,     datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
   return 0;
}
예제 #7
0
MPID_Request * MPID_Request_create(void)
{
    MPID_Request * req;
    MPIDI_STATE_DECL(MPID_STATE_MPID_REQUEST_CREATE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_REQUEST_CREATE);
    
    req = MPIU_Handle_obj_alloc(&MPID_Request_mem);
    if (req != NULL)
    {
	MPIU_DBG_MSG_P(CH3_CHANNEL,VERBOSE,
		       "allocated request, handle=0x%08x", req->handle);
#ifdef MPICH_DBG_OUTPUT
	/*MPIU_Assert(HANDLE_GET_MPI_KIND(req->handle) == MPID_REQUEST);*/
	if (HANDLE_GET_MPI_KIND(req->handle) != MPID_REQUEST)
	{
	    int mpi_errno;
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, 
		       FCNAME, __LINE__, MPI_ERR_OTHER, 
		       "**invalid_handle", "**invalid_handle %d", req->handle);
	    MPID_Abort(MPIR_Process.comm_world, mpi_errno, -1, NULL);
	}
#endif
	/* FIXME: This makes request creation expensive.  We need to trim
	   this to the basics, with additional setup for special-purpose 
	   requests (think base class and inheritance).  For example, do we 
	   *really* want to set the kind to UNDEFINED? And should the RMA 
	   values be set only for RMA requests? */
	MPIU_Object_set_ref(req, 1);
	req->kind		   = MPID_REQUEST_UNDEFINED;
        MPID_cc_set(&req->cc, 1);
	req->cc_ptr		   = &req->cc;
	/* FIXME: status fields meaningful only for receive, and even then
	   should not need to be set. */
	req->status.MPI_SOURCE	   = MPI_UNDEFINED;
	req->status.MPI_TAG	   = MPI_UNDEFINED;
	req->status.MPI_ERROR	   = MPI_SUCCESS;
        MPIR_STATUS_SET_COUNT(req->status, 0);
        MPIR_STATUS_SET_CANCEL_BIT(req->status, FALSE);
	req->comm		   = NULL;
        req->greq_fns              = NULL;
	req->dev.datatype_ptr	   = NULL;
	req->dev.segment_ptr	   = NULL;
	/* Masks and flags for channel device state in an MPID_Request */
	req->dev.state		   = 0;
	req->dev.cancel_pending	   = FALSE;
	/* FIXME: RMA ops shouldn't need to be set except when creating a
	   request for RMA operations */
	req->dev.target_win_handle = MPI_WIN_NULL;
	req->dev.source_win_handle = MPI_WIN_NULL;
	req->dev.lock_queue_entry  = NULL;
	req->dev.dtype_info	   = NULL;
	req->dev.dataloop	   = NULL;
	req->dev.iov_offset        = 0;
        req->dev.flags             = MPIDI_CH3_PKT_FLAG_NONE;
        req->dev.resp_request_handle = MPI_REQUEST_NULL;
        req->dev.user_buf          = NULL;
        req->dev.OnDataAvail       = NULL;
        req->dev.OnFinal           = NULL;
#ifdef MPIDI_CH3_REQUEST_INIT
	MPIDI_CH3_REQUEST_INIT(req);
#endif
    }
    else
    {
	/* FIXME: This fails to fail if debugging is turned off */
	MPIU_DBG_MSG(CH3_CHANNEL,TYPICAL,"unable to allocate a request");
    }
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_REQUEST_CREATE);
    return req;
}
예제 #8
0
static int mpich_type_to_vmpi_type(
    MPI_Datatype			datatype)
{
    int					rc;
    
    DEBUG_FN_ENTRY(DEBUG_MODULE_TYPES);
    DEBUG_PRINTF(DEBUG_MODULE_TYPES, DEBUG_INFO_ARGS,
		 ("datatype=%d\n", datatype));

    /*
     g* Note: MPICH maps MPI_CHARACTER to MPI_CHAR and MPI_LONG_LONG to
     * MPI_LONG_LONG_INT so we have no way to distiguish the types from the
     * ones upon which they are mapped.
     */
    switch(datatype)
    {
      case MPI_CHAR:			rc = VMPI_CHAR;			break;
      case MPI_UNSIGNED_CHAR:		rc = VMPI_UNSIGNED_CHAR;	break;
      case MPI_BYTE:			rc = VMPI_BYTE;			break;
      case MPI_SHORT:			rc = VMPI_SHORT;		break;
      case MPI_UNSIGNED_SHORT:		rc = VMPI_UNSIGNED_SHORT;	break;
      case MPI_INT:			rc = VMPI_INT;			break;
      case MPI_UNSIGNED:		rc = VMPI_UNSIGNED;		break;
      case MPI_LONG:			rc = VMPI_LONG;			break;
      case MPI_UNSIGNED_LONG:		rc = VMPI_UNSIGNED_LONG;	break;
      case MPI_FLOAT:			rc = VMPI_FLOAT;		break;
      case MPI_DOUBLE:			rc = VMPI_DOUBLE;		break;
      case MPI_LONG_DOUBLE:		rc = VMPI_LONG_DOUBLE;		break;
      case MPI_LONG_LONG_INT:		rc = VMPI_LONG_LONG_INT;	break;
      case MPI_PACKED:			rc = VMPI_PACKED;		break;
      case MPI_LB:			rc = VMPI_LB;			break;
      case MPI_UB:			rc = VMPI_UB;			break;
      case MPI_FLOAT_INT:		rc = VMPI_FLOAT_INT;		break;
      case MPI_DOUBLE_INT:		rc = VMPI_DOUBLE_INT;		break;
      case MPI_LONG_INT:		rc = VMPI_LONG_INT;		break;
      case MPI_SHORT_INT:		rc = VMPI_SHORT_INT;		break;
      case MPI_2INT:			rc = VMPI_2INT;			break;
      case MPI_LONG_DOUBLE_INT:		rc = VMPI_LONG_DOUBLE_INT;	break;
      case MPI_COMPLEX:			rc = VMPI_COMPLEX;		break;
      case MPI_DOUBLE_COMPLEX:		rc = VMPI_DOUBLE_COMPLEX;	break;
      case MPI_LOGICAL:			rc = VMPI_LOGICAL;		break;
      case MPI_REAL:			rc = VMPI_REAL;			break;
      case MPI_DOUBLE_PRECISION:	rc = VMPI_DOUBLE_PRECISION;	break;
      case MPI_INTEGER:			rc = VMPI_INTEGER;		break;
      case MPI_2INTEGER:		rc = VMPI_2INTEGER;		break;
      case MPI_2COMPLEX:		rc = VMPI_2COMPLEX;		break;
      case MPI_2DOUBLE_COMPLEX:		rc = VMPI_2DOUBLE_COMPLEX;	break;
      case MPI_2REAL:			rc = VMPI_2REAL;		break;
      case MPI_2DOUBLE_PRECISION:	rc = VMPI_2DOUBLE_PRECISION;	break;

      default:
      {
	 char err[1024];
	 globus_libc_sprintf(
	    err, 
	    "mpich_type_to_vmpi_type() -  encountered unrecognizable type %d", 
	    datatype);
	  MPID_Abort(NULL,
                   0,
                   "MPICH-G2 (internal error)",
		   err);
       }
         break;
     } /* end switch */

  /* fn_exit: */
    DEBUG_PRINTF(DEBUG_MODULE_TYPES, DEBUG_INFO_RC,
		 ("rc=%d\n", rc));
    DEBUG_FN_EXIT(DEBUG_MODULE_TYPES);
    return rc;
}
예제 #9
0
파일: contextid.c 프로젝트: tjhei/fgmpi
void MPIR_Free_contextid(MPIU_Context_id_t context_id)
{
    int idx, bitpos, raw_prefix;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_FREE_CONTEXTID);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_FREE_CONTEXTID);

    /* Convert the context id to the bit position */
    raw_prefix = MPID_CONTEXT_READ_FIELD(PREFIX, context_id);
    idx = raw_prefix / MPIR_CONTEXT_INT_BITS;
    bitpos = raw_prefix % MPIR_CONTEXT_INT_BITS;

    /* --BEGIN ERROR HANDLING-- */
    if (idx < 0 || idx >= MPIR_MAX_CONTEXT_MASK) {
        MPID_Abort(0, MPI_ERR_INTERN, 1, "In MPIR_Free_contextid, idx is out of range");
    }
    /* --END ERROR HANDLING-- */

    /* The low order bits for dynamic context IDs don't have meaning the
     * same way that low bits of non-dynamic ctx IDs do.  So we have to
     * check the dynamic case first. */
    if (MPID_CONTEXT_READ_FIELD(DYNAMIC_PROC, context_id)) {
        MPIU_DBG_MSG_D(COMM, VERBOSE, "skipping dynamic process ctx id, context_id=%d", context_id);
        goto fn_exit;
    }
    else {      /* non-dynamic context ID */
        /* In terms of the context ID bit vector, intercomms and their constituent
         * localcomms have the same value.  To avoid a double-free situation we just
         * don't free the context ID for localcomms and assume it will be cleaned up
         * when the parent intercomm is itself completely freed. */
        if (MPID_CONTEXT_READ_FIELD(IS_LOCALCOMM, context_id)) {
#ifdef USE_DBG_LOGGING
            char dump_str[1024];
            dump_context_id(context_id, dump_str, sizeof(dump_str));
            MPIU_DBG_MSG_S(COMM, VERBOSE, "skipping localcomm id: %s", dump_str);
#endif
            goto fn_exit;
        }
        else if (MPID_CONTEXT_READ_FIELD(SUBCOMM, context_id)) {
            MPIU_DBG_MSG_D(COMM, VERBOSE, "skipping non-parent communicator ctx id, context_id=%d",
                           context_id);
            goto fn_exit;
        }
    }

    /* --BEGIN ERROR HANDLING-- */
    /* Check that this context id has been allocated */
    if ((context_mask[idx] & (0x1 << bitpos)) != 0) {
#ifdef USE_DBG_LOGGING
        char dump_str[1024];
        dump_context_id(context_id, dump_str, sizeof(dump_str));
        MPIU_DBG_MSG_S(COMM, VERBOSE, "context dump: %s", dump_str);
        MPIU_DBG_MSG_S(COMM, VERBOSE, "context mask = %s", context_mask_to_str());
#endif
        MPID_Abort(0, MPI_ERR_INTERN, 1, "In MPIR_Free_contextid, the context id is not in use");
    }
    /* --END ERROR HANDLING-- */

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
    /* MT: Note that this update must be done atomically in the multithreaedd
     * case.  In the "one, single lock" implementation, that lock is indeed
     * held when this operation is called. */
    context_mask[idx] |= (0x1 << bitpos);
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

    MPIU_DBG_MSG_FMT(COMM, VERBOSE,
                     (MPIU_DBG_FDEST,
                      "Freed context %d, mask[%d] bit %d (prefix=%#x)",
                      context_id, idx, bitpos, raw_prefix));
  fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_FREE_CONTEXTID);
}
예제 #10
0
/* The following is a handler that may be added to finalize to test whether
   handles remain allocated, including those from the direct blocks.
   
   When adding memory checking, this routine should be invoked as

   MPIR_Add_finalize(MPIU_CheckHandlesOnFinalize, objmem, 1);

   as part of the object intialization.

   The algorithm follows the following approach:
   
   The memory allocation approach manages a list of available objects.
   These objects are allocated from several places:
      "direct" - this is a block of preallocated space
      "indirect" - this is a block of blocks that are allocated as necessary.
                   E.g., objmem_ptr->indirect[0..objmem_ptr->indirect_size-1]
		   are pointers (or null) to a block of memory.  This block is
		   then divided into objects that are added to the avail list.

   To provide information on the handles that are still in use, we must 
   "repatriate" all of the free objects, at least virtually.  To give
   the best information, for each free item, we determine to which block
   it belongs.  
*/
static int MPIU_CheckHandlesOnFinalize( void *objmem_ptr )
{
    MPIU_Object_alloc_t *objmem = (MPIU_Object_alloc_t *)objmem_ptr;
    int i;
    MPIU_Handle_common *ptr;
    int leaked_handles = FALSE;
    int   directSize = objmem->direct_size;
    char *direct = (char *)objmem->direct;
    char *directEnd = (char *)direct + directSize * objmem->size - 1;
    int   nDirect = 0;
    int  *nIndirect = 0;

    /* Return immediately if this object has not allocated any space */
    if (!objmem->initialized) {
	return 0;
    }

    if (objmem->indirect_size > 0) {
	nIndirect = (int *)MPIU_Calloc( objmem->indirect_size, sizeof(int) );
    }
    /* Count the number of items in the avail list.  These include
       all objects, whether direct or indirect allocation */
    ptr = objmem->avail;
    while (ptr) {
	/* printf( "Looking at %p\n", ptr ); */
	/* Find where this object belongs */
	if ((char *)ptr >= direct && (char *)ptr < directEnd) {
	    nDirect++;
	}
	else {
	    void **indirect = (void **)objmem->indirect;
	    for (i=0; i<objmem->indirect_size; i++) {
		char *start = indirect[i];
		char *end   = start + HANDLE_BLOCK_SIZE *objmem->size;
		if ((char *)ptr >= start && (char *)ptr < end) {
		    nIndirect[i]++;
		    break;
		}
	    }
	    if (i == objmem->indirect_size) {
		/* Error - could not find the owning memory */
		/* Temp */
		printf( "Could not place object at %p in handle memory for type %s\n", ptr, MPIR_ObjectName( objmem ) );
		printf( "direct block is [%p,%p]\n", direct, directEnd );
		if (objmem->indirect_size) {
		    printf( "indirect block is [%p,%p]\n", indirect[0], 
			    (char *)indirect[0] + HANDLE_BLOCK_SIZE * 
			    objmem->size );
		}
	    }
	}
	ptr = ptr->next;
    }

    if (0) {
	/* Produce a report */
	printf( "Object handles:\n\ttype  \t%s\n\tsize  \t%d\n\tdirect size\t%d\n\
\tindirect size\t%d\n",
		MPIR_ObjectName( objmem ), objmem->size, objmem->direct_size, 
		objmem->indirect_size );
    }
    if (nDirect != directSize) {
        leaked_handles = TRUE;
	printf( "In direct memory block for handle type %s, %d handles are still allocated\n", MPIR_ObjectName( objmem ), directSize - nDirect );
    }
    for (i=0; i<objmem->indirect_size; i++) {
	if (nIndirect[i] != HANDLE_BLOCK_SIZE) {
            leaked_handles = TRUE;
	    printf( "In indirect memory block %d for handle type %s, %d handles are still allocated\n", i, MPIR_ObjectName( objmem ), HANDLE_BLOCK_SIZE - nIndirect[i] );
	}
    }

    if (nIndirect) { 
	MPIU_Free( nIndirect );
    }

    if (leaked_handles && MPIR_PARAM_ABORT_ON_LEAKED_HANDLES) {
        /* comm_world has been (or should have been) destroyed by this point,
         * pass comm=NULL */
        MPID_Abort(NULL, MPI_ERR_OTHER, 1, "ERROR: leaked handles detected, aborting");
        MPIU_Assert(0);
    }

    return 0;
}
예제 #11
0
파일: abort.c 프로젝트: jeffhammond/mpich
/*@
   MPI_Abort - Terminates MPI execution environment

Input Parameters:
+ comm - communicator of tasks to abort
- errorcode - error code to return to invoking environment

Notes:
Terminates all MPI processes associated with the communicator 'comm'; in
most systems (all to date), terminates `all` processes.

.N NotThreadSafe
Because the 'MPI_Abort' routine is intended to ensure that an MPI
process (and possibly an entire job), it cannot wait for a thread to
release a lock or other mechanism for atomic access.

.N Fortran

.N Errors
.N MPI_SUCCESS
@*/
int MPI_Abort(MPI_Comm comm, int errorcode)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *comm_ptr = NULL;
    /* FIXME: 100 is arbitrary and may not be long enough */
    char abort_str[100] = "", comm_name[MPI_MAX_OBJECT_NAME];
    int len = MPI_MAX_OBJECT_NAME;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_ABORT);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    /* FIXME: It isn't clear that Abort should wait for a critical section,
     * since that could result in the Abort hanging if another routine is
     * hung holding the critical section.  Also note the "not thread-safe"
     * comment in the description of MPI_Abort above. */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_ABORT);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm, mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* Get handles to MPI objects. */
    MPIR_Comm_get_ptr(comm, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate comm_ptr */
            MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, TRUE);
            /* If comm_ptr is not valid, it will be reset to null */
            if (mpi_errno)
                goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */

    if (!comm_ptr) {
        /* Use comm world if the communicator is not valid */
        comm_ptr = MPIR_Process.comm_self;
    }


    MPIR_Comm_get_name_impl(comm_ptr, comm_name, &len);
    if (len == 0) {
        MPL_snprintf(comm_name, MPI_MAX_OBJECT_NAME, "comm=0x%X", comm);
    }
    if (!MPIR_CVAR_SUPPRESS_ABORT_MESSAGE)
        /* FIXME: This is not internationalized */
        MPL_snprintf(abort_str, 100, "application called MPI_Abort(%s, %d) - process %d", comm_name,
                     errorcode, comm_ptr->rank);
    mpi_errno = MPID_Abort(comm_ptr, mpi_errno, errorcode, abort_str);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;
    /* --END ERROR HANDLING-- */
    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_ABORT);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* It is not clear that doing aborting abort makes sense.  We may
     * want to specify that erroneous arguments to MPI_Abort will
     * cause an immediate abort. */

#ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                                 "**mpi_abort", "**mpi_abort %C %d", comm, errorcode);
    }
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
예제 #12
0
void MPID_Request_free (MPI_Request request )
{
    MPID_Device *dev = NULL;
    MPI_Request rq = request ; /* MPID_devset->req_pending; */
    int mpi_errno = MPI_SUCCESS;
    
    switch (rq->handle_type) {
    case MPIR_SEND:
	if (MPID_SendIcomplete( rq, &mpi_errno )) {
	    MPIR_FORGET_SEND( &rq->shandle );
	    MPID_Send_free( rq );
	    /* MPID_devset->req_pending = 0;*/
	    rq = 0;
	}
	break;
    case MPIR_RECV:
	if (MPID_RecvIcomplete( rq, (MPI_Status *)0, &mpi_errno )) {
	    MPID_Recv_free( rq );
	    /* MPID_devset->req_pending = 0; */
	    rq = 0;
	}
	break;
    case MPIR_PERSISTENT_SEND:
	if (rq->persistent_shandle.active) {
	    MPID_Abort( (struct MPIR_COMMUNICATOR *)0, 1, "MPI internal", 
			"Unimplemented operation - active persistent send free" );
	} else {
  	    dev = MPID_devset->dev[((rq->persistent_shandle.perm_comm)->lrank_to_grank)
				  [rq->persistent_shandle.perm_dest]];
	    if (dev->persistent_free != NULL)
	      MPID_Device_call_persistent_free (rq, dev);
	    
	    MPID_PSend_free( rq );
	}
	break;
    case MPIR_PERSISTENT_RECV:
	if (rq->persistent_rhandle.active) {
	    MPID_Abort( (struct MPIR_COMMUNICATOR *)0, 1, "MPI internal", 
			"Unimplemented operation - active persistent recv free" );
	} else {
	    if (rq->persistent_rhandle.perm_source >= 0) {
		dev = MPID_devset->dev[rq->persistent_rhandle.perm_source];
	    } else {
		/* For a single device, we can use the related function, 
		   but what to do for multiple available devices? Multiple
		   persistent initialization? */
		if (MPID_devset->ndev == 1) {
		    dev = MPID_devset->dev_list;
		}
	    }
	    
	    if (dev != NULL && dev->persistent_free != NULL)
	      MPID_Device_call_persistent_free (rq, dev);
	}
	break;
    }

    MPID_DeviceCheck( MPID_NOTBLOCKING );
    /* 
     * If we couldn't complete it, decrement it's reference count
     * and forget about it.  This requires that the device detect
     * orphaned requests when they do complete, and process them
     * independent of any wait/test.
     */
    /*if (MPID_devset->req_pending) {*/
    if (rq) {
	rq->chandle.ref_count--;
/*	PRINTF( "Setting ref count to %d for %x\n", 
		rq->chandle.ref_count, (long)rq ); */
	/* MPID_devset->req_pending = 0; */
    }
}
예제 #13
0
void MPIDI_CH3_Request_destroy(MPID_Request * req)
{
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQUEST_DESTROY);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQUEST_DESTROY);
    MPIU_DBG_MSG_P(CH3_CHANNEL,VERBOSE,
		   "freeing request, handle=0x%08x", req->handle);
    
#ifdef MPICH_DBG_OUTPUT
    /*MPIU_Assert(HANDLE_GET_MPI_KIND(req->handle) == MPID_REQUEST);*/
    if (HANDLE_GET_MPI_KIND(req->handle) != MPID_REQUEST)
    {
	int mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, 
                      FCNAME, __LINE__, MPI_ERR_OTHER, 
                      "**invalid_handle", "**invalid_handle %d", req->handle);
	MPID_Abort(MPIR_Process.comm_world, mpi_errno, -1, NULL);
    }
    /* XXX DJG FIXME should we be checking this? */
    /*MPIU_Assert(req->ref_count == 0);*/
    if (req->ref_count != 0)
    {
	int mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                       FCNAME, __LINE__, MPI_ERR_OTHER, 
              "**invalid_refcount", "**invalid_refcount %d", req->ref_count);
	MPID_Abort(MPIR_Process.comm_world, mpi_errno, -1, NULL);
    }
#endif

    /* FIXME: We need a better way to handle these so that we
       do not always need to initialize these fields and check them
       when we destroy a request */
    /* FIXME: We need a way to call these routines ONLY when the 
       related ref count has become zero. */
    if (req->comm != NULL) {
	MPIR_Comm_release(req->comm, 0);
    }

    if (req->greq_fns != NULL) {
        MPIU_Free(req->greq_fns);
    }

    if (req->dev.datatype_ptr != NULL) {
	MPID_Datatype_release(req->dev.datatype_ptr);
    }

    if (req->dev.segment_ptr != NULL) {
	MPID_Segment_free(req->dev.segment_ptr);
    }

    if (MPIDI_Request_get_srbuf_flag(req)) {
	MPIDI_CH3U_SRBuf_free(req);
    }

    if (req->dev.ext_hdr_ptr != NULL) {
        MPIU_Free(req->dev.ext_hdr_ptr);
    }

    MPIU_Handle_obj_free(&MPID_Request_mem, req);
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQUEST_DESTROY);
}
예제 #14
0
int
MPIDO_Allgatherv_simple(const void *sendbuf,
		 int sendcount,
		 MPI_Datatype sendtype,
		 void *recvbuf,
		 const int *recvcounts,
		 const int *displs,
		 MPI_Datatype recvtype,
		 MPID_Comm * comm_ptr,
                 int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Allgatherv_optimized\n");
  /* function pointer to be used to point to approperiate algorithm */
  /* Check the nature of the buffers */
  MPID_Datatype *dt_null = NULL;
  MPI_Aint send_true_lb  = 0;
  MPI_Aint recv_true_lb  = 0;
  size_t   send_size     = 0;
  size_t   recv_size     = 0;
  size_t   rcvtypelen    = 0;
  int snd_data_contig = 0, rcv_data_contig = 0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  int scount=sendcount;

  char *sbuf, *rbuf;
  pami_type_t stype = NULL, rtype;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

  int recvcontinuous=0;
  size_t totalrecvcount=0;
  int *lrecvdispls = NULL; /* possible local displs calculated for noncontinous */
  int *lrecvcounts  = NULL;/* possible local counts calculated for noncontinous */
  const int *precvdispls = displs; /* pointer to displs to use as pami parmi */
  const int *precvcounts = recvcounts; /* pointer to counts to use as pami parmi */
  int inplace = sendbuf == MPI_IN_PLACE? 1 : 0;


  volatile unsigned allgatherv_active = 1;
  int recvok=PAMI_SUCCESS, sendok=PAMI_SUCCESS;
  int tmp;
  const pami_metadata_t *my_md;


   MPIDI_Datatype_get_info(1,
                          recvtype,
                          rcv_data_contig,
                          rcvtypelen,
                          dt_null,
                          recv_true_lb);

  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHERV_INT, rcvtypelen * recvcounts[0], advisor_algorithms, 1);
     if(num_algorithms)
     {
       if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
       {
         return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
       }
       else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
       {
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
           fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
       }
     }
   }


  if(!inplace)
   {
    sendok = MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp);
    MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig, send_size, dt_null, send_true_lb);
    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_data_contig || (sendok != PAMI_SUCCESS))
   {
      stype  = PAMI_TYPE_UNSIGNED_CHAR;
      scount = send_size;
      if(!snd_data_contig)
   {
        snd_noncontig_buff = MPIU_Malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
   {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal:  Cannot allocate pack buffer");
   }
        MPIR_Localcopy(sendbuf, sendcount, sendtype,
                       snd_noncontig_buff, send_size,MPI_CHAR);
      }
    }
  }
  else
    sbuf = PAMI_IN_PLACE;

  recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp);
   rbuf = (char *)recvbuf+recv_true_lb;
  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    rtype = PAMI_TYPE_UNSIGNED_CHAR;
    totalrecvcount = recvcounts[0];
    recvcontinuous = displs[0] == 0? 1 : 0 ;
    int i;
    precvdispls = lrecvdispls = MPIU_Malloc(size*sizeof(int));
    precvcounts = lrecvcounts = MPIU_Malloc(size*sizeof(int));
    lrecvdispls[0]= 0;
    lrecvcounts[0]= rcvtypelen * recvcounts[0];
    for(i=1; i<size; ++i)
    {
      lrecvdispls[i]= rcvtypelen * totalrecvcount;
      totalrecvcount += recvcounts[i];
      if(displs[i] != (displs[i-1] + recvcounts[i-1]))
        recvcontinuous = 0;
      lrecvcounts[i]= rcvtypelen * recvcounts[i];
    }
    recv_size = rcvtypelen * totalrecvcount;
    TRACE_ERR("Pack receive rcv_contig %zu, recvok %zd, totalrecvcount %zu, recvcontinuous %zu, rcvtypelen %zu, recv_size %zu\n",
              (size_t)rcv_data_contig, (size_t)recvok, (size_t)totalrecvcount, (size_t)recvcontinuous,(size_t)rcvtypelen, (size_t)recv_size);

    rcv_noncontig_buff = MPIU_Malloc(recv_size);
    rbuf = rcv_noncontig_buff;
    if(rcv_noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                 "Fatal:  Cannot allocate pack buffer");
    }
    if(inplace)
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      MPIR_Localcopy(recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                     rcv_noncontig_buff + precvdispls[rank], precvcounts[rank],MPI_CHAR);
      scount = precvcounts[rank];
      stype   = PAMI_TYPE_UNSIGNED_CHAR;
      sbuf    = PAMI_IN_PLACE;
    }
   }


   pami_xfer_t allgatherv;
   allgatherv.cb_done = allgatherv_cb_done;
   allgatherv.cookie = (void *)&allgatherv_active;
   allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf;
   allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf;
   allgatherv.cmd.xfer_allgatherv_int.stype = stype;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
   allgatherv.cmd.xfer_allgatherv_int.rtype = rtype;
   allgatherv.cmd.xfer_allgatherv_int.stypecount = scount;
  allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) precvcounts;
  allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) precvdispls;
   allgatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHERV_INT][0][0];
   my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHERV_INT][0][0];

   TRACE_ERR("Calling allgatherv via %s()\n", MPIDI_Process.context_post.active>0?"PAMI_Collective":"PAMI_Context_post");
   MPIDI_Post_coll_t allgatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&allgatherv);

   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active);
   MPID_PROGRESS_WAIT_WHILE(allgatherv_active);

  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    if(recvcontinuous)
    {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size,MPI_CHAR,
                     recvbuf, totalrecvcount, recvtype);
    }
    else
    {
      size_t extent;
      int i;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      for(i=0; i<size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff+ precvdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, precvcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)precvdispls[i],(size_t)i,(size_t)precvcounts[i],(size_t)precvdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
      }
    }
    MPIU_Free(rcv_noncontig_buff);
  }
  if(!snd_data_contig)  MPIU_Free(snd_noncontig_buff);
  if(lrecvdispls) MPIU_Free(lrecvdispls);
  if(lrecvcounts) MPIU_Free(lrecvcounts);

   return MPI_SUCCESS;
}
예제 #15
0
int
MPIDO_Allgatherv(const void *sendbuf,
		 int sendcount,
		 MPI_Datatype sendtype,
		 void *recvbuf,
		 const int *recvcounts,
		 const int *displs,
		 MPI_Datatype recvtype,
		 MPID_Comm * comm_ptr,
                 int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Allgatherv\n");
  /* function pointer to be used to point to approperiate algorithm */

  /* Check the nature of the buffers */
  MPID_Datatype *dt_null = NULL;
  MPI_Aint send_true_lb  = 0;
  MPI_Aint recv_true_lb  = 0;
  size_t   send_size     = 0;
  size_t   recv_size     = 0;
  int config[6];
  int scount=sendcount;

  int i, rc, buffer_sum = 0;
  const int size = comm_ptr->local_size;
  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *sbuf, *rbuf;
  const int rank = comm_ptr->rank;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int queryreq = 0;

#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHERV_INT];

  pami_xfer_t allred;
  volatile unsigned allred_active = 1;
  volatile unsigned allgatherv_active = 1;
  pami_type_t stype, rtype;
  int tmp;
  const pami_metadata_t *my_md = (pami_metadata_t *)NULL;

  for(i=0;i<6;i++) config[i] = 1;

  allred.cb_done = allred_cb_done;
  allred.cookie = (void *)&allred_active;
  allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0];
  allred.cmd.xfer_allreduce.sndbuf = (void *)config;
  allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
  allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.stypecount = 6;
  allred.cmd.xfer_allreduce.rtypecount = 6;
  allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  use_alltoall = mpid->allgathervs[2];
  use_tree_reduce = mpid->allgathervs[0];
  use_bcast = mpid->allgathervs[1];
  use_pami = selected_type != MPID_COLL_USE_MPICH;
	 
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
     use_pami = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
     use_pami = 0;

   use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;

   if(!use_opt) /* back to MPICH */
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH allgatherv type %u.\n",
             selected_type);
     TRACE_ERR("Using MPICH Allgatherv\n");
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
       if(is_send_dev_buf)
       {
         scbuf = MPIU_Malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = sendbuf;
       size_t rtotal_buf;
       if(is_recv_dev_buf)
       {
         //Since displs can be non-continous, we need to calculate max buffer size 
         int highest_displs = displs[size - 1];
         int highest_recvcount = recvcounts[size - 1];
         for(i = 0; i < size; i++)
         {
           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
           {
             highest_displs = displs[i];
             highest_recvcount = recvcounts[i];
           }
         }
         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
         rcbuf = MPIU_Malloc(rtotal_buf);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rtotal_buf);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Allgatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPIU_Free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPIU_Free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
     return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
			   recvbuf, recvcounts, displs, recvtype,
                          comm_ptr, mpierrno);
   }

   MPIDI_Datatype_get_info(1,
			  recvtype,
			  config[MPID_RECV_CONTIG],
			  recv_size,
			  dt_null,
			  recv_true_lb);

   if(sendbuf == MPI_IN_PLACE)
   {
     sbuf = PAMI_IN_PLACE;
     if(unlikely(verbose))
       fprintf(stderr,"allgatherv MPI_IN_PLACE buffering\n");
     stype = rtype;
     scount = recvcounts[rank];
     send_size = recv_size * scount; 
   }
   else
   {
      MPIDI_Datatype_get_info(sendcount,
                              sendtype,
                              config[MPID_SEND_CONTIG],
                              send_size,
                              dt_null,
                              send_true_lb);
       sbuf = (char *)sendbuf+send_true_lb;
   }

   rbuf = (char *)recvbuf+recv_true_lb;

   if(use_alltoall || use_bcast || use_tree_reduce)
   {
      if (displs[0])
       config[MPID_RECV_CONTINUOUS] = 0;

      for (i = 1; i < size; i++)
      {
        buffer_sum += recvcounts[i - 1];
        if (buffer_sum != displs[i])
        {
          config[MPID_RECV_CONTINUOUS] = 0;
          break;
        }
      }

      buffer_sum += recvcounts[size - 1];

      buffer_sum *= recv_size;

      /* disable with "safe allgatherv" env var */
      if(mpid->preallreduces[MPID_ALLGATHERV_PREALLREDUCE])
      {
         MPIDI_Post_coll_t allred_post;
         MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                            MPIDI_Pami_post_wrapper, (void *)&allred);

         MPID_PROGRESS_WAIT_WHILE(allred_active);
      }

      use_tree_reduce = mpid->allgathervs[0] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
         config[MPID_RECV_CONTINUOUS] && buffer_sum % sizeof(unsigned) == 0;

      use_alltoall = mpid->allgathervs[2] &&
         config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];

      use_bcast = mpid->allgathervs[1];
   }

   if(use_pami)
   {
      pami_xfer_t allgatherv;
      allgatherv.cb_done = allgatherv_cb_done;
      allgatherv.cookie = (void *)&allgatherv_active;
      if(selected_type == MPID_COLL_OPTIMIZED)
      {
        if((mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] == 0) || 
           (mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] >= send_size))
        {		
          allgatherv.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHERV_INT][0];
          my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHERV_INT][0];
          queryreq     = mpid->must_query[PAMI_XFER_ALLGATHERV_INT][0];
        }
        else
          return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
      }
      else
      {  
        allgatherv.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHERV_INT];
        my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHERV_INT];
        queryreq     = selected_type;
      }
      
      allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf;
      allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf;

      allgatherv.cmd.xfer_allgatherv_int.stype = stype;
      allgatherv.cmd.xfer_allgatherv_int.rtype = rtype;
      allgatherv.cmd.xfer_allgatherv_int.stypecount = scount;
      allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) recvcounts;
      allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) displs;

      if(unlikely (queryreq == MPID_COLL_ALWAYS_QUERY ||
                   queryreq == MPID_COLL_CHECK_FN_REQUIRED))
      {
         metadata_result_t result = {0};
         TRACE_ERR("Querying allgatherv_int protocol %s, type was %d\n", my_md->name,
            selected_type);
         if(my_md->check_fn == NULL)
         {
           /* process metadata bits */
           if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
              result.check.unspecified = 1;
/* Can't check ranges like this.  Non-local.  Comment out for now.
          if(my_md->check_correct.values.rangeminmax)
           {
             MPI_Aint data_true_lb;
             MPID_Datatype *data_ptr;
             int data_size, data_contig;
             MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); 
             if((my_md->range_lo <= data_size) &&
                (my_md->range_hi >= data_size))
                ; 
             else
             {
                result.check.range = 1;
                if(unlikely(verbose))
                {   
                   fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                           data_size,
                           my_md->range_lo,
                           my_md->range_hi,
                           my_md->name);
                }
             }
           }
 */
         }
         else /* calling the check fn is sufficient */
           result = my_md->check_fn(&allgatherv);
         TRACE_ERR("Allgatherv bitmask: %#X\n", result.bitmask);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
         if(result.bitmask)
         {
           if(unlikely(verbose))
             fprintf(stderr,"Query failed for %s. Using MPICH allgatherv.\n", my_md->name);
           MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
           return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                                  recvbuf, recvcounts, displs, recvtype,
                                  comm_ptr, mpierrno);
         }
         if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
         { 
           comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
           int tmpmpierrno;   
           if(unlikely(verbose))
             fprintf(stderr,"Query barrier required for %s\n", my_md->name);
           MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
      }

      if(unlikely(verbose))
      {
         unsigned long long int threadID;
         MPIU_Thread_id_t tid;
         MPIU_Thread_self(&tid);
         threadID = (unsigned long long int)tid;
         fprintf(stderr,"<%llx> Using protocol %s for allgatherv on %u\n", 
                 threadID,
                 my_md->name,
              (unsigned) comm_ptr->context_id);
      }
      MPIDI_Post_coll_t allgatherv_post;
      MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state,
                         MPIDI_Pami_post_wrapper, (void *)&allgatherv);

      MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

      TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active);
      MPID_PROGRESS_WAIT_WHILE(allgatherv_active);

      return PAMI_SUCCESS;
   }

   /* TODO These need ordered in speed-order */
   if(use_tree_reduce)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using tree reduce allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_allreduce(sendbuf, sendcount, sendtype,
             recvbuf, recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLREDUCE");
     return rc;
   }

   if(use_bcast)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using bcast allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_bcast(sendbuf, sendcount, sendtype,
             recvbuf, recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_BCAST");
     return rc;
   }

   if(use_alltoall)
   {
     if(unlikely(verbose))
       fprintf(stderr,"Using alltoall allgatherv type %u.\n",
               selected_type);
     rc = MPIDO_Allgatherv_alltoall(sendbuf, sendcount, sendtype,
             recvbuf, (int *)recvcounts, buffer_sum, displs, recvtype,
             send_true_lb, recv_true_lb, send_size, recv_size,
             comm_ptr, mpierrno);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLTOALL");
     return rc;
   }

   if(unlikely(verbose))
      fprintf(stderr,"Using MPICH allgatherv type %u.\n",
            selected_type);
   TRACE_ERR("Using MPICH for Allgatherv\n");
   MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
   return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                       recvbuf, recvcounts, displs, recvtype,
                       comm_ptr, mpierrno);
}