Example No. 1
/** One-sided accumulate operation with typed arguments.  Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] dst_type  MPI datatype of the destination elements
  * @param[in] proc      Absolute process id of target process
  * @return              0 on success, non-zero on failure
  */
int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {

  int        grp_proc;
  gmr_size_t disp;
  MPI_Aint lb, extent;

  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM) 
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window);

  return 0;
}
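For context, the same lock / accumulate / unlock idiom can be shown with nothing but standard MPI RMA calls. The following is a minimal stand-alone sketch (not ARMCI-MPI code; names such as `result` and `target` are illustrative): every rank adds its contribution into rank 0's window with MPI_SUM, which is what the call above does against mreg->window at the computed displacement.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    int value  = rank + 1;   /* this rank's contribution */
    int result = 0;          /* window memory exposed by every rank */
    int target = 0;          /* all contributions accumulate at rank 0 */

    MPI_Win win;
    MPI_Win_create(&result, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /* displacement 0 from the window base, analogous to disp above */
    MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
    MPI_Accumulate(&value, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_SUM, win);
    MPI_Win_unlock(target, win);

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == target) {
        /* lock our own window to make the accumulated value visible */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
        printf("sum = %d (expected %d)\n", result, ranks * (ranks + 1) / 2);
        MPI_Win_unlock(rank, win);
    }

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}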
Example No. 2
/* execute a purely local sort */
int DTCMP_Sort_local_mergesort(
  const void* inbuf, 
  void* outbuf,
  int count,
  MPI_Datatype key,
  MPI_Datatype keysat,
  DTCMP_Op cmp,
  DTCMP_Flags hints)
{
  int rc = DTCMP_SUCCESS;

  MPI_Aint lb, extent;
  MPI_Type_get_true_extent(keysat, &lb, &extent);

  if (count > 0 && extent > 0) {
    /* copy data to outbuf if it's not already there */
    if (inbuf != DTCMP_IN_PLACE) {
      DTCMP_Memcpy(outbuf, count, keysat, inbuf, count, keysat);
    }

    /* allocate scratch space */
    void* scratch = dtcmp_malloc(count * extent, 0, __FILE__, __LINE__);

    /* execute our merge sort */
    size_t size = (size_t) extent;
    rc = dtcmp_sort_local_mergesort_scratch(outbuf, scratch, count, size, cmp, hints);

    /* free scratch space */
    dtcmp_free(&scratch);
  }

  return rc;
}
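A possible call site, shown as a sketch rather than library documentation: DTCMP_IN_PLACE and the predefined DTCMP_OP_INT_ASCEND / DTCMP_FLAG_NONE values come from the DTCMP API, but this assumes MPI_Init and DTCMP_Init have already run and that this routine is reachable from dtcmp.h.

#include <mpi.h>
#include "dtcmp.h"

/* Sketch: sort four ints that already live in the output buffer.
 * Passing DTCMP_IN_PLACE as inbuf skips the initial copy above. */
int sort_four_ints(void)
{
  int keys[4] = { 3, 1, 4, 2 };
  return DTCMP_Sort_local_mergesort(
    DTCMP_IN_PLACE, keys, 4,
    MPI_INT, MPI_INT, DTCMP_OP_INT_ASCEND, DTCMP_FLAG_NONE);
}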
Example No. 3
/* Extract an m x n submatrix within an m x N matrix and transpose it.
   Assume storage by rows; the defined datatype accesses by columns */
MPI_Datatype transpose_type(int N, int m, int n, MPI_Datatype type)
/* computes a datatype for the transpose of an mxn matrix 
   with entries of type type */
{
  MPI_Datatype subrow, subrow1, submatrix;
  MPI_Aint lb, extent;
  
  MPI_Type_vector(m, 1, N, type, &subrow);
  MPI_Type_get_extent(type, &lb, &extent);
  MPI_Type_create_resized(subrow, 0, extent, &subrow1);
  MPI_Type_contiguous(n, subrow1, &submatrix); 
  MPI_Type_commit(&submatrix);
  MPI_Type_free( &subrow );
  MPI_Type_free( &subrow1 );

  /* Add a consistency test: the size of submatrix should be
     n * m * sizeof(type) and the extent should be ((m-1)*N+n) * sizeof(type) */
  {
      int      tsize;
      MPI_Aint textent, llb;
      MPI_Type_size( type, &tsize );
      MPI_Type_get_true_extent( submatrix, &llb, &textent );
      
      if (textent != tsize * (N * (m-1)+n)) {
	  fprintf( stderr, "Transpose Submatrix extent is %ld, expected %ld (%d,%d,%d)\n",
		   (long)textent, (long)(tsize * (N * (m-1)+n)), N, n, m );
      }
  }

  return(submatrix);
}
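As a usage sketch (not part of the original test program): sending one element of the returned type from the start of a row-major m x N matrix transmits the leading m x n block column by column, so a contiguous receive of n*m elements yields that block transposed.

/* Sketch: obtain the transpose of the leading m x n block of a row-major
 * m x N matrix of doubles via a send-to-self with the derived type. */
void transpose_block(double *a, double *at, int N, int m, int n, MPI_Comm comm)
{
  int rank;
  MPI_Comm_rank(comm, &rank);

  MPI_Datatype t = transpose_type(N, m, n, MPI_DOUBLE);
  MPI_Sendrecv(a, 1, t, rank, 0,               /* send the block by columns */
               at, n * m, MPI_DOUBLE, rank, 0, /* receive it contiguously   */
               comm, MPI_STATUS_IGNORE);
  MPI_Type_free(&t);
  /* at now holds the n x m transpose in row-major order */
}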
Example No. 4
    void print_pictogram(std::ostream& os, unsigned int width = 8) {
        std::pair<MPI_Aint, MPI_Aint> ext;
        std::pair<MPI_Aint, MPI_Aint> true_ext;
        MPI_Type_get_extent(type, &ext.first, &ext.second);
        MPI_Type_get_true_extent(type, &true_ext.first, &true_ext.second);

        if (ext.first != 0) {
            os << "Pictogram not available for types with lb != 0" << std::endl;
            return;
        }

        MPI_Aint ex = ext.second;

        if (ext.second < true_ext.second) {
            os << "Pictogram not available for types with ub < true_ub" << std::endl;
            return;
        }

        // use single letter type
        unsigned int pos = 0;
        os << "[";
        for (unsigned int i = 0; i < width; ++i) {
            os << "-";
        }
        os << "]" << std::endl << "[";
        for (auto it = m.begin(); it != m.end(); ++it) {
            while (pos < it->first) {
                if (pos % width == 0 && pos != 0)
                    os << "]" << std::endl << "[";
                os << " ";
                ++pos;
            }
            // get type size and type char
            char type_char = builtin_typename_map::get_typeid_name(it->second)[0];
            int size;
            MPI_Type_size(it->second, &size);
            // print the character `size` times
            for (int i = 0; i < size; ++i) {
                if (pos % width == 0 && pos != 0)
                    os << "]" << std::endl << "[";
                os << type_char;
                ++pos;
            }
        }

        while (pos < ex) {
            if (pos % width == 0 && pos != 0)
                os << "]" << std::endl << "[";
            os << " ";
            ++pos;
        }

        os << "]" << std::endl << "[";
        for (unsigned int i = 0; i < width; ++i) {
            os << "-";
        }
        os << "]" << std::endl;
    }
Example No. 5
void mpi_type_get_true_extent_f(MPI_Fint *datatype, MPI_Aint *true_lb, MPI_Aint *true_extent, MPI_Fint *ierr)
{
    MPI_Datatype c_type = MPI_Type_f2c(*datatype);

    *ierr = OMPI_INT_2_FINT(MPI_Type_get_true_extent(c_type,
						     true_lb,
						     true_extent));

}
Example No. 6
JNIEXPORT void JNICALL Java_mpi_Datatype_getTrueLbExtent(
        JNIEnv *env, jobject jthis, jlong type, jintArray jLbExt)
{
    MPI_Aint lb, extent;
    int rc = MPI_Type_get_true_extent((MPI_Datatype)type, &lb, &extent);
    if(ompi_java_exceptionCheck(env, rc))
        return;

    jint *lbExt = (*env)->GetIntArrayElements(env, jLbExt, NULL);
    lbExt[0] = (jint)lb;
    lbExt[1] = (jint)extent;
    (*env)->ReleaseIntArrayElements(env, jLbExt, lbExt, 0);
}
Example No. 7
int zmpil_simple_create_derived(zmpil_simple_t *mpil, MPI_Datatype type, int count) /* zmpi_func zmpil_simple_create_derived */
{
/*  mpil->type = type;*/

  Z_TRACE("zmpil_simple_create_derived");

  MPI_Type_get_true_extent(type, &mpil->true_lb, &mpil->true_extent);

  mpil->true_extent *= count;

  Z_ASSERT(mpil->true_lb == 0);

  return 0;
}
Example No. 8
int MPIU_datatype_full_size(MPI_Datatype datatype, MPI_Aint *size)
{
    int error_code = MPI_SUCCESS;
    MPI_Aint extent = 0;
    MPI_Aint true_extent = 0;
    MPI_Aint true_lb = 0;

    error_code = MPI_Type_get_true_extent(datatype, &true_lb, &true_extent);
    if (error_code != MPI_SUCCESS)
        goto fn_exit;

    *size = true_extent;
fn_exit:
    return error_code;
}
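The reason true extent is the right notion of "full size" here: a resized (padded) datatype reports an extent that includes padding, while the true extent covers only the bytes its data actually spans. A stand-alone illustration, not part of the original source:

#include <mpi.h>
#include <stdio.h>

/* Sketch: a double padded out to a 16-byte extent. */
static void show_full_size(void)
{
    MPI_Datatype padded;
    MPI_Type_create_resized(MPI_DOUBLE, 0, 16, &padded);
    MPI_Type_commit(&padded);

    MPI_Aint lb, extent, true_lb, true_extent;
    MPI_Type_get_extent(padded, &lb, &extent);                 /* 0, 16 */
    MPI_Type_get_true_extent(padded, &true_lb, &true_extent);  /* 0, 8  */
    printf("extent=%ld true_extent=%ld\n", (long)extent, (long)true_extent);

    MPI_Type_free(&padded);
}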
Example No. 9
int DTCMP_Sort_allgather(
  const void* inbuf,
  void* outbuf,
  int count,
  MPI_Datatype key,
  MPI_Datatype keysat,
  DTCMP_Op cmp,
  DTCMP_Flags hints,
  MPI_Comm comm)
{
  /* get our rank and the number of ranks in this comm */
  int rank, ranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &ranks);

  /* compute total number of items that we'll collect */
  int total_count = count * ranks;

  /* get true extent of keysat type */
  MPI_Aint true_lb, true_extent;
  MPI_Type_get_true_extent(keysat, &true_lb, &true_extent);

  /* allocate space to hold all items from all procs */
  size_t buf_size = total_count * true_extent;
  if (buf_size > 0) {
    char* buf = (char*) dtcmp_malloc(buf_size, 0, __FILE__, __LINE__); 

    /* gather all items, send from outbuf if IN_PLACE is specified */
    void* sendbuf = (void*) inbuf;
    if (inbuf == DTCMP_IN_PLACE) {
      sendbuf = outbuf;
    }
    char* recvbuf = buf - true_lb;
    MPI_Allgather(sendbuf, count, keysat, (void*)recvbuf, count, keysat, comm);

    /* sort items with local sort */
    DTCMP_Sort_local(DTCMP_IN_PLACE, recvbuf, total_count, key, keysat, cmp, hints);

    /* copy our items into outbuf */
    char* mybuf = recvbuf + count * rank * true_extent;
    DTCMP_Memcpy(outbuf, count, keysat, (void*)mybuf, count, keysat);

    /* free off our temporary buffers */
    dtcmp_free(&buf);
  }

  return DTCMP_SUCCESS;
}
Example No. 10
void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag)
{
    MPIR_Datatype_iscontig(datatype, flag);

    /* if it is MPICH2 and the datatype is reported as contiguous,
       check if the true_lb is non-zero, and if so, mark the 
       datatype as noncontiguous */
#ifdef MPICH2
    if (*flag) {
        MPI_Aint true_extent, true_lb;
        
        MPI_Type_get_true_extent(datatype, &true_lb, &true_extent);

        if (true_lb > 0)
            *flag = 0;
    }
#endif
}
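A type that triggers the extra check above can be built with a nonzero displacement: the data is still one contiguous block, but it does not start at the buffer address. A hypothetical illustration, not ROMIO code:

/* Sketch: a single int located one int past the buffer start, so
 * true_lb == sizeof(int) even though the data itself has no holes. */
static void make_offset_int(MPI_Datatype *newtype)
{
    int displ = 1;  /* displacement in units of MPI_INT */
    MPI_Type_create_indexed_block(1, 1, &displ, MPI_INT, newtype);
    MPI_Type_commit(newtype);

    MPI_Aint true_lb, true_extent;
    MPI_Type_get_true_extent(*newtype, &true_lb, &true_extent);
    /* true_lb == sizeof(int), true_extent == sizeof(int), so the
     * MPICH2 branch above would clear the contiguity flag */
}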
Example No. 11
/*@C
   PetscSFGetWindow - Get a window for use with a given data type

   Collective on PetscSF

   Input Arguments:
+  sf - star forest
.  unit - data type
.  array - array to be sent
.  epoch - PETSC_TRUE to acquire the window and start an epoch, PETSC_FALSE to just acquire the window
.  fenceassert - assert parameter for call to MPI_Win_fence(), if PETSCSF_WINDOW_SYNC_FENCE
.  postassert - assert parameter for call to MPI_Win_post(), if PETSCSF_WINDOW_SYNC_ACTIVE
-  startassert - assert parameter for call to MPI_Win_start(), if PETSCSF_WINDOW_SYNC_ACTIVE

   Output Arguments:
.  win - window

   Level: developer

   Developer Notes:
   This currently always creates a new window. This is more synchronous than necessary. An alternative is to try to
   reuse an existing window created with the same array. Another alternative is to maintain a cache of windows and reuse
   whichever one is available, by copying the array into it if necessary.

.seealso: PetscSFGetRanks(), PetscSFWindowGetDataTypes()
@*/
static PetscErrorCode PetscSFGetWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscBool epoch,PetscMPIInt fenceassert,PetscMPIInt postassert,PetscMPIInt startassert,MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  PetscErrorCode ierr;
  MPI_Aint       lb,lb_true,bytes,bytes_true;
  PetscSFWinLink link;

  PetscFunctionBegin;
  ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr);
  ierr = MPI_Type_get_true_extent(unit,&lb_true,&bytes_true);CHKERRQ(ierr);
  if (lb != 0 || lb_true != 0) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with nonzero lower bound, write [email protected] if you want this feature");
  if (bytes != bytes_true) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with modified extent, write [email protected] if you want this feature");
  ierr = PetscMalloc(sizeof(*link),&link);CHKERRQ(ierr);

  link->bytes = bytes;
  link->addr  = array;

  ierr = MPI_Win_create(array,(MPI_Aint)bytes*sf->nroots,(PetscMPIInt)bytes,MPI_INFO_NULL,PetscObjectComm((PetscObject)sf),&link->win);CHKERRQ(ierr);

  link->epoch = epoch;
  link->next  = w->wins;
  link->inuse = PETSC_TRUE;
  w->wins     = link;
  *win        = link->win;

  if (epoch) {
    switch (w->sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      ierr = MPI_Win_fence(fenceassert,*win);CHKERRQ(ierr);
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group ingroup,outgroup;
      ierr = PetscSFGetGroups(sf,&ingroup,&outgroup);CHKERRQ(ierr);
      ierr = MPI_Win_post(ingroup,postassert,*win);CHKERRQ(ierr);
      ierr = MPI_Win_start(outgroup,startassert,*win);CHKERRQ(ierr);
    } break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
    }
  }
  PetscFunctionReturn(0);
}
Example No. 12
/* determine whether type is contiguous, has a true lower bound of 0,
 * and extent == true_extent */
static int dtcmp_type_is_valid(MPI_Datatype type)
{
    /* get (user-defined) lower bound and extent */
    MPI_Aint lb, extent;
    MPI_Type_get_extent(type, &lb, &extent);

    /* get true lower bound and extent */
    MPI_Aint true_lb, true_extent;
    MPI_Type_get_true_extent(type, &true_lb, &true_extent);

    /* get size of type */
    int size;
    MPI_Type_size(type, &size);

    /* check that type is contiguous (size == true_extent ==> no holes) */
    if (size != true_extent) {
        return 0;
    }

    /* check that lower bounds are 0 */
    if (lb != 0 || true_lb != 0) {
        return 0;
    }

    /* check that extent == true_extent ==> no funny business if we
     * concatenate a series of these types */
    if (extent != true_extent) {
        return 0;
    }

    /* check that extent is positive */
    if (extent <= 0) {
        return 0;
    }

    return 1;
}
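Concretely (a sketch, not part of DTCMP): a contiguous pair of ints passes every check, while a strided vector fails size == true_extent because of the hole between its two blocks (assuming a 4-byte int).

/* Sketch of types the checks above would accept and reject. */
static void valid_type_examples(void)
{
    /* two contiguous ints: size == extent == true_extent == 8, lb == 0 */
    MPI_Datatype pair;
    MPI_Type_contiguous(2, MPI_INT, &pair);
    MPI_Type_commit(&pair);

    /* two ints with a stride of 4 ints: size == 8, true_extent == 20 (hole) */
    MPI_Datatype strided;
    MPI_Type_vector(2, 1, 4, MPI_INT, &strided);
    MPI_Type_commit(&strided);

    MPI_Type_free(&strided);
    MPI_Type_free(&pair);
}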
Example No. 13
/* check whether all items in buf are already in sorted order */
int DTCMP_Is_sorted(
  const void* buf,
  int count,
  MPI_Datatype key,
  MPI_Datatype keysat,
  DTCMP_Op cmp,
  DTCMP_Flags hints,
  MPI_Comm comm,
  int* flag)
{
  int rc = DTCMP_SUCCESS;

  /* assume that items are globally sorted,
   * we'll set this to 0 if we find otherwise */
  int sorted = 1;

  /* get our rank and the number of ranks in the communicator */
  int rank, ranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &ranks);

  /* first, step through and check that all of our local items are in order */
  DTCMP_Is_sorted_local(buf, count, key, keysat, cmp, hints, &sorted);

  /* bail out at this point if ranks == 1 */
  if (ranks <= 1) {
    *flag = sorted;
    return DTCMP_SUCCESS;
  }

  /* get extent of keysat */
  MPI_Aint lb, extent;
  MPI_Type_get_extent(keysat, &lb, &extent);

  /* get true extent of key */
  MPI_Aint key_true_lb, key_true_extent;
  MPI_Type_get_true_extent(key, &key_true_lb, &key_true_extent);

  /* TODO: if we know that each proc has an item,
   * we could just do a single pt2pt send to the rank one higher,
   * compare, then allreduce, and thereby avoid the type/op creation
   * and scan that follows */

  /* allocate type for scan, one int to say whether key is valid,
   * and our largest key */
  size_t item_size = sizeof(int) + key_true_extent;
  char* sendbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__);
  char* recvbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__);

  /* copy our largest item into our send buffer,
   * set valid flag to 1 if we have a value */
  int*  valid = (int*) sendbuf;
  void* value = (void*) (sendbuf + sizeof(int));
  if (count > 0) {
    *valid = 1;

    /* get pointer to largest element in our buffer,
     * and copy it to our send buffer */
    const void* lastitem = (const void*) ((const char*)buf + (count - 1) * extent);
    DTCMP_Memcpy(value, 1, key, lastitem, 1, key);
  } else {
    /* we don't have any items, so set valid flag to 0 */
    *valid = 0;
  }

  /* create and commit type that consists of leading int followed by key */
  MPI_Datatype validtype;
  dtcmp_type_concat2(MPI_INT, key, &validtype);

  /* create user-defined reduction operation to copy key if its valid */
  MPI_Op validop;
  MPI_Op_create(copy_key_if_valid, 0, &validop);

  /* execute scan to get key from next process to our left (that has an item) */
  MPI_Exscan(sendbuf, recvbuf, 1, validtype, validop, comm);

  /* free off our user-defined reduction op and datatype */
  MPI_Op_free(&validop);
  MPI_Type_free(&validtype);

  /* compare our smallest item to the received item */
  if (count > 0 && rank > 0) {
    int recvvalid = *(int*) recvbuf;
    if (recvvalid) {
      const void* recvkey = (const void*) (recvbuf + sizeof(int));
      if (dtcmp_op_eval(recvkey, buf, cmp) > 0) {
        sorted = 0;
      }
    }
  }

  /* allreduce to determine whether all items are in order */
  int all_sorted;
  MPI_Allreduce(&sorted, &all_sorted, 1, MPI_INT, MPI_LAND, comm);

  /* free the scratch space */
  dtcmp_free(&recvbuf);
  dtcmp_free(&sendbuf);

  /* set caller's output flag and return */
  *flag = all_sorted;
  return rc;
}
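The TODO above sketches a cheaper variant for the case where every rank is known to hold at least one item: send the largest key to the right neighbor, compare the key received from the left neighbor against the smallest local key, then combine with an allreduce. Below is a sketch of that variant for plain int keys (assumes count >= 1 on every rank; local order is checked separately, as in the routine above).

/* Sketch of the point-to-point boundary check mentioned in the TODO. */
static int boundaries_sorted_int(const int *keys, int count, MPI_Comm comm)
{
  int rank, ranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &ranks);

  int right = (rank + 1 < ranks) ? rank + 1 : MPI_PROC_NULL;
  int left  = (rank > 0)         ? rank - 1 : MPI_PROC_NULL;

  int left_max = 0;
  MPI_Sendrecv(&keys[count - 1], 1, MPI_INT, right, 0,
               &left_max,        1, MPI_INT, left,  0,
               comm, MPI_STATUS_IGNORE);

  /* rank 0 has no left neighbor; otherwise its boundary must be ordered */
  int sorted = (rank == 0 || left_max <= keys[0]) ? 1 : 0;

  int all_sorted;
  MPI_Allreduce(&sorted, &all_sorted, 1, MPI_INT, MPI_LAND, comm);
  return all_sorted;
}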
Example No. 14
int MPICH_AlltoAll_short( 
						 void *sendbuf, 
						 int sendcount, 
						 MPI_Datatype sendtype, 
						 void *recvbuf, 
						 int recvcount, 
						 MPI_Datatype recvtype, 
						 MPI_Comm comm )
{

  int          comm_size, i, pof2;
  MPI_Aint     sendtype_extent, recvtype_extent;
 
  int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes;
  MPI_Status status;
  void *tmp_buf;
  int sendtype_size, pack_size, block, position, *displs, count;

  MPI_Datatype newtype;
  MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;


  if (sendcount == 0) return MPI_SUCCESS;
  
  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &comm_size);
    
  /* Get extent of send and recv types */
  MPID_Datatype_get_extent_macro(recvtype, recvtype_extent);
  MPID_Datatype_get_extent_macro(sendtype, sendtype_extent);

  MPID_Datatype_get_size_macro(sendtype, sendtype_size);
  nbytes = sendtype_size * sendcount;
    
  /* use the indexing algorithm by Jehoshua Bruck et al,
   * IEEE TPDS, Nov. 97 */ 

  /* allocate temporary buffer */
  MPI_Pack_size(recvcount*comm_size, recvtype, comm, &pack_size);
  tmp_buf = malloc(pack_size);
  CkAssert(tmp_buf);

  /* Do Phase 1 of the algorithm. Shift the data blocks on process i
   * upwards by a distance of i blocks. Store the result in recvbuf. */
  MPICH_Localcopy((char *) sendbuf + rank*sendcount*sendtype_extent, 
				  (comm_size - rank)*sendcount, sendtype, recvbuf, 
				  (comm_size - rank)*recvcount, recvtype);
	    
  MPICH_Localcopy(sendbuf, rank*sendcount, sendtype, 
				  (char *) recvbuf + (comm_size-rank)*recvcount*recvtype_extent, 
				  rank*recvcount, recvtype);
	    			
  /* Input data is now stored in recvbuf with datatype recvtype */

  /* Now do Phase 2, the communication phase. It takes
	 ceiling(lg p) steps. In each step i, each process sends to rank+2^i
	 and receives from rank-2^i, and exchanges all data blocks
	 whose ith bit is 1. */

  /* allocate displacements array for indexed datatype used in
	 communication */

  displs = (int*)malloc(comm_size * sizeof(int));
  CkAssert(displs);


  pof2 = 1;
  while (pof2 < comm_size) {
	dst = (rank + pof2) % comm_size;
	src = (rank - pof2 + comm_size) % comm_size;

	/* Exchange all data blocks whose ith bit is 1 */
	/* Create an indexed datatype for the purpose */

	count = 0;
	for (block=1; block<comm_size; block++) {
	  if (block & pof2) {
		displs[count] = block * recvcount;
		count++;
	  }
	}

	mpi_errno = MPI_Type_create_indexed_block(count, recvcount, displs, recvtype, &newtype);

	if (mpi_errno)
	  return mpi_errno;

	mpi_errno = MPI_Type_commit(&newtype);

	if (mpi_errno)
	  return mpi_errno;
	    
	position = 0;
	mpi_errno = MPI_Pack(recvbuf, 1, newtype, tmp_buf, pack_size, 
						  &position, comm);

	mpi_errno = AMPI_Sendrecv(tmp_buf, position, MPI_PACKED, dst,
							  MPI_ATA_TAG, recvbuf, 1, newtype,
							  src, MPI_ATA_TAG, comm,
							  MPI_STATUS_IGNORE);
	    
	if (mpi_errno)
	  return mpi_errno;
	    

	mpi_errno = MPI_Type_free(&newtype);
	   
	if (mpi_errno)
	  return mpi_errno;

	pof2 *= 2;
  }

  free(displs);
  free(tmp_buf);

  /* Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need
   * a temporary buffer of the same size as recvbuf. */
        
  /* get true extent of recvtype */
  mpi_errno = MPI_Type_get_true_extent(recvtype, &recvtype_true_lb,
										&recvtype_true_extent);  

  if (mpi_errno)
	return mpi_errno;

  recvbuf_extent = recvcount * comm_size *
	(MAX(recvtype_true_extent, recvtype_extent));
  tmp_buf = malloc(recvbuf_extent);
  CkAssert(tmp_buf);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);

  MPICH_Localcopy((char *) recvbuf + (rank+1)*recvcount*recvtype_extent, 
				  (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, 
				  (comm_size - rank - 1)*recvcount, recvtype);
			
  MPICH_Localcopy(recvbuf, (rank+1)*recvcount, recvtype, 
				  (char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent, 
				  (rank+1)*recvcount, recvtype);
	
        
  /* Blocks are in the reverse order now (comm_size-1 to 0). 
   * Reorder them to (0 to comm_size-1) and store them in recvbuf. */

  for (i=0; i<comm_size; i++) 
	MPICH_Localcopy((char *) tmp_buf + i*recvcount*recvtype_extent,
					recvcount, recvtype, 
					(char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent, 
					recvcount, recvtype); 

  free((char*)tmp_buf + recvtype_true_lb);

  return MPI_SUCCESS;
}
Example No. 15
int DTCMP_Sortz(
    const void* inbuf,
    int count,
    void** outbuf,
    int* outcount,
    MPI_Datatype key,
    MPI_Datatype keysat,
    DTCMP_Op cmp,
    DTCMP_Flags hints,
    MPI_Comm comm,
    DTCMP_Handle* handle)
{
    /* check that we're initialized */
    if (dtcmp_init_count <= 0) {
        return DTCMP_FAILURE;
    }

    /* check parameters */
    if (count < 0) {
        return DTCMP_FAILURE;
    }
    if (outbuf == NULL || outcount == NULL) {
        return DTCMP_FAILURE;
    }
    if (! dtcmp_type_is_valid(key)) {
        return DTCMP_FAILURE;
    }
    if (! dtcmp_type_is_valid(keysat)) {
        return DTCMP_FAILURE;
    }

    /* execute allreduce to compute min/max counts per process,
     * and sum of all elements */
    uint64_t min, max, sum;
    dtcmp_get_uint64t_min_max_sum(count, &min, &max, &sum, comm);

    /* nothing to do if the total element count is 0 */
    if (sum == 0) {
        dtcmp_handle_alloc_single(0, outbuf, handle);
        *outcount = 0;
        return DTCMP_SUCCESS;
    }

    /* can't call sample sort with a single process right now */
    int ranks;
    MPI_Comm_size(comm, &ranks);

    /* for now, we can only use sample sort if min==max and ranks > 1*/
    if (min == max && ranks > 1) {
        /* TODO: if number of elements per process is small,
         * call bitonic sort */
        return DTCMP_Sortz_samplesort(
                   inbuf, count, outbuf, outcount,
                   key, keysat, cmp, hints, comm, handle
               );
    }

    /* otherwise, force things into a Sortv but where we allocate and
     * return memory with a handle */
    MPI_Aint keysat_true_lb, keysat_true_extent;
    MPI_Type_get_true_extent(keysat, &keysat_true_lb, &keysat_true_extent);
    size_t outbuf_size = count * keysat_true_extent;
    dtcmp_handle_alloc_single(outbuf_size, outbuf, handle);
    DTCMP_Sortv(inbuf, *outbuf, count, key, keysat, cmp, hints, comm);
    *outcount = count;

    return DTCMP_SUCCESS;
}