Example #1
#include "mpi.h"
#include <stdlib.h>
#include "mpitest.h"   /* MTest_* helpers come from the MPICH test harness */

/* uop, matSize, initMat and isShiftLeft are defined elsewhere in the test
   file and are not part of this excerpt. */
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rank, size, root;
    int minsize = 2, count; 
    MPI_Comm      comm;
    int *buf, *bufout;
    MPI_Op op;
    MPI_Datatype mattype;

    MTest_Init( &argc, &argv );

    MPI_Op_create( uop, 0, &op );
    
    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
	if (comm == MPI_COMM_NULL) continue;

	MPI_Comm_size( comm, &size );
	MPI_Comm_rank( comm, &rank );

	matSize = size;  /* used by the user-defined operation */
	/* Only one matrix for now */
	count = 1;

	/* A single matrix, the size of the communicator */
	MPI_Type_contiguous( size*size, MPI_INT, &mattype );
	MPI_Type_commit( &mattype );
	
	buf = (int *)malloc( count * size * size * sizeof(int) );
	if (!buf) {
          MPI_Abort( MPI_COMM_WORLD, 1 );
          exit(1);
        }
	bufout = (int *)malloc( count * size * size * sizeof(int) );
	if (!bufout) {
          MPI_Abort( MPI_COMM_WORLD, 1 );
          exit(1);
        }

	for (root = 0; root < size; root ++) {
	    initMat( comm, buf );
	    MPI_Reduce( buf, bufout, count, mattype, op, root, comm );
	    if (rank == root) {
		errs += isShiftLeft( comm, bufout );
	    }

	    /* Try the same test, but using MPI_IN_PLACE */
	    initMat( comm, bufout );
	    if (rank == root) {
		MPI_Reduce( MPI_IN_PLACE, bufout, count, mattype, op, root, comm );
	    }
	    else {
		MPI_Reduce( bufout, NULL, count, mattype, op, root, comm );
	    }
	    if (rank == root) {
		errs += isShiftLeft( comm, bufout );
	    }

#if MTEST_HAVE_MIN_MPI_VERSION(2,2)
            /* Try one more time without IN_PLACE to make sure we check
             * aliasing correctly */
            if (rank == root) {
                MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
                if (MPI_SUCCESS == MPI_Reduce( bufout, bufout, count, mattype, op, root, comm ))
                    errs++;
            }
#endif
	}

	free( buf );
	free( bufout );
	
	MPI_Type_free( &mattype );

	MTestFreeComm( &comm );
    }

    MPI_Op_free( &op );

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
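
The helpers used above (uop, matSize, initMat, isShiftLeft) are defined elsewhere in the test file and are not shown in this excerpt. Any function passed to MPI_Op_create must match the MPI_User_function signature; the sketch below illustrates that signature with a simple element-wise integer sum (it is not the test's actual matrix operation).

/* Minimal sketch of a user-defined reduction for MPI_Op_create.
 * Combines invec into inoutvec, element by element. */
static void my_int_sum( void *invec, void *inoutvec, int *len, MPI_Datatype *datatype )
{
    int i;
    int *in    = (int *)invec;
    int *inout = (int *)inoutvec;
    for (i = 0; i < *len; i++)
        inout[i] += in[i];
}

/* Registered as a commutative operation with:
 *     MPI_Op_create( my_int_sum, 1, &op );
 */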
Example #2
File: pinit.c  Project: hansec/petsc
/*@C
   PetscInitialize - Initializes the PETSc database and MPI.
   PetscInitialize() calls MPI_Init() if that has yet to be called,
   so this routine should always be called near the beginning of
   your program -- usually the very first line!

   Collective on MPI_COMM_WORLD or PETSC_COMM_WORLD if it has been set

   Input Parameters:
+  argc - count of number of command line arguments
.  args - the command line arguments
.  file - [optional] PETSc database file; ~username/.petscrc and .petscrc are also checked. Use NULL to not check for
          a code-specific file. Use -skip_petscrc in the code-specific file to skip the .petscrc files
-  help - [optional] Help message to print, use NULL for no message

   If you wish PETSc code to run ONLY on a subcommunicator of MPI_COMM_WORLD, create that
   communicator first and assign it to PETSC_COMM_WORLD BEFORE calling PetscInitialize(). For example, in a
   four-process job where two processes will run PETSc (calling PetscInitialize() and PetscFinalize()) and two will not,
   do this. If ALL processes in the job are using PetscInitialize() and PetscFinalize() then you don't need to do this, even
   if different subcommunicators of the job are doing different things with PETSc.

   Options Database Keys:
+  -start_in_debugger [noxterm,dbx,xdb,gdb,...] - Starts program in debugger
.  -on_error_attach_debugger [noxterm,dbx,xdb,gdb,...] - Starts debugger when error detected
.  -on_error_emacs <machinename> - Causes emacsclient to jump to error file
.  -on_error_abort - Calls abort() when error detected (no traceback)
.  -on_error_mpiabort - Calls MPI_Abort() when error detected
.  -error_output_stderr - Prints error messages to stderr instead of the default stdout
.  -error_output_none - Does not print the error messages (but handles errors in the same way as if this was not called)
.  -debugger_nodes [node1,node2,...] - Indicates nodes to start in debugger
.  -debugger_pause [sleeptime] (in seconds) - Pauses debugger
.  -stop_for_debugger - Print message on how to attach debugger manually to
                        process and wait (-debugger_pause) seconds for attachment
.  -malloc - Indicates use of PETSc error-checking malloc (on by default for debug version of libraries)
.  -malloc no - Indicates not to use error-checking malloc
.  -malloc_debug - check for memory corruption at EVERY malloc or free
.  -malloc_test - like -malloc_dump -malloc_debug, but only active for debugging builds
.  -fp_trap - Stops on floating point exceptions (Note that on the
              IBM RS6000 this slows code by at least a factor of 10.)
.  -no_signal_handler - Indicates not to trap error signals
.  -shared_tmp - indicates /tmp directory is shared by all processors
.  -not_shared_tmp - each processor has own /tmp
.  -tmp - alternative name of /tmp directory
.  -get_total_flops - returns total flops done by all processors
.  -memory_info - Print memory usage at end of run
-  -server <port> - start PETSc webserver (default port is 8080)

   Options Database Keys for Profiling:
   See the <a href="../../docs/manual.pdf#nameddest=ch_profiling">profiling chapter of the users manual</a> for details.
+  -info <optional filename> - Prints verbose information to the screen
.  -info_exclude <null,vec,mat,pc,ksp,snes,ts> - Excludes some of the verbose messages
.  -log_sync - Log the synchronization in scatters, inner products and norms
.  -log_trace [filename] - Print traces of all PETSc calls to the screen (useful to determine where a program
        hangs without running in the debugger).  See PetscLogTraceBegin().
.  -log_summary [filename] - Prints summary of flop and timing information to screen. If the filename is specified the
        summary is written to the file.  See PetscLogView().
.  -log_summary_python [filename] - Prints data on flop and timing usage to a file or screen. See PetscLogPrintSViewPython().
.  -log_all [filename] - Logs extensive profiling information.  See PetscLogDump().
.  -log [filename] - Logs basic profiling information.  See PetscLogDump().
-  -log_mpe [filename] - Creates a logfile viewable by the utility Jumpshot (in MPICH distribution)

    Only one of -log_trace, -log_summary, -log_all, -log, or -log_mpe may be used at a time

   Environmental Variables:
+   PETSC_TMP - alternative tmp directory
.   PETSC_SHARED_TMP - tmp is shared by all processes
.   PETSC_NOT_SHARED_TMP - each process has its own private tmp
.   PETSC_VIEWER_SOCKET_PORT - socket number to use for socket viewer
-   PETSC_VIEWER_SOCKET_MACHINE - machine to use for socket viewer to connect to


   Level: beginner

   Notes:
   If for some reason you must call MPI_Init() separately, call
   it before PetscInitialize().

   Fortran Version:
   In Fortran this routine has the format
$       call PetscInitialize(file,ierr)

+   ierr - error return code
-  file - [optional] PETSc database file; ~username/.petscrc and .petscrc are also checked. Use NULL_CHARACTER to not check for
          a code-specific file. Use -skip_petscrc in the code-specific file to skip the .petscrc files

   Important Fortran Note:
   In Fortran, you MUST use NULL_CHARACTER to indicate a
   null character string; you CANNOT just use NULL as
   in the C version. See the <a href="../../docs/manual.pdf">users manual</a> for details.

   If your main program is C but you call Fortran code that also uses PETSc you need to call PetscInitializeFortran() soon after
   calling PetscInitialize().

   Concepts: initializing PETSc

.seealso: PetscFinalize(), PetscInitializeFortran(), PetscGetArgs(), PetscInitializeNoArguments()

@*/
PetscErrorCode  PetscInitialize(int *argc,char ***args,const char file[],const char help[])
{
  PetscErrorCode ierr;
  PetscMPIInt    flag, size;
  PetscInt       nodesize;
  PetscBool      flg;
  char           hostname[256];

  PetscFunctionBegin;
  if (PetscInitializeCalled) PetscFunctionReturn(0);

  /* these must be initialized in a routine, not as a constant declaration*/
  PETSC_STDOUT = stdout;
  PETSC_STDERR = stderr;

  ierr = PetscOptionsCreate();CHKERRQ(ierr);

  /*
     We initialize the program name here (before MPI_Init()) because MPICH has a bug
     that sets args[0] on all processors to args[0] from the first processor.
  */
  if (argc && *argc) {
    ierr = PetscSetProgramName(**args);CHKERRQ(ierr);
  } else {
    ierr = PetscSetProgramName("Unknown Name");CHKERRQ(ierr);
  }

  ierr = MPI_Initialized(&flag);CHKERRQ(ierr);
  if (!flag) {
    if (PETSC_COMM_WORLD != MPI_COMM_NULL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"You cannot set PETSC_COMM_WORLD if you have not initialized MPI first");
#if defined(PETSC_HAVE_MPI_INIT_THREAD)
    {
      PetscMPIInt provided;
      ierr = MPI_Init_thread(argc,args,MPI_THREAD_FUNNELED,&provided);CHKERRQ(ierr);
    }
#else
    ierr = MPI_Init(argc,args);CHKERRQ(ierr);
#endif
    PetscBeganMPI = PETSC_TRUE;
  }
  if (argc && args) {
    PetscGlobalArgc = *argc;
    PetscGlobalArgs = *args;
  }
  PetscFinalizeCalled = PETSC_FALSE;

  if (PETSC_COMM_WORLD == MPI_COMM_NULL) PETSC_COMM_WORLD = MPI_COMM_WORLD;
  ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_RETURN);CHKERRQ(ierr);

  /* Done after init due to a bug in MPICH-GM? */
  ierr = PetscErrorPrintfInitialize();CHKERRQ(ierr);

  ierr = MPI_Comm_rank(MPI_COMM_WORLD,&PetscGlobalRank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(MPI_COMM_WORLD,&PetscGlobalSize);CHKERRQ(ierr);

  MPIU_BOOL = MPI_INT;
  MPIU_ENUM = MPI_INT;

  /*
     Initialize the global complex variable here because, with
     shared libraries, the constructors for global variables
     are not called (at least on IRIX).
  */
#if defined(PETSC_HAVE_COMPLEX)
  {
#if defined(PETSC_CLANGUAGE_CXX)
    PetscComplex ic(0.0,1.0);
    PETSC_i = ic;
#elif defined(PETSC_CLANGUAGE_C)
    PETSC_i = _Complex_I;
#endif
  }

#if !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)
  ierr = MPI_Type_contiguous(2,MPI_DOUBLE,&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr);
  ierr = MPI_Type_contiguous(2,MPI_FLOAT,&MPIU_C_COMPLEX);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU_C_COMPLEX);CHKERRQ(ierr);
#endif
#endif /* PETSC_HAVE_COMPLEX */

  /*
     Create the PETSc MPI reduction operator that sums the first
     half of the entries and maxes the second half.
  */
  ierr = MPI_Op_create(PetscMaxSum_Local,1,&PetscMaxSum_Op);CHKERRQ(ierr);

#if defined(PETSC_USE_REAL___FLOAT128)
  ierr = MPI_Type_contiguous(2,MPI_DOUBLE,&MPIU___FLOAT128);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU___FLOAT128);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPI_Type_contiguous(4,MPI_DOUBLE,&MPIU___COMPLEX128);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU___COMPLEX128);CHKERRQ(ierr);
#endif
  ierr = MPI_Op_create(PetscMax_Local,1,&MPIU_MAX);CHKERRQ(ierr);
  ierr = MPI_Op_create(PetscMin_Local,1,&MPIU_MIN);CHKERRQ(ierr);
#endif

#if (defined(PETSC_HAVE_COMPLEX) && !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)) || defined(PETSC_USE_REAL___FLOAT128)
  ierr = MPI_Op_create(PetscSum_Local,1,&MPIU_SUM);CHKERRQ(ierr);
#endif

  ierr = MPI_Type_contiguous(2,MPIU_SCALAR,&MPIU_2SCALAR);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU_2SCALAR);CHKERRQ(ierr);
  ierr = MPI_Op_create(PetscADMax_Local,1,&PetscADMax_Op);CHKERRQ(ierr);
  ierr = MPI_Op_create(PetscADMin_Local,1,&PetscADMin_Op);CHKERRQ(ierr);

#if defined(PETSC_USE_64BIT_INDICES) || !defined(MPI_2INT)
  ierr = MPI_Type_contiguous(2,MPIU_INT,&MPIU_2INT);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&MPIU_2INT);CHKERRQ(ierr);
#endif

  /*
     Attributes to be set on PETSc communicators
  */
  ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelCounter,&Petsc_Counter_keyval,(void*)0);CHKERRQ(ierr);
  ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelComm,&Petsc_InnerComm_keyval,(void*)0);CHKERRQ(ierr);
  ierr = MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelComm,&Petsc_OuterComm_keyval,(void*)0);CHKERRQ(ierr);

  /*
     Build the options database
  */
  ierr = PetscOptionsInsert(argc,args,file);CHKERRQ(ierr);


  /*
     Print main application help message
  */
  ierr = PetscOptionsHasName(NULL,"-help",&flg);CHKERRQ(ierr);
  if (help && flg) {
    ierr = PetscPrintf(PETSC_COMM_WORLD,help);CHKERRQ(ierr);
  }
  ierr = PetscOptionsCheckInitial_Private();CHKERRQ(ierr);

  /* SHOULD PUT IN GUARDS: Make sure logging is initialized, even if we do not print it out */
#if defined(PETSC_USE_LOG)
  ierr = PetscLogBegin_Private();CHKERRQ(ierr);
#endif

  /*
     Load the dynamic libraries (on machines that support them), this registers all
     the solvers etc. (On non-dynamic machines this initializes the PetscDraw and PetscViewer classes)
  */
  ierr = PetscInitialize_DynamicLibraries();CHKERRQ(ierr);

  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
  ierr = PetscInfo1(0,"PETSc successfully started: number of processors = %d\n",size);CHKERRQ(ierr);
  ierr = PetscGetHostName(hostname,256);CHKERRQ(ierr);
  ierr = PetscInfo1(0,"Running on machine: %s\n",hostname);CHKERRQ(ierr);

  ierr = PetscOptionsCheckInitial_Components();CHKERRQ(ierr);
  /* Check the options database for options related to the options database itself */
  ierr = PetscOptionsSetFromOptions();CHKERRQ(ierr);

#if defined(PETSC_USE_PETSC_MPI_EXTERNAL32)
  /*
      Tell MPI about our own data representation converter; this would/should be used if external32 is not supported by the MPI implementation.

      Currently not used because it is not supported by MPICH.
  */
#if !defined(PETSC_WORDS_BIGENDIAN)
  ierr = MPI_Register_datarep((char*)"petsc",PetscDataRep_read_conv_fn,PetscDataRep_write_conv_fn,PetscDataRep_extent_fn,NULL);CHKERRQ(ierr);
#endif
#endif

  ierr = PetscOptionsGetInt(NULL,"-hmpi_spawn_size",&nodesize,&flg);CHKERRQ(ierr);
  if (flg) {
#if defined(PETSC_HAVE_MPI_COMM_SPAWN)
    ierr = PetscHMPISpawn((PetscMPIInt) nodesize);CHKERRQ(ierr); /* worker nodes never return from here; they go directly to PetscEnd() */
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PETSc built without MPI 2 (MPI_Comm_spawn) support, use -hmpi_merge_size instead");
#endif
  } else {
    ierr = PetscOptionsGetInt(NULL,"-hmpi_merge_size",&nodesize,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = PetscHMPIMerge((PetscMPIInt) nodesize,NULL,NULL);CHKERRQ(ierr);
      if (PetscHMPIWorker) { /* if worker then never enter user code */
        PetscInitializeCalled = PETSC_TRUE;
        PetscEnd();
      }
    }
  }

#if defined(PETSC_HAVE_CUDA)
  {
    PetscMPIInt p;
    for (p = 0; p < PetscGlobalSize; ++p) {
      if (p == PetscGlobalRank) cublasInit();
      ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
    }
  }
#endif

  ierr = PetscOptionsHasName(NULL,"-python",&flg);CHKERRQ(ierr);
  if (flg) {
    PetscInitializeCalled = PETSC_TRUE;
    ierr = PetscPythonInitialize(NULL,NULL);CHKERRQ(ierr);
  }

  ierr = PetscThreadCommInitializePackage();CHKERRQ(ierr);

  /*
      Setup building of stack frames for all function calls
  */
#if defined(PETSC_USE_DEBUG)
  PetscThreadLocalRegister((PetscThreadKey*)&petscstack); /* Creates petscstack_key if needed */
  ierr = PetscStackCreate();CHKERRQ(ierr);
#endif

#if defined(PETSC_SERIALIZE_FUNCTIONS)
  ierr = PetscFPTCreate(10000);CHKERRQ(ierr);
#endif

  /*
      Once we are completely initialized we can set this variable
  */
  PetscInitializeCalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
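
The doc-comment above explains where PetscInitialize() belongs in a program. A minimal C usage sketch (the help string is illustrative, not from the original source) follows.

#include <petscsys.h>

static const char help[] = "Minimal PETSc program.\n";

int main( int argc, char **argv )
{
  PetscErrorCode ierr;

  /* Initializes PETSc and, if necessary, MPI; should be the first PETSc call. */
  ierr = PetscInitialize(&argc, &argv, NULL, help); if (ierr) return ierr;
  ierr = PetscPrintf(PETSC_COMM_WORLD, "PETSc initialized\n");CHKERRQ(ierr);
  /* PetscFinalize() also calls MPI_Finalize() if PetscInitialize() started MPI. */
  ierr = PetscFinalize();
  return ierr;
}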
Example #3
void mpi_op_create(int* function, int* commute, int* opc, int* ierr){
  MPI_Op op;
  *ierr = MPI_Op_create((MPI_User_function *)function, *commute, (MPI_Op *)&op);
  GET_MPI_OP(CtvAccess(mpi_opc)++) = op;
  *opc = CtvAccess(mpi_opc)-1;
}
Example #4
void ghost_init()
{
    MPI_Op_create(reduce_forces_sum, 1, &MPI_FORCES_SUM);
}
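
reduce_forces_sum is defined elsewhere in that project and is not part of this excerpt. Once the commutative operation MPI_FORCES_SUM has been created, it can be passed to the usual reduction calls; the sketch below is illustrative only (the buffer names and count are hypothetical, not taken from the original code).

/* Hypothetical usage of the user-created op in a reduction. */
double local_forces[3], global_forces[3];
MPI_Allreduce( local_forces, global_forces, 3, MPI_DOUBLE,
               MPI_FORCES_SUM, MPI_COMM_WORLD );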
Example #5
int Zoltan_RB_find_median(
  int Tflops_Special,   /* Flag indicating whether Tflops_Special handling 
                           of communicators should be done (to avoid memory
                           leaks in tflops' Comm_Dup and Comm_Split).        */
  double *dots,         /* array of coordinates                              */
  double *wgts,         /* array of weights associated with dots, or NULL    */
  double uniformWeight, /* weight of every dot, or 0.0 if wgts != NULL */ 
  int *dotmark,         /* returned list of which side of the median
                           each dot is on:
                                0 - dot is < valuehalf
                                1 - dot is > valuehalf                       */
  int dotnum,           /* number of dots (length of three previous arrays)  */
  int proc,             /* this proc number (rank)                           */
  double *fractionlo,   /* fraction of weight that should be in bottom half  */
  MPI_Comm local_comm,  /* MPI communicator on which to find median          */
  double *valuehalf,    /* on entry - first guess at median (if first_guess set)
                           on exit - the median value                        */
  int first_guess,      /* if set, use value in valuehalf as first guess     */
  int nprocs,           /* Total number of processors (Tflops_Special)       */
  int num_procs,        /* Number of procs in set (Tflops_Special)     */
  int proclower,        /* Lowest numbered proc in set (Tflops_Special)*/
  int num_parts,        /* Number of partitions in set (Tflops_Special) */
  int wgtflag,          /* Number of user supplied weights in wgts array */
  double valuemin,      /* minimum value in partition (input) */
  double valuemax,      /* maximum value in partition (input) */
  double weight,        /* weight of entire partition (input) */
  double *wgtlo,        /* weight of lower partition (output) */
  double *wgthi,        /* weight of upper partition (output) */
  int    *dotlist,      /* list of active dots */
  int rectilinear_blocks,/*if set all dots with same value on same side of cut*/
  int average_cuts      /* force cut to be halfway between two closest dots. */
)
{
/* Local declarations. */
  struct median med, medme;          /* median data */


  double  wtmax, wtsum, wtok, wtupto;/* temporary wts */
  double  tolerance;                 /* largest single weight of a dot */
  double  targetlo, targethi;        /* desired wt in lower half */
  double  weightlo, weighthi;        /* wt in lower/upper half of non-active */
  double  tmp_half = 0.0;
  double  tmp_wgt;
  double *w;

  int     i, j, k, numlist;
  int     first_iteration;
  int     indexlo=0, indexhi=0;      /* indices of dot closest to median */
  int     breakflag;                 /* for breaking out of median iteration */
  int     markactive;                /* which side of cut is active = 0/1 */
  int     rank=0;                    /* rank in partition (Tflops_Special) */
  int     loopCount=0;

  /* MPI data types and user functions */

  MPI_Op            med_op;
  MPI_Datatype      med_type;
  MPI_User_function Zoltan_RB_median_merge;

#ifdef WATCH_MEDIAN_FIND
  char debugText[64];
  double initmin, initmax;
  if (first_guess)
    sprintf(debugText,"(%d - %d) first guess %lf ",proclower,proclower+num_procs-1,*valuehalf);
  else
    sprintf(debugText,"(%d - %d) ",proclower,proclower+num_procs-1);
  initmin = valuemin;
  initmax = valuemax;
#endif

  rank = proc - proclower;

/***************************** BEGIN EXECUTION ******************************/

  /* create MPI data and function types for box and median */

  MPI_Type_contiguous(sizeof(struct median),MPI_CHAR,&med_type);
  MPI_Type_commit(&med_type);

  if (!Tflops_Special)
     MPI_Op_create(&Zoltan_RB_median_merge,1,&med_op);

  /*
   * initialize the dotlist array
   * while looping through, find:
   *	wtmax		- max weight on this proc
   *
   * weight = summed weight of entire partition
   * search tolerance = largest single weight (plus epsilon)
   * targetlo = desired weight in lower half of partition
   * targethi = desired weight in upper half of partition
   */
  wtmax = 0.0;
  numlist = dotnum;
  w = wgts;                     /* we use 1st weight only */
  for (i = 0; i < dotnum;i++) {
    dotlist[i] = i;
    if (wgtflag){
      if (*w > wtmax) wtmax = *w;
      w += wgtflag;
    }
  }

  if (Tflops_Special) {
    if (wgtflag) {

      /* find tolerance (max of wtmax) */
      tolerance = wtmax;
      Zoltan_RB_max_double(&tolerance, 1, proclower, rank, num_procs, local_comm);
    }
    else 
      tolerance = 1.0;   /* if user did not supply weights, all are 1.0 */
  }
  else {
    if (wgtflag)
      MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,local_comm);
    else
      tolerance = 1.0;   /* if user did not supply weights, all are 1.0 */
  }

  tolerance *= 0.5 + TINY;  /* ctv - changed from        1.0 + TINY
               The larger tolerance allowed one side of the cut to be larger
               than the target weight by a node of largest weight (the other
               side would be smaller by the same amount).  In that case a
               node of largest weight could be moved from the side whose weight
               is larger than its target to the other side and they would both
               be in balance with the target weight.  A tolerance less than
               half of the largest weight would allow infinite looping as a
               node of largest weight was passed back and forth. */
  targetlo = fractionlo[0] * weight;
  targethi = weight - targetlo;

  /* weightlo/hi = total weight in non-active parts of partition */
  weighthi = weightlo = 0.0;

  first_iteration = 1;

  /* median iteration */
  /* zoom in on bisector until correct # of dots in each half of partition */
  /* as each iteration of median-loop begins, require:
          all non-active dots are marked with 0/1 in dotmark
          valuemin <= every active dot <= valuemax
          weightlo, weighthi = total wt of non-active dots */
  /* when leave median-loop, require only:
          valuehalf = correct cut position
          all dots <= valuehalf are marked with 0 in dotmark
          all dots >= valuehalf are marked with 1 in dotmark */

  if (!Tflops_Special || num_procs > 1) { /* don't need to go thru if only
                                             one proc with Tflops_Special. 
                                             Input argument Tflops_Special 
                                             should be 0 for
                                             serial partitioning. */
    while (1) {

    /* choose bisector value */
    /* use old value on 1st iteration if old cut dimension is the same */
    /* on 2nd option: could push valuehalf towards geometric center 
       with "1.0-factor" to force overshoot */


      if (first_iteration && first_guess) {
        tmp_half = *valuehalf;
        if (tmp_half < valuemin || tmp_half > valuemax)
          tmp_half = 0.5 * (valuemin + valuemax);
      }
      else if (weight)
        tmp_half = valuemin + (targetlo - weightlo) /
                    (weight - weightlo - weighthi) * (valuemax - valuemin);
      else
        tmp_half = 0.5 * (valuemin + valuemax);

      first_iteration = 0;
      /* initialize local median data structure */

      medme.totallo = medme.totalhi = 0.0;
      medme.valuelo = -DBL_MAX;
      medme.valuehi = DBL_MAX;
      medme.wtlo = medme.wthi = 0.0;
      medme.countlo = medme.counthi = 0;
      medme.proclo = medme.prochi = proc;

      /* mark all active dots on one side or other of bisector */
      /* also set all fields in median data struct */
      /* save indices of closest dots on either side */

      for (j = 0; j < numlist; j++) {
        i = dotlist[j];
        tmp_wgt = (wgts ? wgts[i*wgtflag] : uniformWeight);
        if (dots[i] <= tmp_half) {            /* in lower part */
          medme.totallo += tmp_wgt;
          dotmark[i] = 0;
          if (dots[i] > medme.valuelo) {       /* my closest dot */
            medme.valuelo = dots[i];
            medme.wtlo = tmp_wgt;
            medme.countlo = 1;
            indexlo = i;
          }                                            /* tied for closest */
          else if (dots[i] == medme.valuelo) {
            medme.wtlo += tmp_wgt;
            medme.countlo++;
          }
        }
        else {                                         /* in upper part */
          medme.totalhi += tmp_wgt;
          dotmark[i] = 1;
          if (dots[i] < medme.valuehi) {       /* my closest dot */
            medme.valuehi = dots[i];
            medme.wthi = tmp_wgt;
            medme.counthi = 1;
            indexhi = i;
          }                                            /* tied for closest */
          else if (dots[i] == medme.valuehi) {
            medme.wthi += tmp_wgt;
            medme.counthi++;
          }
        }
      }

      med.totallo = med.totalhi = 0.0;
      med.valuelo = -DBL_MAX;
      med.valuehi = DBL_MAX;
      med.wtlo = med.wthi = 0.0;
      med.countlo = med.counthi = 0;
      med.proclo = med.prochi = proc;

      loopCount++;

      if (Tflops_Special) {
         i = 1;
         Zoltan_RB_reduce(num_procs, rank, proc, (void *) &medme, (void *) &med,
                          sizeof(medme), &i, med_type, local_comm, 
                          Zoltan_RB_median_merge);
      }
      else {
         MPI_Allreduce(&medme,&med,1,med_type,med_op,local_comm);
      }

      /* test median guess for convergence */
      /* move additional dots that are next to cut across it */

      if (weightlo + med.totallo < targetlo) {    /* lower half TOO SMALL */

        weightlo += med.totallo;
        tmp_half = med.valuehi;

        if (med.counthi == 1) {                  /* only one dot to move */
          if (weightlo + med.wthi < targetlo) {  /* move it, keep iterating */
            if (proc == med.prochi) dotmark[indexhi] = 0;
          }
          else {                                 /* only move if beneficial */
            if (weightlo + med.wthi - targetlo < targetlo - weightlo) {
              if (proc == med.prochi) dotmark[indexhi] = 0;
              weightlo += med.wthi;
            }
            weighthi = weight - weightlo;
            break;                               /* all done */
          }
        }
        else {                                   /* multiple dots to move */
          breakflag = 0;
          wtok = 0.0;
          if (medme.valuehi == med.valuehi) wtok = medme.wthi;   
          if (weightlo + med.wthi >= targetlo) {                /* all done */
            if (rectilinear_blocks) {
              if (weightlo + med.wthi - targetlo > targetlo - weightlo)
                wtok = 0.0;                      /* don't move if moving group
                                                    of dots has worse balance*/
            } else {
              if (Tflops_Special)
                Zoltan_RB_scan_double(&wtok, &wtupto, 1, local_comm, 
                                      proc, rank, num_procs);
              else
                MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,local_comm);
              wtmax = targetlo - weightlo;
              if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
            }
            breakflag = 1;
          }                                      /* wtok = most I can move */
          for (j = 0, wtsum = 0.0; j < numlist && wtsum < wtok; j++) {
            i = dotlist[j];
            if (dots[i] == med.valuehi) { /* only move if better */
              tmp_wgt = (wgts ? wgts[i*wgtflag] : uniformWeight);
              if (wtsum + tmp_wgt - wtok < wtok - wtsum) {
                dotmark[i] = 0;
                wtsum += tmp_wgt;  /* KDD Moved sum inside if test 1/2002 */
              }
            }
          }
          if (breakflag) {                        /* done if moved enough */
            if (Tflops_Special) {
              wtok = wtsum;
              Zoltan_RB_sum_double(&wtok, 1, proclower, rank, num_procs, 
                                   local_comm);
            }
            else
              MPI_Allreduce(&wtsum, &wtok, 1, MPI_DOUBLE, MPI_SUM, local_comm);
            weightlo += wtok;
            weighthi = weight - weightlo;
            break;
          }
        }

        weightlo += med.wthi;
        if (targetlo-weightlo <= tolerance) {     /* close enough */
           weighthi = weight - weightlo;
           break;
        }

        valuemin = med.valuehi;                   /* iterate again */
        markactive = 1;
      }

      else if (weighthi + med.totalhi < targethi) {  /* upper half TOO SMALL */

        weighthi += med.totalhi;
        tmp_half = med.valuelo;

        if (med.countlo == 1) {                  /* only one dot to move */
          if (weighthi + med.wtlo < targethi) {  /* move it, keep iterating */
            if (proc == med.proclo) dotmark[indexlo] = 1;
          }
          else {                                 /* only move if beneficial */
            if (weighthi + med.wtlo - targethi < targethi - weighthi) {
              if (proc == med.proclo) dotmark[indexlo] = 1;
              weighthi += med.wtlo;
            }
            weightlo = weight - weighthi;
            break;                               /* all done */
          }
        }
        else {                                   /* multiple dots to move */
          breakflag = 0;
          wtok = 0.0;
          if (medme.valuelo == med.valuelo) wtok = medme.wtlo;   
          if (weighthi + med.wtlo >= targethi) {                /* all done */
            if (rectilinear_blocks) {
              if (weighthi + med.wtlo - targethi > targethi - weighthi)
                wtok = 0.0;                      /* don't move if moving group
                                                    of dots has worse balance*/
            } else {
              if (Tflops_Special)
                Zoltan_RB_scan_double(&wtok, &wtupto, 1, local_comm, 
                                      proc, rank, num_procs);
              else
                MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,local_comm);
              wtmax = targethi - weighthi;
              if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
            }
            breakflag = 1;
          }                                      /* wtok = most I can move */
          for (j = 0, wtsum = 0.0; j < numlist && wtsum < wtok; j++) {
            i = dotlist[j];
            if (dots[i] == med.valuelo) { /* only move if better */
              tmp_wgt = (wgts ? wgts[i*wgtflag] : uniformWeight);
              if (wtsum + tmp_wgt - wtok < wtok - wtsum) {
                dotmark[i] = 1;
                wtsum += tmp_wgt; /* KDD Moved sum inside if test 1/2002 */
              }
            }
          }
          if (breakflag) {                        /* done if moved enough */
            if (Tflops_Special) {
              wtok = wtsum;
              Zoltan_RB_sum_double(&wtok, 1, proclower, rank, num_procs, local_comm);
            }
            else
              MPI_Allreduce(&wtsum, &wtok, 1, MPI_DOUBLE, MPI_SUM, local_comm);
            weighthi += wtok;
            weightlo = weight - weighthi;
            break;
          }
        }

        weighthi += med.wtlo;
        if (targethi-weighthi <= tolerance) {     /* close enough */
          weightlo = weight - weighthi;
          break;
        }

        valuemax = med.valuelo;                   /* iterate again */
        markactive = 0;
      }

      else {                /* Goldilocks result: both partitions JUST RIGHT */
        weightlo += med.totallo;
        weighthi += med.totalhi;
        break;
      }

      /* shrink the active list */
      
      k = 0;
      for (j = 0; j < numlist; j++) {
        i = dotlist[j];
        if (dotmark[i] == markactive) dotlist[k++] = i;
      }
      numlist = k;
    }
  }
  else { /* if one processor set all dots to 0 (Tflops_Special) */
    for (i = 0; i < numlist; i++)
      dotmark[i] = 0;
    weightlo = weight;
    weighthi = 0.;
    tmp_half = valuemax;
  }

  /* found median */
  *valuehalf = tmp_half;

  if (average_cuts) 
    *valuehalf = Zoltan_RB_Average_Cut(Tflops_Special, dots, dotmark, dotnum,
                                       num_procs, rank, proc, local_comm,
                                       *valuehalf);

  *wgtlo = weightlo;
  *wgthi = weighthi;

  MPI_Type_free(&med_type);
  if (!Tflops_Special)
     MPI_Op_free(&med_op);

#ifdef WATCH_MEDIAN_FIND
  if ((num_procs>1) && (rank==0)){
    fprintf(stderr,"%s loop count %d interval size %d median (%lf - %lf) %lf\n",
      debugText, loopCount,dotnum, initmin, initmax, *valuehalf);
  }
#endif

  par_median_accumulate_counts(nprocs, num_procs, rank, loopCount);

  return 1;
}