Ejemplo n.º 1
0
// Event callback for host check data.
//
// `data` is treated as a nebstruct_host_check_data. Only records whose type
// is NEBTYPE_HOSTCHECK_PROCESSED are handled: the record is serialised with
// nebstruct_host_check_data_to_json, a key of the form
// "<connector>.<eventsource>.check.component.<host>" is built, and the pair
// is handed to either amqp_publish or n2a_record_cache depending on c_size.
//
// Always returns 0.
int
n2a_event_host_check (int event_type __attribute__ ((__unused__)), void *data)
{
  nebstruct_host_check_data *check = (nebstruct_host_check_data *) data;
  char *payload = NULL;
  char *routing_key = NULL;
  size_t wanted;

  // Anything other than a processed host check is ignored.
  if (check->type != NEBTYPE_HOSTCHECK_PROCESSED)
    return 0;

  // Length of the three variable parts plus 20 bytes of headroom for the
  // fixed text of the key format and its terminator.
  wanted = xstrlen(g_options.connector) + xstrlen(g_options.eventsource_name) + xstrlen(check->host_name) + 20;

  nebstruct_host_check_data_to_json(&payload, check);

  // The key buffer comes from xalloca and must NOT be freed here
  // (presumably stack storage -- the original code carries the same warning).
  // Its size is capped at g_options.max_size.
  xalloca(routing_key, xmin(g_options.max_size, (int)wanted) * sizeof(char));

  snprintf(routing_key, xmin(g_options.max_size, (int)wanted),
           "%s.%s.check.component.%s", g_options.connector,
           g_options.eventsource_name, check->host_name);

  // Publish directly or record in the cache, as selected by c_size.
  if (c_size == -10000 || c_size / 2 == 0)
    amqp_publish(routing_key, payload);
  else
    n2a_record_cache (routing_key, payload);

  xfree(payload);

  return 0;
}
Ejemplo n.º 2
0
/*
 * Front end for rs_rec_ma.
 *
 *   s       elements to process; NULL is rejected.
 *   sx      optional scratch elements; when NULL or holding fewer than one
 *           element, a local one-element buffer is allocated for the call
 *           and released again before returning.
 *   rhigh   highest radix bit; a negative value selects key_radix_high.
 *   rlow    lowest radix bit; a negative value selects key_radix_low.
 *   rwidth  radix width; a value <= 0 selects sort_radix_width_default.
 *           The width passed on is capped at sort_radix_width_max.
 *
 * Returns -1 if s is NULL, otherwise 0 (including when s holds fewer than
 * two elements, in which case nothing is done).
 */
slint_t sort_radix_ma(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth) /* sl_proto, sl_func sort_radix_ma */
{
  elements_t _sx;

  if (s == NULL) return -1;

  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_radix);

  /* No usable scratch supplied: fall back to a local one-element buffer.
     (The former "else if (sx->size < 1) return -1;" could never be reached,
     because this condition already covers sx->size < 1; it also would have
     returned without stopping the timer.) */
  if (sx == NULL || sx->size < 1)
  {
    sx = &_sx;
    elements_alloc(sx, 1, SLCM_ALL);
  }

  if (rhigh < 0) rhigh = key_radix_high;
  if (rlow < 0) rlow = key_radix_low;
  if (rwidth <= 0) rwidth = sort_radix_width_default;

  rs_rec_ma(s, sx, rhigh, rlow, xmin(rwidth, sort_radix_width_max));

  /* Only the locally allocated scratch is ours to free. */
  if (sx == &_sx) elements_free(sx);

  rti_tstop(rti_tid_sort_radix);

  return 0;
}
Ejemplo n.º 3
0
/*
 * Front end for sort_permute_forward_ / sort_permute_forward_masked.
 *
 *   s         elements to process; NULL is rejected.
 *   sx        optional scratch elements; when NULL or holding fewer than one
 *             element a local one-element buffer is used for the call.
 *   perm      forwarded unchanged to the worker.
 *   offset    forwarded unchanged to the worker.
 *   mask_bit  negative: call the unmasked worker; otherwise the masked worker
 *             is called with the single-bit mask 1L << mask_bit, the shift
 *             being capped at the bit width of slint minus one.
 *
 * Returns -1 if s is NULL, otherwise 0.
 */
slint sort_permute_forward(elements_t *s, elements_t *sx, slint *perm, slint offset, slint mask_bit) /* sl_proto, sl_func sort_permute_forward */
{
  elements_t local_sx;
  int local_sx_used = 0;

  if (s == NULL) return -1;
  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_permute_forward);

  /* Provide our own scratch element when the caller gave none. */
  if (sx == NULL || sx->size < 1)
  {
    sx = &local_sx;
    elements_alloc(sx, 1, SLCM_ALL);
    local_sx_used = 1;
  }

  if (mask_bit >= 0)
    sort_permute_forward_masked(s, sx, perm, offset, 1L << xmin(mask_bit, (sizeof(slint) * 8) - 1));
  else
    sort_permute_forward_(s, sx, perm, offset);

  /* Release the scratch only if it was allocated here. */
  if (local_sx_used) elements_free(sx);

  rti_tstop(rti_tid_sort_permute_forward);

  return 0;
}
Ejemplo n.º 4
0
bool SmoothConstrainedInterpolator::ProjectVelocity(const Config& x,Config& v)
{
  constraint->PreEval(x);
  Matrix J;
  constraint->Jacobian(x,J);
  if(!xmin.empty()) {
    //look through active contraints, set that column to 0
    for(int i=0;i<x.n;i++) {
      if(x(i)==xmin(i) || x(i) == xmax(i)) { 
	v(i) = 0;
	for(int j=0;j<J.m;j++)
	  J(j,i) = 0;
      }
    }
  }
  RobustSVD<Real> svd;
  bool res=svd.set(J);
  if(!res) {
    fprintf(stderr,"SmoothConstrainedInterpolator: Numerical error projecting velocity?\n");
    return false;
  }
  Vector temp;
  svd.nullspaceComponent(v,temp);
  v -= temp;
  return true;
}
Ejemplo n.º 5
0
void Instance::transformBoundingBox()
{
    auto b = i->getBoundingBox();
    
    BoundingBox bb;
    
    for(int i = 0; i <= RayTracer::getInstance()->maxTime; ++i)
    {
        std::set<double> x,y,z;
        
        auto m = makeMatrices(i);
        
        auto p = Vector(b.xmin(i),0,0);
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
        
        p = Vector(b.xmax(i),0,0);
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
        
        p = Vector(0,b.ymin(i),0);
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
        
        p = Vector(0,b.ymax(i),0);
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
        
        p = Vector(0,0,b.zmin(i));
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
        
        p = Vector(0,0,b.zmax(i));
        p = transformLoc(m.first, p);
        x.insert(p.x);
        y.insert(p.y);
        z.insert(p.z);
               
        bb.xmin.addFrame(i, *x.begin());
        bb.xmax.addFrame(i, *x.rbegin());
        bb.ymin.addFrame(i, *y.begin());
        bb.ymax.addFrame(i, *y.rbegin());
        bb.zmin.addFrame(i, *z.begin());
        bb.zmax.addFrame(i, *z.rbegin());
    }
    bbox = bb;
}
Ejemplo n.º 6
0
/*
 * Prepares the radix state in bm for the next round.
 *
 * rcurrent becomes the smaller of rwidth and the number of bits left between
 * rlow and rhigh (inclusive). When rcurrent is positive, rhigh is lowered by
 * rcurrent - 1 and nbins is set to powof2(rcurrent); otherwise rhigh is
 * reduced by itself and a single bin is used. bit_mask is nbins - 1.
 *
 * Always returns 0.
 */
slint_t binning_radix_pre(binning_t *bm) /* sl_proto, sl_func binning_radix_pre */
{
  bm->bd.radix.rcurrent = xmin(bm->bd.radix.rwidth, bm->bd.radix.rhigh - bm->bd.radix.rlow + 1);

  if (bm->bd.radix.rcurrent > 0)
  {
    bm->bd.radix.rhigh -= bm->bd.radix.rcurrent - 1;
    bm->nbins = powof2(bm->bd.radix.rcurrent);

  } else
  {
    /* no bits left to look at: collapse to one bin */
    bm->bd.radix.rhigh -= bm->bd.radix.rhigh;
    bm->nbins = 1;
  }

  bm->bd.radix.bit_mask = bm->nbins - 1;

  return 0;
}
Ejemplo n.º 7
0
// Meta-call dispatcher for QDeclarativeDrag.
// NOTE(review): this has the shape of moc-generated code -- if so, regenerate
// it rather than editing by hand.
//
// The call is first offered to QObject::qt_metacall; a negative result means
// it was consumed there. Otherwise _id is interpreted relative to this class:
//   - InvokeMetaMethod: ids 0..7 are forwarded to qt_static_metacall.
//   - ReadProperty / WriteProperty / ResetProperty: _id selects one of the
//     eight properties (target, axis, xmin, xmax, ymin, ymax, active,
//     filterChildren) and _a[0] points at the value storage.
// In every handled branch 8 is subtracted from _id before returning, and the
// adjusted _id is the return value.
int QDeclarativeDrag::qt_metacall(QMetaObject::Call _c, int _id, void **_a)
{
    _id = QObject::qt_metacall(_c, _id, _a);
    if (_id < 0)
        return _id;
    if (_c == QMetaObject::InvokeMetaMethod) {
        if (_id < 8)
            qt_static_metacall(this, _c, _id, _a);
        _id -= 8;
    }
#ifndef QT_NO_PROPERTIES
      else if (_c == QMetaObject::ReadProperty) {
        void *_v = _a[0];
        switch (_id) {
        case 0: *reinterpret_cast< QGraphicsObject**>(_v) = target(); break;
        case 1: *reinterpret_cast< Axis*>(_v) = axis(); break;
        case 2: *reinterpret_cast< qreal*>(_v) = xmin(); break;
        case 3: *reinterpret_cast< qreal*>(_v) = xmax(); break;
        case 4: *reinterpret_cast< qreal*>(_v) = ymin(); break;
        case 5: *reinterpret_cast< qreal*>(_v) = ymax(); break;
        case 6: *reinterpret_cast< bool*>(_v) = active(); break;
        case 7: *reinterpret_cast< bool*>(_v) = filterChildren(); break;
        }
        _id -= 8;
    } else if (_c == QMetaObject::WriteProperty) {
        void *_v = _a[0];
        switch (_id) {
        case 0: setTarget(*reinterpret_cast< QGraphicsObject**>(_v)); break;
        case 1: setAxis(*reinterpret_cast< Axis*>(_v)); break;
        case 2: setXmin(*reinterpret_cast< qreal*>(_v)); break;
        case 3: setXmax(*reinterpret_cast< qreal*>(_v)); break;
        case 4: setYmin(*reinterpret_cast< qreal*>(_v)); break;
        case 5: setYmax(*reinterpret_cast< qreal*>(_v)); break;
        // no case 6: property 6 (active) has a reader above but no setter here
        case 7: setFilterChildren(*reinterpret_cast< bool*>(_v)); break;
        }
        _id -= 8;
    } else if (_c == QMetaObject::ResetProperty) {
        // only property 0 (target) has a reset function
        switch (_id) {
        case 0: resetTarget(); break;
        }
        _id -= 8;
    } else if (_c == QMetaObject::QueryPropertyDesignable) {
        _id -= 8;
    } else if (_c == QMetaObject::QueryPropertyScriptable) {
        _id -= 8;
    } else if (_c == QMetaObject::QueryPropertyStored) {
        _id -= 8;
    } else if (_c == QMetaObject::QueryPropertyEditable) {
        _id -= 8;
    } else if (_c == QMetaObject::QueryPropertyUser) {
        _id -= 8;
    }
#endif // QT_NO_PROPERTIES
    return _id;
}
Ejemplo n.º 8
0
/*
 * Returns a newly allocated, NUL-terminated copy of part of src.
 *
 *   begin  start offset; a negative value counts back from the end of src
 *          (clamped to 0), a positive value is clamped to the length of src.
 *   len    maximum number of characters to copy; -1 means "the length of src".
 *
 * The buffer is obtained from malloc_w and is sized for
 * min(strlen(src), len) characters plus the terminator. Returns NULL when
 * that allocation fails.
 */
static char *xstrsub(const char *src, int begin, int len)
{
	size_t src_len = strlen(src);
	int wanted = (len==-1) ? (int)src_len : len;
	int start;
	char *out;

	out=(char *)malloc_w((xmin(src_len, wanted)+1)*sizeof(char), __func__);
	if(!out)
		return NULL;

	/* resolve the start offset, keeping it inside [0, src_len] */
	if(begin<0)
		start=xmax((int) src_len+begin, 0);
	else
		start=xmin(src_len, begin);

	/* strncpy stops at the end of src and zero-fills the remainder */
	strncpy(out, src+start, xmin(src_len, wanted));
	out[xmin(src_len, wanted)] = '\0';

	return out;
}
Ejemplo n.º 9
0
 /// Area of the box: width() * height(), or 0 when the box is inverted
 /// (xmax < xmin or ymax < ymin).
 float size() const
 {
     const bool inverted = (xmax() < xmin()) || (ymax() < ymin());
     if (inverted)
     {
         return 0.0f;
     }
     return width() * height();
 }
Ejemplo n.º 10
0
/// Centres the view horizontally on [a, b] and steps the zoom level until the
/// visible x-extent (xmax() - xmin()) is at least b - a.
///
/// If the view is wider than requested, the zoom level is decreased until it
/// no longer is, then increased by one step if that overshoot made the view
/// narrower than the request. If the view is not wider than requested, the
/// zoom level is increased until the view is no longer narrower.
/// The vertical centre is left unchanged; update() is called at the end.
void CartesianWidget::setXRange(double a, double b)
{
    const double requested = b-a;

    setCenter((a+b)/2.0, centerY());

    if (xmax()-xmin() > requested)
    {
        // Shrink the visible extent step by step ...
        while ( xmax()-xmin() > requested )
            setZoomLevel(zoomLevel()-1);

        // ... and back off one step if the last one went too far.
        if ( xmax()-xmin() < requested )
            setZoomLevel(zoomLevel()+1);
    }
    else
    {
        // Grow the visible extent until the requested range fits.
        while ( xmax()-xmin() < requested )
            setZoomLevel(zoomLevel()+1);
    }

    update();
}
Ejemplo n.º 11
0
// Replaces the backing array with a freshly allocated one of newSize elements.
// The first min(newSize, m_nMax) elements are copy-assigned from the old
// array, the old array is deleted, and m_nMax becomes newSize. m_nNum is not
// touched. The XBREAK checks flag a missing array, a count above capacity, a
// non-positive size, and a request that is no larger than both the current
// capacity and the current count.
void XArrayLinear<T>::Resize( int newSize ) {
	XBREAK( m_pArray == NULL );
	XBREAK( m_nNum > m_nMax );
	XBREAK( newSize <= 0 );
	XBREAK( newSize <= m_nMax && newSize <= m_nNum );
	T *pFresh = new T[ newSize ];
	// Move the existing contents into the new buffer.
	const int nCopy = xmin( newSize, m_nMax );
	int idx = 0;
	while( idx < nCopy ) {
		pFresh[ idx ] = m_pArray[ idx ];
		++idx;
	}
	SAFE_DELETE_ARRAY( m_pArray );
	m_pArray = pFresh;
	m_nMax = newSize;
}
Ejemplo n.º 12
0
/* single even-stage
 *
 * Returns the partner rank for the given rank: rank + 1 for odd ranks and
 * rank - 1 for even ranks, clamped to [0, size - 1]. There is exactly one
 * stage. snp is not used. When up is non-NULL, *up is set to 0.
 */
slint sn_even(slint size, slint rank, slint stage, void *snp, slint *up) /* sl_proto, sl_func sn_even */
{
  slint stages = 1;
  slint partner;

  /* a rank outside the range is reported as 'finished' */
  if (rank >= size) return -1;
  /* a negative stage is a query for the number of stages */
  if (stage < 0) return stages;
  /* a stage beyond the last one is reported as 'finished' */
  if (stage >= stages) return -1;

  if (up != NULL) *up = 0;

  if (1 == rank % 2) partner = rank + 1;
  else partner = rank - 1;

  return xmax(0, xmin(size - 1, partner));
}
/// Default constructor: zero-initialises every member, then calls set() with
/// a 1x1x1 cell layout spanning (0,0,0) to (1,1,1).
AccelerationGrid::AccelerationGrid() :
m_cells(0,0,0),
m_elementidxs(0),
m_elementxmins(0),
m_elementxmaxs(0),
m_elementquery(0),
m_lastquery(0),
m_gridxmin(0,0,0),
m_gridxmax(0,0,0),
m_cellsize(0,0,0),
m_invcellsize(0,0,0)
{
    Vec3st unit_dims(1,1,1);
    Vec3d lower_corner(0,0,0);
    Vec3d upper_corner(1,1,1);
    set(unit_dims, lower_corner, upper_corner);
}
Ejemplo n.º 14
0
/**
 * Computes the maximum flow from `start` to `finish` by repeatedly finding an
 * augmenting path with a breadth-first search and pushing the path's
 * bottleneck capacity along it.
 *
 * @param matrix  n-by-n capacity matrix; matrix[u][v] is the remaining
 *                capacity of edge u -> v. It is updated in place and holds
 *                the residual capacities on return.
 * @param n       number of nodes.
 * @param start   source node index.
 * @param finish  sink node index.
 * @return the total flow pushed; 0 when n <= 0, when either node index is out
 *         of range, or when start == finish.
 */
int max_flow(int** matrix, int n, int start, int finish) {
    // Degenerate requests carry no flow. This also guarantees that every
    // augmenting path found below has at least one edge.
    if (n <= 0 || start < 0 || start >= n || finish < 0 || finish >= n ||
        start == finish) {
        return 0;
    }

    int maxFlow = 0;
    // std::deque doubles as the indexable storage here so the block needs no
    // header beyond the ones it already relied on (and no variable-length
    // arrays, which are not standard C++).
    std::deque< int > from(n, -1);       // predecessor of each node on the current path
    std::deque< bool > visited(n, false);

    for (;;) {
        std::fill(visited.begin(), visited.end(), false);
        std::deque< int > pointQueue;
        pointQueue.push_back(start);
        visited[start] = true;

        // Breadth-first search over edges with remaining capacity; stop as
        // soon as the sink has been reached.
        while (!pointQueue.empty() && !visited[finish]) {
            const int curPos = pointQueue.front();
            pointQueue.pop_front();
            for (int i = 0; i < n; i++) {
                if (!visited[i] && matrix[curPos][i] > 0) {
                    from[i] = curPos;
                    visited[i] = true;
                    pointQueue.push_back(i);
                }
            }
        }
        if (!visited[finish]) {
            break;  // no augmenting path left
        }

        // Bottleneck = smallest remaining capacity on the path. Seed it with
        // the last edge; seeding with -1 (as before) made the minimum always
        // -1, which increased capacities and never terminated.
        int bottleneck = matrix[from[finish]][finish];
        for (int v = from[finish]; v != start; v = from[v]) {
            bottleneck = std::min(bottleneck, matrix[from[v]][v]);
        }

        // Push the flow: reduce forward capacity, grow the reverse capacity.
        for (int v = finish; v != start; v = from[v]) {
            matrix[from[v]][v] -= bottleneck;
            matrix[v][from[v]] += bottleneck;
        }
        maxFlow += bottleneck;
    }
    return maxFlow;
}
// ---------------------------------------------------------------------------
//
// ---------------------------------------------------------------------------
// Reports the reference X or Y of this element as the smallest value that
// its children report for the same attribute. With no children, aValue is
// left at KMAXFLOATFIX. Every other attribute id is delegated to
// CSvgElementImpl::GetAttributeFloat and its result is returned; for
// KAtrRefX / KAtrRefY the result is KErrNone.
TInt CSvgStyleElementImpl::GetAttributeFloat( const TInt aNameId,
                                              TFloatFixPt& aValue )
    {
    if ( aNameId != KAtrRefX && aNameId != KAtrRefY )
        {
        return CSvgElementImpl::GetAttributeFloat( aNameId, aValue );
        }

    // Start from the largest representable value and take the running minimum.
    TFloatFixPt lowest( KMAXFLOATFIX );
    TFloatFixPt current;
    for ( CSvgElementImpl* lChild = ( CSvgElementImpl* ) FirstChild();
          lChild != NULL;
          lChild = ( CSvgElementImpl * ) lChild->NextSibling() )
        {
        lChild->GetAttributeFloat( aNameId, current );
        if ( current < lowest )
            lowest = current;
        }
    aValue = lowest;

    return KErrNone;
    }
Ejemplo n.º 16
0
/*
 * Front end for rs_rec_af.
 *
 *   s       elements to process; NULL is rejected.
 *   sx      optional scratch elements; when NULL or holding fewer than one
 *           element a local one-element buffer is used for the call.
 *   rhigh   highest radix bit; a negative value selects key_radix_high.
 *   rlow    lowest radix bit; a negative value selects key_radix_low.
 *   rwidth  radix width; a value <= 0 selects sort_radix_width_default.
 *           The width passed on is capped at sort_radix_width_max.
 *
 * The finalize flag handed to rs_rec_af starts at 1, or at 0 when
 * insertsort_finalize_adaptive is defined. With insertsort_finalize defined,
 * rs_rec_insertsort_af runs afterwards if sort_radix_threshold_rec > 1 and
 * the flag is still set.
 *
 * Returns -1 if s is NULL, otherwise 0.
 */
slint sort_radix_af(elements_t *s, elements_t *sx, slint rhigh, slint rlow, slint rwidth) /* sl_proto, sl_func sort_radix_af */
{
  elements_t local_sx;
  int local_sx_used = 0;

  slint finalize = 1;

#ifdef insertsort_finalize_adaptive
  finalize = 0;
#endif /* insertsort_finalize_adaptive */

  if (s == NULL) return -1;
  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_radix);

  /* Provide our own scratch element when the caller gave none. */
  if (sx == NULL || sx->size < 1)
  {
    sx = &local_sx;
    elements_alloc(sx, 1, SLCM_ALL);
    local_sx_used = 1;
  }

  /* Substitute defaults for unspecified radix parameters. */
  if (rhigh < 0) rhigh = key_radix_high;
  if (rlow < 0) rlow = key_radix_low;
  if (rwidth <= 0) rwidth = sort_radix_width_default;

  rs_rec_af(s, sx, rhigh, rlow, xmin(rwidth, sort_radix_width_max), &finalize);

#ifdef insertsort_finalize
  if (sort_radix_threshold_rec > 1 && finalize) rs_rec_insertsort_af(s, sx, rhigh, rlow);
#endif /* insertsort_finalize */

  /* Release the scratch only if it was allocated here. */
  if (local_sx_used) elements_free(sx);

  rti_tstop(rti_tid_sort_radix);

  return 0;
}
Ejemplo n.º 17
0
//------------------------------------------------------------------------------------------------------------------------------------
// called when we want to draw the 3D data in our app.
//------------------------------------------------------------------------------------------------------------------------------------
void draw3D()
{
	const float DEG_TO_RAD = PI / 180.0f;
	const Vec3 xAxis(1.0f, 0, 0);
	const Vec3 yAxis(0, 1.0f, 0);

	translate(0, 0, -g_zoom);
	translate(g_tx, g_ty, 0);
	rotate(g_rotx * DEG_TO_RAD, xAxis);
	rotate(g_roty * DEG_TO_RAD, yAxis);

	// draw the grid on the floor
	setColour(0.25f, 0.25f, 0.25f);
	for(float i = -10.0f; i <= 10.1f; i += 1.0f)
	{
		Vec3 zmin(i, 0, -10);
		Vec3 zmax(i, 0,  10);
		Vec3 xmin(-10, 0, i);
		Vec3 xmax(10, 0, i);
		drawLine(xmin, xmax);
		drawLine(zmin, zmax);
	}
}
Ejemplo n.º 18
0
 /// Builds a box whose coordinates are this box's coordinates scaled by the
 /// given width (x) and height (y); the scaled maximum on each axis is then
 /// reduced by 1.
 nervana::boundingbox::box unnormalize(float width, float height)
 {
     const auto left   = xmin() * width;
     const auto top    = ymin() * height;
     const auto right  = xmax() * width - 1;
     const auto bottom = ymax() * height - 1;
     return nervana::boundingbox::box(left, top, right, bottom);
 }
Ejemplo n.º 19
0
/*
 * Command-line driver: runs find_triple_64 for every index in a range.
 *
 * Options:
 *   -i <n>      first index (default 1)
 *   -j <n>      last index (default N)
 *   -f sin|tan  function pair to use: sin selects mpfr_sin/mpfr_cos,
 *               tan selects mpfr_tan/mpfr_cot. Required.
 *
 * Both indices are clamped to [1, N]. The command line is echoed first.
 * Returns 1 on a bad option value or a missing -f, 0 otherwise.
 */
int main(int argc, char **argv)
{
        int c = 0;
        long i_start_arg = 1;
        long i_end_arg = N;
        int i_start = 1;
        int i_end = N;
        mpfr_fn sin_fn = 0;
        mpfr_fn cos_fn = 0;

        /* Echo the invocation so the output is self-describing. */
        for (int k = 0; k < argc; ++k) {
                printf("%s ", argv[k]);
        }

        printf("\n");

        while ((c = getopt(argc, argv, "i:j:f:")) != -1) {
                switch (c) {
                case 'i':
                        errno = 0;
                        i_start_arg = strtoll(optarg, 0, 0);

                        if (errno) {
                                fprintf(stderr, "bad start index %s\n", optarg);

                                return 1;
                        }

                        break;
                case 'j':
                        errno = 0;
                        i_end_arg = strtoll(optarg, 0, 0);

                        if (errno) {
                                fprintf(stderr, "bad end index %s\n", optarg);

                                return 1;
                        }

                        break;
                case 'f':

                        if (!strcmp(optarg, "sin")) {
                                sin_fn = mpfr_sin;
                                cos_fn = mpfr_cos;
                        } else if (!strcmp(optarg, "tan")) {
                                sin_fn = mpfr_tan;
                                cos_fn = mpfr_cot;
                        } else {
                                fprintf(stderr, "unknown function %s\n",
                                        optarg);

                                return 1;
                        }

                        break;
                default:
                        usage();
                        break;
                }
        }

        /*
         * Clamp the start index. The upper-bound test must look at the start
         * value itself; it previously tested i_end_arg, so an oversized -i
         * slipped through unclamped and was then narrowed to int.
         */
        if (i_start_arg <= 0 ||
            i_start_arg > N) {
                printf("truncating start to (0, %d]\n", N);
                i_start_arg = xmin(xmax(i_start_arg, 1), N);
        }

        if (i_end_arg <= 0 ||
            i_end_arg > N) {
                printf("truncating end to (0, %d]\n", N);
                i_end_arg = xmin(xmax(i_end_arg, 1), N);
        }

        /* Both values are within [1, N] here, so they fit in an int. */
        i_start = i_start_arg;
        i_end = i_end_arg;

        if (!sin_fn ||
            !cos_fn) {
                fprintf(stderr, "-f required\n");

                return 1;
        }

        for (int i = i_start; i <= i_end; ++i) {
                if (find_triple_64(i, 11, 20, sin_fn, cos_fn) < 0) {
                        /*
                           This indicates you should drop the range
                           limitations on r, re-run, and come back
                           in a week.
                         */
                        printf("CANNOT FIND SUITABLE CANDIDATE FOR i = %03d\n",
                               i);
                }
        }

        return 0;
}
Ejemplo n.º 20
0
/*
 * Decides how much of one bin is taken, and books that amount.
 *
 *   bm      binning state; only bm->doweights is read (and only when
 *           elem_weight is defined).
 *   bin     the bin being finalized (bin->s.size elements, bin->weight).
 *   dcw     the count or weight still wanted from this bin.
 *   lc_min  minimum / lc_max maximum number of elements to take
 *           (used in the weighted path only).
 *   lcw     running totals: lcw[0] is increased by the element count taken,
 *           lcw[1] by the weight taken (weighted path only).
 *   sp, s   sp->displs[s] is increased by the element count taken.
 *
 * Weighted path (elem_weight defined and bm->doweights set): the whole bin is
 * taken if it has at most lc_min elements, or if dcw covers its weight and it
 * has at most lc_max elements. Otherwise elements are taken one at a time
 * until the weight budget dcw would go negative or lc_max would be exceeded
 * (neither is checked for the first lc_min elements), and 1 is returned.
 *
 * Unweighted path: min(dcw, bin->s.size) elements are taken and the return
 * value is whether that count reached dcw.
 */
slint_t binning_radix_finalize(binning_t *bm, bin_t *bin, slweight_t dcw, slint_t lc_min, slint_t lc_max, slweight_t *lcw, splitter_t *sp, slint_t s) /* sl_proto, sl_func binning_radix_finalize */
{
  slint_t lc, r;
#ifdef elem_weight
  elements_t xi, end;
  slweight_t lw;
#endif


  SL_TRACE_IF(BR_TRACE_IF, "bin size: %" slint_fmt ", dcw = %" slweight_fmt ", lc: %" slint_fmt " - %" slint_fmt ", lcw[0] = %" slweight_fmt, bin->s.size, dcw, lc_min, lc_max, lcw[0]);
#ifdef elem_weight
  if (bm->doweights)
    SL_TRACE_IF(BR_TRACE_IF, "bin weight: %" slweight_fmt ", dcw = %" slweight_fmt ", lc: %" slint_fmt " - %" slint_fmt ", lcw[1] = %" slweight_fmt, bin->weight, dcw, lc_min, lc_max, lcw[1]);
#endif

  r = 0;

#ifdef elem_weight
  if (bm->doweights)
  {
    lc = 0;
    lw = 0.0;

    /* the bin can be taken as a whole */
    if (bin->s.size <= lc_min || (dcw >= bin->weight && bin->s.size <= lc_max))
    {
      lc = bin->s.size;
      lw = bin->weight;

    } else
    {
      if (0 < lc_max)
      {
        elem_assign_at(&bin->s, bin->s.size, &end);

        /* lw counts down the remaining weight budget while scanning */
        lw = dcw;

        for (elem_assign(&bin->s, &xi); xi.keys < end.keys; elem_inc(&xi))
        {
          ++lc;
          lw -= elem_weight(&xi, 0);
        
          /* the first lc_min elements are taken unconditionally */
          if (lc <= lc_min) continue;

          /* budget exhausted or too many elements: undo the last step and stop */
          if (lw < 0.0 || lc > lc_max)
          {
            lw += elem_weight(&xi, 0);
            --lc;
            break;
          }
        }
      
        /* turn the remaining budget into the weight actually taken */
        lw = dcw - lw;
      }

      r = 1;
    }

  } else
#endif
  {
    lc = xmin(dcw, bin->s.size);
    
    r = (lc >= (slint_t) dcw);
  }

  lcw[0] += lc;
  SL_TRACE_IF(BR_TRACE_IF, "lcw[0] = %" slweight_fmt " + %" slint_fmt " = %" slweight_fmt, lcw[0] - lc, lc, lcw[0]);
#ifdef elem_weight
  if (bm->doweights)
  {
    lcw[1] += lw;
    SL_TRACE_IF(BR_TRACE_IF, "lcw[1] = %" slweight_fmt " + %" slweight_fmt " = %" slweight_fmt, lcw[1] - lw, lw, lcw[1]);
  }
#endif

  sp->displs[s] += lc;

  SL_TRACE_IF(BR_TRACE_IF, "displs[%" slint_fmt "] += %" slint_fmt " = %d", s, lc, sp->displs[s]);

  return r;
}
Ejemplo n.º 21
0
// Fills vertices.storage / vertices.sorted from `points`, and returns the
// bounding rectangle of the points visited.
//
// Each stored vertex gets the point's coordinates converted with
// FloatToQ27Dot5, plus flags describing whether the edge before it and the
// edge after it start, end, or run horizontally at that vertex (decided by
// comparing the previous, current and next y). Consecutive points that map
// to the same fixed-point position are skipped. *maxActiveEdges is reset to 0
// and then increased by 2 for every vertex that has a starting edge and no
// ending edge. Finally vertices.nPoints is set to the number of vertices
// actually stored and the `sorted` pointer array is sorted with
// compareVertex.
//
// NOTE(review): the loop reads points[i+1] for i up to vertices.nPoints-1, so
// `points` must hold at least vertices.nPoints+1 entries -- presumably the
// caller appends the closing point; confirm against the caller.
QRectF QTessellatorPrivate::collectAndSortVertices(const QPointF *points, int *maxActiveEdges)
{
    *maxActiveEdges = 0;
    Vertex *v = vertices.storage;
    Vertex **vv = vertices.sorted;

    // running bounding box, seeded with the first point
    qreal xmin(points[0].x());
    qreal xmax(points[0].x());
    qreal ymin(points[0].y());
    qreal ymax(points[0].y());

    // collect vertex data
    Q27Dot5 y_prev = FloatToQ27Dot5(points[vertices.nPoints-1].y());
    Q27Dot5 x_next = FloatToQ27Dot5(points[0].x());
    Q27Dot5 y_next = FloatToQ27Dot5(points[0].y());
    int j = 0;  // number of vertices stored so far
    int i = 0;  // index of the input point being processed
    while (i < vertices.nPoints) {
        Q27Dot5 y_curr = y_next;

        *vv = v;

        v->x = x_next;
        v->y = y_next;
        v->flags = 0;

    next_point:

        // extend the bounding box by the following point
        xmin = qMin(xmin, points[i+1].x());
        xmax = qMax(xmax, points[i+1].x());
        ymin = qMin(ymin, points[i+1].y());
        ymax = qMax(ymax, points[i+1].y());

        y_next = FloatToQ27Dot5(points[i+1].y());
        x_next = FloatToQ27Dot5(points[i+1].x());

        // skip vertices on top of each other
        if (v->x == x_next && v->y == y_next) {
            ++i;
            if (i < vertices.nPoints)
                goto next_point;
            // Ran out of points while skipping duplicates: the current vertex
            // is dropped, and the "before" flags of the very first vertex are
            // recomputed from y_prev / y_curr before leaving the loop.
            Vertex *v0 = vertices.storage;
            v0->flags &= ~(LineBeforeStarts|LineBeforeEnds|LineBeforeHorizontal);
            if (y_prev < y_curr)
                v0->flags |= LineBeforeEnds;
            else if (y_prev > y_curr)
                v0->flags |= LineBeforeStarts;
            else
                v0->flags |= LineBeforeHorizontal;
            if ((v0->flags & (LineBeforeStarts|LineAfterStarts))
                && !(v0->flags & (LineAfterEnds|LineBeforeEnds)))
                *maxActiveEdges += 2;
            break;
        }

        // classify the incoming edge (previous -> current)
        if (y_prev < y_curr)
            v->flags |= LineBeforeEnds;
        else if (y_prev > y_curr)
            v->flags |= LineBeforeStarts;
        else
            v->flags |= LineBeforeHorizontal;


        // classify the outgoing edge (current -> next)
        if (y_curr < y_next)
            v->flags |= LineAfterStarts;
        else if (y_curr > y_next)
            v->flags |= LineAfterEnds;
        else
            v->flags |= LineAfterHorizontal;
        // ### could probably get better limit by looping over sorted list and counting down on ending edges
        if ((v->flags & (LineBeforeStarts|LineAfterStarts))
            && !(v->flags & (LineAfterEnds|LineBeforeEnds)))
            *maxActiveEdges += 2;
        y_prev = y_curr;
        ++v;
        ++vv;
        ++j;
        ++i;
    }
    // only the vertices actually stored count from here on
    vertices.nPoints = j;

    QDEBUG() << "maxActiveEdges=" << *maxActiveEdges;
    vv = vertices.sorted;
    qSort(vv, vv + vertices.nPoints, compareVertex);

    return QRectF(xmin, ymin, xmax-xmin, ymax-ymin);
}
Ejemplo n.º 22
0
/*
 * Searches near xi = pi*i/(4N) for a double whose cos_fn and sin_fn values
 * both round to double with unusually small rounding error, and prints the
 * best one found as a (xi, cos, sin) triple of 64-bit patterns.
 *
 * Candidates are visited by stepping the bit pattern of the starting double
 * by r = 0, +1, -1, +2, -2, ... A candidate qualifies when, for both
 * functions, the exponent of the rounded value exceeds the exponent of the
 * rounding residual by at least 52 + min_leeway; its leeway is the smaller
 * of the two surpluses. The search stops at the first candidate whose leeway
 * reaches perfect_leeway, or when r exceeds 1 << 29, or when the candidate
 * passes the starting point for i + 1.
 *
 * Returns >= zero iff successful: 0 when a candidate with leeway strictly
 * greater than min_leeway was printed, -1 otherwise (including when a
 * residual exponent of -1024 is encountered).
 */
static int find_triple_64(int i, int min_leeway, int perfect_leeway, mpfr_fn
                          sin_fn, mpfr_fn cos_fn)
{
        /*
           Using mpfr is not entirely overkill for this; [Lut95]
           includes PASCAL fragments that use almost entirely integer
           arithmetic... but the error term in that only handles
           up to 13 extra bits of zeroes or so. We proudly boast
           at least 16 bits of extra zeroes in all cases.
         */
        mpfr_t xi;
        mpfr_t xip1;
        mpfr_t cos;
        mpfr_t sin;
        double xip1_d;
        double t;
        uint64_t sin_u;
        uint64_t cos_u;
        int e1;
        int e2;
        uint64_t xip1_u;
        double xi_initial;
        uint64_t xi_initial_u;
        double xi_current;
        uint64_t xi_current_u;
        long int r = 0;
        long int best_r = 0;
        int sgn = 1;
        int ml = min_leeway;
        int best_l = 0;
        /* Zero-initialised so the final printf never reads indeterminate
           values, whatever min_leeway the caller passes. */
        uint64_t best_xi_u = 0;
        uint64_t best_sin_u = 0;
        uint64_t best_cos_u = 0;
        time_t start;
        time_t end;
        int ret = -1;

        start = time(0);
        mpfr_init2(xi, 100);
        mpfr_init2(xip1, 100);
        mpfr_init2(cos, 100);
        mpfr_init2(sin, 100);

        /* start out at xi = πi/(4N) */
        mpfr_const_pi(xi, MPFR_RNDN);
        mpfr_mul_si(xip1, xi, (long int) (i + 1), MPFR_RNDN);
        mpfr_mul_si(xi, xi, (long int) i, MPFR_RNDN);
        mpfr_div_si(xi, xi, (long int) 4 * N, MPFR_RNDN);
        mpfr_div_si(xip1, xip1, (long int) 4 * N, MPFR_RNDN);
        xip1_d = mpfr_get_d(xip1, MPFR_RNDN);
        xip1_u = FLT64_TO_UINT64(xip1_d);
        xi_initial = mpfr_get_d(xi, MPFR_RNDN);
        xi_initial_u = FLT64_TO_UINT64(xi_initial);

        while (1) {
                xi_current_u = xi_initial_u + (sgn * r);
                xi_current = UINT64_TO_FLT64(xi_current_u);
                mpfr_set_d(xi, xi_current, MPFR_RNDN);

                /* Test if cos(xi) has enough zeroes */
                cos_fn(cos, xi, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                cos_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                /* what is left after rounding to double */
                mpfr_sub_d(cos, cos, t, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {

                        /* Damn; this is too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                ml = xmax(min_leeway, e1 - e2 - 52);

                /* Test if sin(xi) has enough zeroes */
                sin_fn(sin, xi, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                sin_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                mpfr_sub_d(sin, sin, t, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {

                        /* Damn; this is too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                ml = xmin(ml, e1 - e2 - 52);

                /* Hurrah, this is valid */
                if (ml > best_l) {
                        best_l = ml;
                        best_xi_u = xi_current_u;
                        best_cos_u = cos_u;
                        best_sin_u = sin_u;
                        best_r = sgn * r;

                        /* If this is super-good, don't bother finding more */
                        if (best_l >= perfect_leeway) {
                                break;
                        }
                }

inc:

                /* Increment */
                sgn *= -1;

                if (sgn < 0) {
                        r++;
                } else if (r > (1 << 29) ||
                           xi_current_u > xip1_u) {
                        /*
                           This is taking too long, give up looking
                           for perfection and take the best we've
                           got. A sweep of 1 << 28 finishes in ~60
                           hrs on my personal machine as I write
                           this.
                         */
                        break;
                }
        }

        end = time(0);

        if (best_l > min_leeway) {
                printf(
                        "(%#018lx, %#018lx, %#018lx), /* i = %03d, l = %02d, r = %010ld, t = %ld */ \n",
                        best_xi_u, best_cos_u, best_sin_u, i, best_l, best_r,
                        end -
                        start);

                ret = 0;
        }

done:

        /* Single exit: release the mpfr variables on every path (they were
           previously never cleared, leaking on each call). */
        mpfr_clear(xi);
        mpfr_clear(xip1);
        mpfr_clear(cos);
        mpfr_clear(sin);

        return ret;
}
Ejemplo n.º 23
0
//---------------------------------------------------------
DVec& CS_PCG::solve(const DVec& rhs, double tol, int maxit)
//---------------------------------------------------------
{
  // Use a preconditioned Conjugate Gradient method 
  // to return an iterative solution to: x = A\rhs.
  //
  // 1. permute rhs
  // 2. solve using pcg
  // 3. unpermute result

#if (APPLY_PERM)
  m_permute = true;
#else
  m_permute = false;
#endif


  // check system
  if (!m_factor || !L.ok())  { umERROR("CS_PCG::solve", "cholinc factor not ready."); }

  // store user args
  m_tol=tol;  m_maxit=maxit;

  // store permuted rhs in pb
  int n=rhs.size(); pb.resize(n);
  if (m_permute) {
    CS_ipvec(this->pinv, rhs, pb, n);   // pb = P*rhs
  } else {
    pb = rhs;                           // pb = rhs
  }

  if (!pb.ok())       { umERROR("CS_PCG::solve", "failed to permute rhs"); }
  if (this->L.n != n) { umERROR("CS_PCG::solve", "rhs not compatible"); }

  //---------------------------------------------
  // When used during time-dependent simulations,
  // set the initial solution vector to zero, but
  // reuse previous solution on subsequent calls.
  //---------------------------------------------
  
  // work with permuted px = P(x), 
  // return unpermuted   x = P(px),
  
  px.resize(n, false);  // false -> don't bother initialising
  if (!m_oldsol) {
    px.fill(0.0);       // initial guess is zero vector
     x.resize(n);       // allocate return vector
  } 
  else 
  {
    if (m_permute) {
      // reapply permutation and use old solution as inital guess
      CS_ipvec(this->pinv, x, px, n);   // px = P*x
    } else {
      px = x;                           // px = x
    }
  }

  if (!px.ok() || !x.ok()) { umERROR("CS_PCG::solve", "out of memory"); }

  // check parameters
  if (m_tol<=0.0) { m_tol = 1e-6;           umWARNING("pcg", "resetting tol to %g  (was %g).", m_tol,   tol); }
  if (m_maxit>n)  { m_maxit=std::min(n,20); umWARNING("pcg", "setting maxit to %d  (was %d).", m_maxit, maxit); }

  // Check for all zero right hand side vector => all zero solution
  double n2b = pb.norm2();          // Norm of rhs vector, b
  if (0.0 == n2b) {                 // if rhs vector is all zeros
    x.resize(n,true,0.0);           // then  solution is all zeros
    m_flag   = 0;                   // a valid solution has been obtained
    m_relres = 0;                   // the relative residual is actually 0/0
    m_iter   = 0;                   // no iterations need be performed
    m_resvec = 0;                   // resvec(1) = norm(b-A*x) = norm(0)
  //if (m_verbose) {itermsg("pcg", m_tol,m_maxit,0,m_flag,m_iter,NaN);}
    return x;
  }

  // local variables
  DVec xmin("xmin"), r("r"), z("z"), p("p"), q("q"), b_Ax("b-Ax");
  double tolb=0.0,normr=0.0,normrmin=0.0,rho=0.0,rho1=0.0,pq=0.0;
  double alpha=0.0,beta=0.0;  int i=0, imin=0;
  // IVec stagtest(n, "stagtest"), ind("ind");

  //-------------------------------------------------------
  // Set up for pcg method
  //-------------------------------------------------------
  m_flag = 1;
  imin = 0;                   // iteration at which xmin was computed
  xmin = px;                  // iterate which has minimal residual so far
  tolb = m_tol * n2b;         // relative tolerance
  r = pb - A*px;
  normr = r.norm2();          // norm of residual

  if (normr <= tolb) {
    m_flag   = 0;             // initial guess "x0" was good enough.
    m_relres = normr / n2b;   // since we have made no changes to x,
    m_iter   = 0;             // just return old x without permuting
    m_resvec = normr;
  //if (m_verbose) {itermsg("pcg", m_tol,m_maxit,0,m_flag,m_iter,relres);}
  //CS_pvec(this->pinv, px, x, n); // unpermute solution
    m_oldsol = true;
    return x;
  }

  m_resvec.resize(m_maxit+1);   // Preallocate vector for norm of residuals
  m_resvec(1) = normr;          // resvec(1) = norm(b-A*x0)
  normrmin    = normr;          // Norm of minimum residual
  rho         = 1.0;
  bool stag   = false;          // stagnation: flag failure to converge
  bool bOk    = true;          // stagnation: flag failure to converge

  //-------------------------------------------------------
  // loop for maxit iters, unless convergence or failure:
  //-------------------------------------------------------
  for (i=1; i<=m_maxit; ++i) 
  {
    // apply cholinc preconditioner
    z = solve_LLT(r);       // z = LLT\r
  //bOk = solve_LLT(r,z);   // z = LLT\r
    if (isInf(z)) 
  //if (!bOk) 
    {
      m_flag = 2; break;
    }

    rho1=rho;  rho=inner(r,z);

    if ((0.0==rho) || isinf(rho)) {
      m_flag = 4; break;
    }

    if (1 == i) {
      p = z;
    } else {
      beta = rho / rho1;
      if ((0.0 == beta) || isinf(beta)) {
        m_flag = 4; break;
      }
    //p = z + beta * p;
      p*=beta;  p+=z;
    }

    q = A*p;
    pq = inner(p,q);

    if ((pq <= 0) || isinf(pq)) {
      m_flag = 4; break;
    } else {
      alpha = rho / pq;
    }

    if (isinf(alpha)) {
      m_flag = 4; break;
    }

    // Check for stagnation of the method
    if (0.0 == alpha) { stag = true; }

#if (0)
    //#####################################################
    // TODO: Check for stagnation of the method
    //#####################################################
    if (!stag) {
      stagtest.fill(0);
      ind = find(x, '!', 0.0);
      stagtest(ind) = dd(p(ind), x(ind));
      stagtest(~ind & p ~= 0) = Inf;
      if (abs(alpha)*norm(stagtest,inf) < eps) {stag = true;}
    }
    //#####################################################
#endif

    // form new iterate
    px += alpha * p;
    b_Ax = pb - A*px;
    normr = b_Ax.norm2();
    m_resvec(i+1) = normr;

    // check for convergence
    if (normr <= tolb) { 
      m_flag = 0; m_iter = i;

#if 1
      umLOG(1, " ==> CS_PCG sol: %3d %15.12lf\n", i, normr);
#endif

      break; 
    }

    // check for stagnation
    if (stag) { 
      m_flag = 3;
      break; 
    }

    // update minimal norm quantities
    if (normr < normrmin) { 
      normrmin = normr; xmin = px; imin = i; 
    }

    r -= alpha * q;

#if (SHOW_ITER_CONVERG)
    umLOG(1, " ==> CS_PCG sol: %3d %15.12lf\n", i, normr);
#endif

  } // for i=1:m_maxit
slint_t mpi_select_exact_radix_fixed(elements_t *s, slint_t nelements, slint_t nparts, partcond_t *pconds, slint_t rhigh, slint_t rlow, slint_t rwidth, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_select_exact_radix_fixed */
{
  slkey_pure_t max_nclasses, nclasses, bit_mask;
  slkey_pure_t k, l;

  typedef struct {
    slint_t count_min, count_max;
    slint_t count_low, count_hig;
#ifdef elem_weight
    double weight_min, weight_max;
    double weight_low, weight_hig;
#endif
  } mmlh_t;

  mmlh_t mmlh[nparts];

  const slint_t max_nborders = nparts - 1;
  slint_t border_lo, border_hi, nborders_removed;
  slint_t borders[max_nborders], border_areas[max_nborders];

#define MIN_LE  0
#define MIN_RI  1
#define MAX_LE  2
#define MAX_RI  3

  struct {
    slint_t update;
    slint_t crange[2], cmmlr[4];
#ifdef elem_weight
    double wrange[2], wmmlr[4];
#endif
  } border_infos_[1 + max_nborders + 1], *border_infos = border_infos_ + 1, border_info_old;

  const slint_t max_nareas = max_nborders;
  slint_t nareas, nareas_new;
  elements_t areas0[max_nareas * nelements], areas1[max_nareas * nelements], *areas, *areas_new;

  slint_t *area_counts, *current_counts;
  double *local_counts, *global_counts;
#ifdef elem_weight
  double *local_weights, *global_weights, *current_weights;
#endif

  slint_t current_cmm[2];
#ifdef elem_weight
  double current_wmm[2];
#endif

  slint_t final_areas[max_nborders * nelements];
  double final_locals[NCONDS * max_nborders], *final_globals;

  slint_t current_width;
  slint_t round, direction, refine, finalize;
  slint_t last_new_area, last_new_class;

  slint_t lc, lcs, gc, gcs, lcv[nelements], lcsv[nelements];
#ifdef elem_weight
  double lw, gw, lws, gws;
  double mw, dw;
  double mcw[4];
#else
  slint_t mc, dc;
#endif

  slint_t i, j;

  elements_t xi, end;

#ifdef VERIFY
  slint_t v;
#endif


  SL_TRACE_IF(DEBUG_OR_NOT, "starting mpi_select_exact_radix");

  /* sl_tid rti_tid_mpi_select_exact_radix rti_tid_mpi_select_exact_radix_sync */

  rti_treset(rti_tid_mpi_select_exact_radix_while);                   /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_count);             /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_allreduce);         /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_round1);            /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_round1_allgather);  /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_exscan);            /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check);             /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_pre);         /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_classes);     /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_final);       /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_post);        /* sl_tid */

  rti_tstart(rti_tid_mpi_select_exact_radix_sync);
#ifdef SYNC_ON_INIT
  MPI_Barrier(comm);
#endif
  rti_tstop(rti_tid_mpi_select_exact_radix_sync);

#ifdef VERIFY
  v = elements_validate_order(s, 1);
  
  SL_TRACE_IF(DEBUG_OR_NOT, "elements order: %s (%" slint_fmt ")", (v > 0)?"FAILED":"SUCCESS", v);
#endif

  rti_tstart(rti_tid_mpi_select_exact_radix);

  if (rhigh < 0) rhigh = key_radix_high;
  if (rlow < 0) rlow = key_radix_low;
  if (rwidth < 0) rwidth = sort_radix_width_default;
  
  max_nclasses = powof2_typed(rwidth, slkey_pure_t);

/*  SL_TRACE_IF(DEBUG_OR_NOT, "alloc area_counts: %" slint_fmt " * %d", max_nareas * nelements * max_nclasses, sizeof(slint_t));
  SL_TRACE_IF(DEBUG_OR_NOT, "alloc local_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));
  SL_TRACE_IF(DEBUG_OR_NOT, "alloc global_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));*/

  area_counts = sl_alloc(max_nareas * nelements * max_nclasses, sizeof(slint_t));
  local_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double));
  global_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double));

  /* init areas (first area = all elements) */
  areas = areas0;
  areas_new = areas1;

  nareas = 1;
  for (j = 0; j < nelements; ++j) elem_assign(&s[j], &areas[0 * nelements + j]);

  /* init parts */
  border_lo = 0;
  border_hi = max_nborders - 1;
  for (i = border_lo; i <= border_hi; ++i)
  {
    borders[i] = i;
    border_areas[i] = 0;
  }

  /* init sdispls */
  for (i = 0; i < nparts; ++i)
  for (j = 0; j < nelements; ++j) sdispls[i * nelements + j] = 0;

  rti_tstart(rti_tid_mpi_select_exact_radix_while);

  round = 0;
  while (border_lo <= border_hi)
  {
    ++round;

    /* setup bitmask */
    current_width = xmin(rwidth, rhigh - rlow + 1);
    rhigh -= (current_width > 0)?current_width - 1:rhigh;

    nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1;
    bit_mask = nclasses - 1;

    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", rhigh: %" slint_fmt ", current_width: %" slint_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses);

    finalize = (current_width <= 0);

    if (!finalize || round == 1)
    {
#ifdef elem_weight
      /* init weight counters */
      local_weights = local_counts + (nareas * nclasses) + nareas;
      global_weights = global_counts + (nareas * nclasses) + nareas;
#endif

      /* zero all counters */
      for (i = 0; i < nareas; ++i)
      for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = 
#ifdef elem_weight
        local_weights[i * nclasses + k] = 
#endif
        0.0;

      rti_tstart(rti_tid_mpi_select_exact_radix_while_count);

      /* for every area */
      for (i = 0; i < nareas; ++i)
      {
        local_counts[nareas * nclasses + i] = 0;
#ifdef elem_weight
        local_weights[nareas * nclasses + i] = 0.0;
#endif

        /* for every list of elements */
        for (j = 0; j < nelements; ++j)
        {
          SL_TRACE_IF(DEBUG_OR_NOT, "area %" slint_fmt ",%" slint_fmt ": size = %" slint_fmt, i, j, areas[i * nelements + j].size);

          elem_assign_at(&areas[i * nelements + j], areas[i * nelements + j].size, &end);
          
          current_counts = area_counts + ((i * nelements + j) * nclasses);
#ifdef elem_weight
          current_weights = local_weights + (i * nclasses);
#endif

          for (k = 0; k < nclasses; ++k) current_counts[k] = 0;

          if (nclasses > 1)
          {
            /* counts and weights in every class */
            for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi))
            {
              k = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);
              current_counts[k] += 1;
/*              SL_TRACE_IF(DEBUG_OR_NOT, "key %" sl_key_pure_type_fmt " goes to bin %"  sl_key_pure_type_fmt, key_purify(*xi.keys), k);*/
#ifdef elem_weight
              current_weights[k] += elem_weight(&xi, 0);
#endif
            }

          } else
          {
            /* total counts and weights */
            current_counts[0] = areas[i * nelements + j].size;

#ifdef elem_weight
            for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) current_weights[0] += elem_weight(&xi, 0);
#endif
          }
          
          for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] += current_counts[k];

          /* total counts and weights in this area */
          local_counts[nareas * nclasses + i] += areas[i * nelements + j].size;
#ifdef elem_weight
          for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += current_weights[k];
#endif
        }

        SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ": counts =", " %f", k, nclasses, (&local_counts[i * nclasses]), i);
      }

      rti_tstop(rti_tid_mpi_select_exact_radix_while_count);

      --rhigh;

      SL_TRACE_IF(DEBUG_OR_NOT, "all-reducing %" slint_fmt " doubles", (slint_t) (NCONDS * (nareas * nclasses + nareas)));

      rti_tstart(rti_tid_mpi_select_exact_radix_while_allreduce);

      /* create global counts and weights */
#ifdef MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD
      if (size >= MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD)
      {
        MPI_Reduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm);
        MPI_Bcast(global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm);

      } else
#endif
        MPI_Allreduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm);

      rti_tstop(rti_tid_mpi_select_exact_radix_while_allreduce);
    }

    /* do initializations */
    if (round == 1)
    {
      rti_tstart(rti_tid_mpi_select_exact_radix_while_round1);

      for (i = 0; i < nparts; ++i)
      {
        /* truncate counts, set default values and determine local (count/weight) limits */
        init_partconds(1, &pconds[i], nparts, global_counts[nareas * nclasses + 0],
#ifdef elem_weight
          global_weights[nareas * nclasses + 0]
#else
          0
#endif
          );

        mmlh[i].count_min = pconds[i].count_min;
        mmlh[i].count_max = pconds[i].count_max;
        mmlh[i].count_low = pconds[i].count_low;
        mmlh[i].count_hig = pconds[i].count_high;

#ifdef elem_weight
        mmlh[i].weight_min = pconds[i].weight_min;
        mmlh[i].weight_max = pconds[i].weight_max;
        mmlh[i].weight_low = pconds[i].weight_low;
        mmlh[i].weight_hig = pconds[i].weight_high;
#endif
      }

      /* init lowest part (sentinel) */
      border_infos[border_lo - 1].update = 0;
      border_infos[border_lo - 1].crange[0] = 0;
      border_infos[border_lo - 1].crange[1] = 0;
      border_infos[border_lo - 1].cmmlr[MIN_LE] = border_infos[border_lo - 1].cmmlr[MAX_LE] = 0;
      border_infos[border_lo - 1].cmmlr[MIN_RI] = border_infos[border_lo - 1].cmmlr[MAX_RI] = 0;

      SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_lo - 1,
        border_infos[border_lo - 1].cmmlr[MIN_LE], border_infos[border_lo - 1].cmmlr[MAX_LE], border_infos[border_lo - 1].cmmlr[MIN_RI], border_infos[border_lo - 1].cmmlr[MAX_RI]);

#ifdef elem_weight
      border_infos[border_lo - 1].wrange[0] = 0.0;
      border_infos[border_lo - 1].wrange[1] = 0.0;
      border_infos[border_lo - 1].wmmlr[MIN_LE] = border_infos[border_lo - 1].wmmlr[MAX_LE] = 0.0;
      border_infos[border_lo - 1].wmmlr[MIN_RI] = border_infos[border_lo - 1].wmmlr[MAX_RI] = 0.0;

      SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_lo - 1,
        border_infos[border_lo - 1].wmmlr[MIN_LE], border_infos[border_lo - 1].wmmlr[MAX_LE], border_infos[border_lo - 1].wmmlr[MIN_RI], border_infos[border_lo - 1].wmmlr[MAX_RI]);
#endif

      /* init highest part (sentinel) */
      border_infos[border_hi + 1].update = 0;
      border_infos[border_hi + 1].crange[0] = global_counts[nareas * nclasses + 0];
      border_infos[border_hi + 1].crange[1] = global_counts[nareas * nclasses + 0];
      border_infos[border_hi + 1].cmmlr[MIN_LE] = border_infos[border_hi + 1].cmmlr[MAX_LE] = 0;
      border_infos[border_hi + 1].cmmlr[MIN_RI] = border_infos[border_hi + 1].cmmlr[MAX_RI] = global_counts[nareas * nclasses + 0];

      SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_hi + 1,
        border_infos[border_hi + 1].cmmlr[MIN_LE], border_infos[border_hi + 1].cmmlr[MAX_LE], border_infos[border_hi + 1].cmmlr[MIN_RI], border_infos[border_hi + 1].cmmlr[MAX_RI]);

#ifdef elem_weight
      border_infos[border_hi + 1].wrange[0] = global_weights[nareas * nclasses + 0];
      border_infos[border_hi + 1].wrange[1] = global_weights[nareas * nclasses + 0];
      border_infos[border_hi + 1].wmmlr[MIN_LE] = border_infos[border_hi + 1].wmmlr[MAX_LE] = 0.0;
      border_infos[border_hi + 1].wmmlr[MIN_RI] = border_infos[border_hi + 1].wmmlr[MAX_RI] = global_weights[nareas * nclasses + 0];

      SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_hi + 1,
        border_infos[border_hi + 1].wmmlr[MIN_LE], border_infos[border_hi + 1].wmmlr[MAX_LE], border_infos[border_hi + 1].wmmlr[MIN_RI], border_infos[border_hi + 1].wmmlr[MAX_RI]);
#endif

      /* init regular parts (backwards) */
      for (i = border_hi; i >= border_lo; --i)
      {
        border_infos[borders[i]].update = 1;
        border_infos[borders[i]].crange[0] = 0;
        border_infos[borders[i]].crange[1] = global_counts[nareas * nclasses + 0];
        border_infos[borders[i]].cmmlr[MIN_LE] = -1;
        border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min;
        border_infos[borders[i]].cmmlr[MAX_LE] = -1;
        border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

#ifdef elem_weight
        border_infos[borders[i]].wrange[0] = 0.0;
        border_infos[borders[i]].wrange[1] = global_weights[nareas * nclasses + 0];
        border_infos[borders[i]].wmmlr[MIN_LE] = -1.0;
        border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min;
        border_infos[borders[i]].wmmlr[MAX_LE] = -1.0;
        border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);
#endif

        /* prepare for finalization in the 1st round */
        if (finalize)
        {
          for (j = 0; j < nelements; ++j) final_areas[i * nelements + j] = area_counts[(0 * nelements + j) * nclasses + 0];

          final_locals[NCONDS * i + 0] = local_counts[nareas * nclasses + 0];
#ifdef elem_weight
          final_locals[NCONDS * i + 1] = local_weights[nareas * nclasses + 0];
#endif
        }
      }
      
      /* first direction: forward */
      direction = 1;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_round1);
    }

    /* compute prefixes for finalization */
    if (finalize)
    {
      /* determine number of parts to finalize */
      j = border_hi - border_lo + 1;
    
      SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" slint_fmt " parts", j);

      rti_tstart(rti_tid_mpi_select_exact_radix_while_exscan);

      /* use local_counts to store the global prefix sums */      
      final_globals = local_counts;

      /* create global prefix sums (set rank 0 to zero) */
      MPI_Exscan(&final_locals[NCONDS * border_lo], &final_globals[NCONDS * border_lo], NCONDS * j, MPI_DOUBLE, MPI_SUM, comm);
      if (rank == 0) for (i = border_lo; i <= border_hi; ++i) final_globals[NCONDS * i + 0] = 
#ifdef elem_weight
        final_globals[NCONDS * i + 1] = 
#endif
        0.0;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_exscan);
    }

    /* check all remaining parts */
    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", %s", round, (direction > 0)?"forward":"backward");

    nareas_new = 0;
    last_new_area = last_new_class = -1;
    nborders_removed = 0;

    rti_tstart(rti_tid_mpi_select_exact_radix_while_check);

    i = (direction > 0)?border_lo:border_hi;
    while ((direction > 0)?(i <= border_hi):(i >= border_lo))
    {
      /* check partition borders[i] */
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ": PART: %" slint_fmt ",%" slint_fmt, round, i, borders[i]);

      rti_tstart(rti_tid_mpi_select_exact_radix_while_check_pre);

      /* save to old limits */
      border_info_old = border_infos[borders[i]];

      /* is an update required? */
      if (border_infos[borders[i]].update)
      {
        /* forward */
        if (direction > 0)
        {
          /* init from min/max (always) */
          border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i] - 1].cmmlr[MIN_LE] + mmlh[borders[i]].count_min;
          border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i] - 1].cmmlr[MAX_LE] + mmlh[borders[i]].count_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left]: %" slint_fmt " + %" slint_fmt ", %" slint_fmt " + %" slint_fmt "", i, borders[i],
            border_infos[borders[i] - 1].cmmlr[MIN_LE], mmlh[borders[i]].count_min,
            border_infos[borders[i] - 1].cmmlr[MAX_LE], mmlh[borders[i]].count_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_COUNTS_LH)
          {
            if (border_infos[borders[i]].cmmlr[MIN_LE] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MIN_LE] = mmlh[borders[i] + 1].count_low;
            if (border_infos[borders[i]].cmmlr[MAX_LE] > mmlh[borders[i]    ].count_hig) border_infos[borders[i]].cmmlr[MAX_LE] = mmlh[borders[i]    ].count_hig;
          }

#ifdef elem_weight
          /* init from min/max (always) */
          border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i] - 1].wmmlr[MIN_LE] + mmlh[borders[i]].weight_min;
          border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i] - 1].wmmlr[MAX_LE] + mmlh[borders[i]].weight_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left]: %f + %f, %f + %f", i, borders[i],
            border_infos[borders[i] - 1].wmmlr[MIN_LE], mmlh[borders[i]].weight_min,
            border_infos[borders[i] - 1].wmmlr[MAX_LE], mmlh[borders[i]].weight_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_WEIGHTS_LH)
          {
            if (border_infos[borders[i]].wmmlr[MIN_LE] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MIN_LE] = mmlh[borders[i] + 1].weight_low;
            if (border_infos[borders[i]].wmmlr[MAX_LE] > mmlh[borders[i]    ].weight_hig) border_infos[borders[i]].wmmlr[MAX_LE] = mmlh[borders[i]    ].weight_hig;
          }
#endif
        } else /* backward */
        {
          /* init from min/max (always) */
          border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min;
          border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-right]: %" slint_fmt " - %" slint_fmt ", %" slint_fmt " - %" slint_fmt "", i, borders[i],
            border_infos[borders[i] + 1].cmmlr[MIN_RI], mmlh[borders[i] + 1].count_min,
            border_infos[borders[i] + 1].cmmlr[MAX_RI], mmlh[borders[i] + 1].count_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_COUNTS_LH)
          {
            if (border_infos[borders[i]].cmmlr[MAX_RI] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MAX_RI] = mmlh[borders[i] + 1].count_low;
            if (border_infos[borders[i]].cmmlr[MIN_RI] > mmlh[borders[i]    ].count_hig) border_infos[borders[i]].cmmlr[MIN_RI] = mmlh[borders[i]    ].count_hig;
          }

#ifdef elem_weight
          /* init from min/max (always) */
          border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min;
          border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-right]: %f - %f, %f - %f", i, borders[i],
            border_infos[borders[i] + 1].wmmlr[MIN_RI], mmlh[borders[i] + 1].weight_min,
            border_infos[borders[i] + 1].wmmlr[MAX_RI], mmlh[borders[i] + 1].weight_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_WEIGHTS_LH)
          {
            if (border_infos[borders[i]].wmmlr[MAX_RI] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MAX_RI] = mmlh[borders[i] + 1].weight_low;
            if (border_infos[borders[i]].wmmlr[MIN_RI] > mmlh[borders[i]    ].weight_hig) border_infos[borders[i]].wmmlr[MIN_RI] = mmlh[borders[i]    ].weight_hig;
          }
#endif
        }

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

        /* check against inconsistency */
        if (border_infos[borders[i]].cmmlr[MIN_LE] > border_infos[borders[i]].cmmlr[MIN_RI]) border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i]].cmmlr[MIN_RI] = (border_infos[borders[i]].cmmlr[MIN_LE] + border_infos[borders[i]].cmmlr[MIN_RI]) / 2;
        if (border_infos[borders[i]].cmmlr[MAX_LE] < border_infos[borders[i]].cmmlr[MAX_RI]) border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i]].cmmlr[MAX_RI] = (border_infos[borders[i]].cmmlr[MAX_LE] + border_infos[borders[i]].cmmlr[MAX_RI]) / 2;

#ifdef elem_weight
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);

        /* check against inconsistency */
        if (border_infos[borders[i]].wmmlr[MIN_LE] > border_infos[borders[i]].wmmlr[MIN_RI]) border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i]].wmmlr[MIN_RI] = (border_infos[borders[i]].wmmlr[MIN_LE] + border_infos[borders[i]].wmmlr[MIN_RI]) / 2;
        if (border_infos[borders[i]].wmmlr[MAX_LE] < border_infos[borders[i]].wmmlr[MAX_RI]) border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i]].wmmlr[MAX_RI] = (border_infos[borders[i]].wmmlr[MAX_LE] + border_infos[borders[i]].wmmlr[MAX_RI]) / 2;
#endif
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
        border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": crange: %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]);

      /* select highest min and lowest max */
      current_cmm[0] = xmax(border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_RI]) - border_infos[borders[i]].crange[0];
      current_cmm[1] = xmin(border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI]) - border_infos[borders[i]].crange[0];

      if (rank == 0) SL_ASSERT(current_cmm[0] <= current_cmm[1]);
      
      if (rank == 0) SL_ASSERT(0 <= current_cmm[0]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_count: %" slint_fmt " - %" slint_fmt "", i, borders[i], current_cmm[0], current_cmm[1]);

#ifdef elem_weight
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
        border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": wrange: %f - %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]);

      /* select highest min and lowest max */
      current_wmm[0] = xmax(border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_RI]) - border_infos[borders[i]].wrange[0];
      current_wmm[1] = xmin(border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI]) - border_infos[borders[i]].wrange[0];

      if (rank == 0) SL_ASSERT(current_wmm[0] <= current_wmm[1]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_weight: %f - %f", i, borders[i], current_wmm[0], current_wmm[1]);
#endif

      rti_tstop(rti_tid_mpi_select_exact_radix_while_check_pre);

      /* HIT is the default */
      refine = 0;

      if (!finalize)
      {
        rti_tstart(rti_tid_mpi_select_exact_radix_while_check_classes);

        lcs = gcs = 0;
#ifdef elem_weight
        lws = gws = 0.0;
#endif

        for (k = 0; k < nclasses; ++k)
        {
          lc = local_counts[border_areas[i] * nclasses + k];
          gc = global_counts[border_areas[i] * nclasses + k];

          current_cmm[0] -= gc;
          current_cmm[1] -= gc;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_count: %" slint_fmt " - %" slint_fmt ", lc = %" slint_fmt ", lcs = %" slint_fmt ", gc = %" slint_fmt ", gcs = %" slint_fmt,
            i, borders[i], k, current_cmm[0], current_cmm[1], lc, lcs, gc, gcs);

#ifdef elem_weight
          lw = local_weights[border_areas[i] * nclasses + k];
          gw = global_weights[border_areas[i] * nclasses + k];

          current_wmm[0] -= gw;
          current_wmm[1] -= gw;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_weight: %e - %e", i, borders[i], k, current_wmm[0], current_wmm[1]);
#endif

          /* stop and refine if max count is skipped OR min count AND max weight is skipped */
          if ((current_cmm[1] < 0)
#ifdef elem_weight
            || (current_cmm[0] < 0 && current_wmm[1] < 0.0)
#endif
            )
          {
            refine = 1;
            break;
          }

          lcs += lc;
          gcs += gc;
          gc = 0;

#ifdef elem_weight
          lws += lw;
          gws += gw;
          gw = 0.0;
#endif

          /* if between min/max counts */
          if (current_cmm[0] <= 0 && current_cmm[1] >= 0)
          {
#ifdef elem_weight
            SL_TRACE_IF(DEBUG_OR_NOT, "got to next: %d && %d", (current_cmm[1] > 0), (current_wmm[0] > 0));

            /* go to next if max count not reached AND min weight not reached */
            if (current_cmm[1] > 0 && current_wmm[0] > 0) continue;
#endif

            /* look ahead for a better stop */
            if (k + 1 < nclasses && current_cmm[1] - global_counts[border_areas[i] * nclasses + k + 1] >= 0)
            {
#ifdef elem_weight
              /* continue if weights will improve */
              if (myabs(current_wmm[0] + current_wmm[1]) > myabs(current_wmm[0] + current_wmm[1] - 2 * global_weights[border_areas[i] * nclasses + k + 1])) continue;
#else
              /* continue if counts will improve */
              if (myabs(current_cmm[0] + current_cmm[1]) > myabs(current_cmm[0] + current_cmm[1] - 2 * global_counts[border_areas[i] * nclasses + k + 1])) continue;
#endif
            }

            /* stop */
            break;
          }
        }

        SL_ASSERT_IF((rank == 0), k < nclasses);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": %s k = %" sl_key_pure_type_fmt ", lcs = %" slint_fmt, i, borders[i], (refine)?"REFINE":"HIT", k, lcs);

        /* make sure k is safe (it is used as index later) */
        if (k >= nclasses) k = nclasses - 1;

        /* break the local contribution into contributions for the lists of elements */
        for (j = 0; j < nelements; ++j)
        {
          lcsv[j] = 0;
          for (l = 0; l < k; ++l) lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + l];

          if (refine) lcv[j] = area_counts[((border_areas[i] * nelements + j) * nclasses) + k];
          else
          {
            lcv[j] = 0;
            lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + k];
          }

          lcs -= lcsv[j];
        }

        rti_tstop(rti_tid_mpi_select_exact_radix_while_check_classes);

      } else
      {
        rti_tstart(rti_tid_mpi_select_exact_radix_while_check_final);
        
        k = 0;

#ifdef elem_weight
        /* middle of min/max weight */
        mw = (current_wmm[0] + current_wmm[1]) / 2.0;

        /* min. part of weight to contribute */
        dw = xmax(0, mw - final_globals[NCONDS * i + 1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mw = %e, dw = %e", i, borders[i], mw, dw);
#else
        /* middle of min/max count */
        mc = (current_cmm[0] + current_cmm[1]) / 2;

        /* min. part of count to contribute */
        dc = xmax(0, mc - final_globals[NCONDS * i + 0]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mc = %" slint_fmt ", dc = %" slint_fmt, i, borders[i], mc, dc);
#endif

        /* contribute all? */
        if (
#ifdef elem_weight
          dw >= final_locals[NCONDS * i + 1]
#else
          dc >= final_locals[NCONDS * i + 0]
#endif
        )
        {
          lc = final_locals[NCONDS * i + 0];
#ifdef elem_weight
          lw = final_locals[NCONDS * i + 1];
#endif

        } else
        {
          /* contribute only a part */
#ifdef elem_weight
          lc = 0;

          for (j = 0; j < nelements; ++j)
          {
            elem_assign_at(&areas[border_areas[i] * nelements + j], areas[border_areas[i] * nelements + j].size, &end);

            for (elem_assign(&areas[border_areas[i] * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi))
            {
              dw -= elem_weight(&xi, 0);
              ++lc;

              if (dw < 0.0 || lc >= final_locals[NCONDS * i + 0])
              {
                dw += elem_weight(&xi, 0);
                --lc;
                break;
              }
            }
          }

          lw = dw;
#else
          lc = dc;
#endif
        }

        /* check mc against min/max count borders */
        lc = xminmax(current_cmm[0] - final_globals[NCONDS * i + 0], lc, current_cmm[1] - final_globals[NCONDS * i + 0]);

        /* check against 0 (don't step back!) and the local contribution */
        lc = xminmax(0, lc, final_locals[NCONDS * i + 0]);

        lcs = lc;
#ifdef elem_weight
        lws = lw;
#endif

#ifdef elem_weight
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " <= %" slint_fmt " + %" slint_fmt " <= %" slint_fmt,
          i, borders[i], border_lo, i, direction, border_hi);
        if (border_lo <= i + direction && i + direction <= border_hi)
          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " == %" slint_fmt " + %" slint_fmt,
            i, borders[i], borders[i + direction], borders[i], direction);

        /* FIXME: finalize geht auch rückwärts!!! */

        /* if the next open border is really the _next_ border */
        if (border_lo <= i + direction && i + direction <= border_hi && borders[i + direction] == borders[i] + direction)
        {
          /* determine the exact global counts/weights (damn, this is expensive) */
          mcw[0] = lcs;
          mcw[1] = lws;
          MPI_Allreduce(&mcw[0], &mcw[2], 2, MPI_DOUBLE, MPI_SUM, comm);

        } else
        {
          /* the exact global counts/weights are not required */
          mcw[2] = 0.0;
          mcw[3] = 0.0;
        }

        gc = 0;
        gcs = mcw[2];
        gw = 0.0;
        gws = mcw[3];
        
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt ", gws = %f", i, borders[i], gcs, gws);
#else
        /* the global count is simply mc */
        gc = 0;
        gcs = mc;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt, i, borders[i], gcs);
#endif

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcs = %" slint_fmt, i, borders[i], lcs);

        /* break the local contribution into contributions for the lists of elements */
        for (j = 0; j < nelements; ++j)
        {
          lcv[j] = 0;
          lcsv[j] = xmin(lcs, final_areas[i * nelements + j]);
          
          lcs -= lcsv[j];
        }

        SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcsv = ", "%" slint_fmt, j, nelements, lcsv, i, borders[i]);

        rti_tstop(rti_tid_mpi_select_exact_radix_while_check_final);
      }

      SL_ASSERT(lcs == 0);
      
      /* accept local contributions */
      for (j = 0; j < nelements; ++j) sdispls[(borders[i] + 1) * nelements + j] += lcsv[j];

      rti_tstart(rti_tid_mpi_select_exact_radix_while_check_post);

      /* this is wrong, e.g., even if gc == 0 and gcs == 0 then crange[1] is set to crange[0]! */
/*      if (gc > 0 || gcs > 0
#ifdef elem_weight
       || gw != 0.0 || gws != 0.0
#endif
       )*/
      {
        border_infos[borders[i]].crange[0] += gcs;
        border_infos[borders[i]].crange[1] = border_infos[borders[i]].crange[0] + gc;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": counts_range: %" slint_fmt "  %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]);

        border_infos[borders[i]].cmmlr[MIN_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MAX_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MIN_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MAX_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_RI], border_infos[borders[i]].crange[1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

#ifdef elem_weight
        border_infos[borders[i]].wrange[0] += gws;
        border_infos[borders[i]].wrange[1] = border_infos[borders[i]].wrange[0] + gw;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weights_range: %f  %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]);

        border_infos[borders[i]].wmmlr[MIN_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MAX_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MIN_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MAX_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_RI], border_infos[borders[i]].wrange[1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);
#endif
      }
      
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 0: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i],
        border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[0],
        border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[0], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[0]);
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 1: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i],
        border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[0],
        border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[1], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[1]);

      if (border_infos[borders[i]].cmmlr[MIN_LE] != border_info_old.cmmlr[MIN_LE]
       || border_infos[borders[i]].cmmlr[MAX_LE] != border_info_old.cmmlr[MAX_LE]
#ifdef elem_weight
       || border_infos[borders[i]].wmmlr[MIN_LE] != border_info_old.wmmlr[MIN_LE]
       || border_infos[borders[i]].wmmlr[MAX_LE] != border_info_old.wmmlr[MAX_LE]
#endif
       ) border_infos[borders[i] + 1].update = 1;

      if (border_infos[borders[i]].cmmlr[MIN_RI] != border_info_old.cmmlr[MIN_RI]
       || border_infos[borders[i]].cmmlr[MAX_RI] != border_info_old.cmmlr[MAX_RI]
#ifdef elem_weight
       || border_infos[borders[i]].wmmlr[MIN_RI] != border_info_old.wmmlr[MIN_RI]
       || border_infos[borders[i]].wmmlr[MAX_RI] != border_info_old.wmmlr[MAX_RI]
#endif
       ) border_infos[borders[i] - 1].update = 1;

      border_infos[borders[i]].update = 0;

      /* refine or remove */
      if (refine)
      {
        /* bits left for partitioning? */
        if (rhigh >= rlow)
        {
          if (last_new_area == border_areas[i] && last_new_class == k) border_areas[i] = nareas_new - 1;
          else
          {
            /* update last_new_... */
            last_new_area = border_areas[i];
            last_new_class = k;

            /* create new area */
            for (j = 0; j < nelements; ++j)
            {
              elem_assign_at(&areas[border_areas[i] * nelements + j], lcsv[j], &areas_new[nareas_new * nelements + j]);
              areas_new[nareas_new * nelements + j].size = lcv[j];
            }
            border_areas[i] = nareas_new;
            ++nareas_new;
          }

        } else
        {
          for (j = 0; j < nelements; ++j) final_areas[(i - nborders_removed * direction) * nelements + j] = lcv[j];

          /* save local count/weight for the later prefix calculations */
          final_locals[NCONDS * (i - nborders_removed * direction) + 0] = lc;
#ifdef elem_weight
          final_locals[NCONDS * (i - nborders_removed * direction) + 1] = lw;
#endif
        }

        borders[i - nborders_removed * direction] = borders[i];
        border_areas[i - nborders_removed * direction] = border_areas[i];

      } else ++nborders_removed;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_check_post);

      i += direction;
    }

    /* restrict the parts */
    if (direction > 0) border_hi -= nborders_removed;
    else border_lo += nborders_removed;

    /* change direction */
    direction *= -1;

    rti_tstop(rti_tid_mpi_select_exact_radix_while_check);
    
    /* switch areas */
    nareas = nareas_new;
    if (areas == areas0)
    {
      areas = areas1;
      areas_new = areas0;
    } else
    {
      areas = areas0;
      areas_new = areas1;
    }
  }

  rti_tstop(rti_tid_mpi_select_exact_radix_while);

  sl_free(area_counts);
  sl_free(local_counts);
  sl_free(global_counts);

  rti_tstop(rti_tid_mpi_select_exact_radix);

#ifdef VERIFY
  v = mpi_post_check_partconds(s, nelements, nparts, pconds, sdispls, size, rank, comm);
  
  SL_ASSERT_IF(rank == 0, v < 0);
  
  SL_NOTICE_IF(rank == 0, "post_check_partconds: %s (%" slint_fmt ")", (v >= 0)?"FAILED":"SUCCESS", v);
#endif

#ifdef PRINT_SDISPLS
  printf("%d: sdispls:", rank);
  for (i = 0; i < nparts; ++i) printf(" %d ", sdispls[i]);
  printf("\n");
#endif

#ifdef PRINT_STATS
  mpi_select_stats(s, nparts, sdispls, size, rank, comm);
#endif

#if defined(PRINT_TIMINGS) && defined(SL_USE_RTI_TIM)
  if (rank == PRINT_TIMINGS)
  {
    printf("%d: mpi_select_exact_radix: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix));
    printf("%d: mpi_select_exact_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_sync));
    printf("%d: mpi_select_exact_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while));
    printf("%d: mpi_select_exact_radix:  count: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_count));
    printf("%d: mpi_select_exact_radix:  allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_allreduce));
    printf("%d: mpi_select_exact_radix:  round1: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1));
    printf("%d: mpi_select_exact_radix:   allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1_allgather));
    printf("%d: mpi_select_exact_radix:  exscan: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_exscan));
    printf("%d: mpi_select_exact_radix:  check: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_check));
    printf("%d: mpi_select_exact_radix:   pre: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_pre));
    printf("%d: mpi_select_exact_radix:   classes: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_classes));
    printf("%d: mpi_select_exact_radix:   final: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_final));
    printf("%d: mpi_select_exact_radix:   post: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_post));
    printf("%d: mpi_select_exact_radix: rounds: %" slint_fmt "\n", rank, round);
  }
#endif

  return 0;
}
Ejemplo n.º 25
0
/*
 * One recursion level of an in-place radix sort over the key bits
 * rlow..rhigh (variant "af" -- presumably "american flag"; confirm).
 *
 * The level handles a digit of up to rwidth bits at the top of the bit range:
 * it counts how many elements fall into each digit class, swaps every element
 * into the region of its class, and then recurses into each class on the
 * remaining lower bits.
 *
 *   s        elements to sort (s->size elements are processed)
 *   sx       passed to elem_xchange as exchange space and handed down unchanged
 *   rhigh    highest key bit still to be sorted on
 *   rlow     lowest key bit to be sorted on
 *   rwidth   maximum number of bits per level (capped at sort_radix_width_max)
 *   finalize only written when insertsort, insertsort_finalize and
 *            insertsort_finalize_adaptive are all defined: set to 1 if a class
 *            with more than one element was left below the recursion threshold
 *
 * Always returns 0.
 */
slint rs_rec_af(elements_t *s, elements_t *sx, slint rhigh, slint rlow, slint rwidth, slint *finalize) /* sl_func rs_rec_af */
{
/* NOTE(review): this macro is never #undef'ed here, so it stays visible after the function */
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t bit_mask, nclasses;

  slint i, current_width, c[max_nclasses];
  elements_t xi, end, parts[max_nclasses];

  /* no key bits left to sort on; bail out before a non-positive digit width is used */
  if (rhigh < rlow) return 0;

  /* digit width of this level: never more bits than remain, and never more
     than sort_radix_width_max, because c[] and parts[] only hold max_nclasses entries */
  current_width = xmin(xmin(rwidth, sort_radix_width_max), rhigh - rlow + 1);
  rhigh -= current_width - 1;

  nclasses = powof2_typed(current_width, slkey_pure_t);
  bit_mask = nclasses - 1;

  /* one-past-the-end position, used as loop sentinel */
  elem_assign_at(s, s->size, &end);

  /* zero all counters */
  for (i = 0; i < nclasses; i++) c[i] = 0;

  /* count the number of elements in every class */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_inc(&xi)) ++c[key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask)];

  /* compute the target of every class: parts[i] starts at the END of the
     region of class i and is moved backwards while the class is filled */
  elem_assign_at(s, c[0], &parts[0]);
  parts[0].size = c[0];
  for (i = 1; i < nclasses; i++)
  {
    elem_assign_at(&parts[i - 1], c[i], &parts[i]);
    parts[i].size = c[i];
  }

  /* permute the keys home; after the inner loop the element at xi belongs to
     class i and the whole region of that class is complete, so skip c[i] elements */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_add(&xi, c[i]))
  {
    while (1)
    {
      i = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);

      /* next free slot (from the back) of class i */
      elem_dec(&parts[i]);

      /* the slot is at or before xi: the element at xi is already home */
      if (xi.keys >= parts[i].keys) break;

      /* put the element into its class and continue with the one swapped in */
      elem_xchange(&parts[i], &xi, sx);
    }
  }

  /* continue below the digit that was just processed */
  --rhigh;

  if (rhigh >= rlow)
  {
    elem_assign(s, &xi);
    for (i = 0; i < nclasses; i++)
    {
      xi.size = c[i];

#ifdef insertsort
      if (xi.size > sort_radix_threshold_rec) rs_rec_af(&xi, sx, rhigh, rlow, rwidth, finalize);

 #ifdef insertsort_finalize

  #ifdef insertsort_finalize_adaptive
      else if (xi.size > 1) *finalize = 1;
  #endif /* insertsort_finalize_adaptive */

 #else /* insertsort_finalize */
      else if (xi.size > 1) rs_rec_insertsort_af(&xi, sx, rhigh, rlow);
 #endif /* insertsort_finalize */

#else /* insertsort */
      /* recurse into this very function (was a stale call to rs_rec) */
      if (xi.size > 1) rs_rec_af(&xi, sx, rhigh, rlow, rwidth, finalize);
#endif /* insertsort */

      elem_add(&xi, c[i]);
    }
  }

  return 0;
}
Ejemplo n.º 26
0
/*
 * Merge step between this process and one partner process.
 *
 * mpi_find_exact decides which part of the local elements has to be handed
 * over (ex_sizes[0] elements starting at ex_start), how many elements arrive
 * in return (ex_sizes[1]) and by how much the block that stays local has to
 * be shifted (nx_move).  The elements are then exchanged with other_rank and
 * the two resulting local pieces are merged with m2.
 *
 *   s          local elements (expected to be ordered, see CHECK_ORDER);
 *              s->size is updated to the number of elements held afterwards
 *   other_rank partner process
 *   high_rank  the rank of the pair that plays the "high" role
 *   dst_size   passed through to mpi_find_exact
 *   m2         local merge function, may be NULL (then no local merge is done)
 *   xs         passed to m2 as third argument
 *   size, rank, comm  communicator size, own rank and communicator
 *
 * Returns -1 if other_rank is not a valid rank, 0 otherwise (nothing is done
 * if other_rank is the own rank).
 */
slint_t mpi_merge2(elements_t *s, slint_t other_rank, slint_t high_rank, slint_t *dst_size, merge2x_f m2, elements_t *xs, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_merge2 */
{
    const int tag = 1;

    slint_t ex_start, ex_sizes[2], nx_move;
    slint_t nxfer;    /* number of elements of the transfer currently performed */
    slint_t nkeep;    /* number of elements that stay on this process */
    slint_t keep_at;  /* offset of the staying block before it is shifted */
    elements_t xcur, part_lo, part_hi;

    MPI_Status status;

#ifdef CHECK_ORDER
    slint_t check_order;
#endif


    SL_TRACE_IF(MM2_TRACE_IF, "starting mpi_merge2");

    /* sl_tid rti_tid_mpi_merge2 */

    rti_treset(rti_tid_mpi_merge2_find);       /* sl_tid */
    rti_treset(rti_tid_mpi_merge2_moveright);  /* sl_tid */
    rti_treset(rti_tid_mpi_merge2_exchange);   /* sl_tid */
    rti_treset(rti_tid_mpi_merge2_moveleft);   /* sl_tid */
    rti_treset(rti_tid_mpi_merge2_local);     /* sl_tid */

    rti_tclear(rti_tid_mpi_merge2);

    /* reject an invalid partner, nothing to do with ourselves */
    if (!(0 <= other_rank && other_rank < size)) return -1;

    if (other_rank == rank) return 0;

    rti_tstart(rti_tid_mpi_merge2);

#ifdef CHECK_ORDER
    check_order = elements_validate_order(s, 1);
    if (check_order) SL_ERROR("input order failed at %" slint_fmt "", check_order);
#endif

    SL_TRACE_IF(MM2_TRACE_IF, "find_exact: s->size = %" slint_fmt ", other_rank / high_rank = %" slint_fmt " / %" slint_fmt, s->size, other_rank, high_rank);

    rti_tstart(rti_tid_mpi_merge2_find);
    mpi_find_exact(s, other_rank, high_rank, dst_size, &ex_start, ex_sizes, &nx_move, size, rank, comm);
    rti_tstop(rti_tid_mpi_merge2_find);

    SL_TRACE_IF(MM2_TRACE_IF, "find_exact: ex_start = %" slint_fmt ", ex_sizes = { %" slint_fmt ", %" slint_fmt " }, nx_move = %" slint_fmt, ex_start, ex_sizes[0], ex_sizes[1], nx_move);

    /* the staying block: everything but the ex_sizes[0] elements given away;
       it starts at 0 on the low side and behind the given-away part on the high side */
    nkeep = s->size - ex_sizes[0];
    keep_at = (rank != high_rank)?0:ex_sizes[0];

    /* a growing shift has to be done before the exchange */
    rti_tstart(rti_tid_mpi_merge2_moveright);

    if (nkeep > 0 && nx_move > 0)
    {
        SL_TRACE_IF(MM2_TRACE_IF, "moving right %" slint_fmt "", nx_move);

        elem_nmove_at(s, keep_at, s, keep_at + nx_move, nkeep);
    }

    rti_tstop(rti_tid_mpi_merge2_moveright);

    /* exchange elements: the common part in place, the surplus one-way */
    rti_tstart(rti_tid_mpi_merge2_exchange);

    elem_assign_at(s, ex_start, &xcur);
    nxfer = xmin(ex_sizes[0], ex_sizes[1]);

    if (nxfer > 0)
    {
        SL_TRACE_IF(MM2_TRACE_IF, "exchanging %" slint_fmt " elements at %" slint_fmt "", nxfer, ex_start);

#ifdef MM2_ELEMENTS_SENDRECV_REPLACE
        mpi_elements_sendrecv_replace(&xcur, nxfer, other_rank, tag, other_rank, tag, size, rank, comm);
#else
#define xelem_call \
    MPI_Sendrecv_replace(xelem_buf(&xcur), nxfer, xelem_mpi_datatype, other_rank, tag, other_rank, tag, comm, &status);
#include "sl_xelem_call.h"
#endif
    }

    /* step behind the part that was exchanged in place */
    elem_add(&xcur, nxfer);

    if (ex_sizes[0] > nxfer)
    {
        /* more to give than to take: send the rest */
        nxfer = ex_sizes[0] - nxfer;

        SL_TRACE_IF(MM2_TRACE_IF, "sending %" slint_fmt " at %" slint_fmt "", nxfer, (slint_t) (xcur.keys - s->keys));

#define xelem_call \
    MPI_Send(xelem_buf(&xcur), nxfer, xelem_mpi_datatype, other_rank, tag, comm);
#include "sl_xelem_call.h"

    } else if (ex_sizes[1] > nxfer)
    {
        /* more to take than to give: receive the rest */
        nxfer = ex_sizes[1] - nxfer;

        SL_TRACE_IF(MM2_TRACE_IF, "receiving %" slint_fmt " at %" slint_fmt "", nxfer, (slint_t) (xcur.keys - s->keys));

#define xelem_call \
    MPI_Recv(xelem_buf(&xcur), nxfer, xelem_mpi_datatype, other_rank, tag, comm, &status);
#include "sl_xelem_call.h"
    }

    rti_tstop(rti_tid_mpi_merge2_exchange);

    /* a shrinking shift has to wait until the exchange is done */
    rti_tstart(rti_tid_mpi_merge2_moveleft);

    if (nkeep > 0 && nx_move < 0)
    {
        SL_TRACE_IF(MM2_TRACE_IF, "moving left %" slint_fmt "", nx_move);

        elem_nmove_at(s, keep_at, s, keep_at + nx_move, nkeep);
    }

    rti_tstop(rti_tid_mpi_merge2_moveleft);

    /* prepare the local merge2: on the low side the staying block comes first
       and the received elements second, on the high side it is the other way round */
    elem_assign_at(s, 0, &part_lo);
    part_lo.size = (rank != high_rank)?nkeep:ex_sizes[1];

    elem_assign_at(s, part_lo.size, &part_hi);
    part_hi.size = (rank != high_rank)?ex_sizes[1]:nkeep;

#ifdef CHECK_ORDER
    check_order = elements_validate_order(&part_lo, 1);
    if (check_order) SL_ERROR("intermediate lower order failed at %" slint_fmt "", check_order);
    check_order = elements_validate_order(&part_hi, 1);
    if (check_order) SL_ERROR("intermediate higher order failed at %" slint_fmt "", check_order);
#endif

    s->size = part_lo.size + part_hi.size;

    /* local merge (only if there is a merge function and both pieces are non-empty) */
    rti_tstart(rti_tid_mpi_merge2_local);

    if (m2 != NULL && part_lo.size > 0 && part_hi.size > 0)
    {
        SL_TRACE_IF(MM2_TRACE_IF, "local merge2 %" slint_fmt " with %" slint_fmt "", part_lo.size, part_hi.size);

        m2(&part_lo, &part_hi, xs);
    }

    rti_tstop(rti_tid_mpi_merge2_local);

#ifdef CHECK_ORDER
    check_order = elements_validate_order(s, 1);
    if (check_order) SL_ERROR("output order failed at %" slint_fmt "", check_order);
#endif

    rti_tstop(rti_tid_mpi_merge2);

#if defined(MM2_PRINT_TIMINGS) && defined(SL_USE_RTI_TIM)
    if (MM2_PRINT_TIMINGS)
    {
        printf("%d: mpi_merge2: %f\n", rank, rti_tlast(rti_tid_mpi_merge2));
        printf("%d: mpi_merge2: find: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_find));
        printf("%d: mpi_merge2: move-right: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_moveright));
        printf("%d: mpi_merge2: exchange: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_exchange));
        printf("%d: mpi_merge2: move-left: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_moveleft));
        printf("%d: mpi_merge2: local: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_local));
    }
#endif

    return 0;
}
slint_t mpi_partition_radix2(elements_t *s, partcond2_t *pc, slint_t rhigh, slint_t rlow, slint_t rwidth, int *scounts, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_partition_radix2 */
{
  slkey_pure_t max_nclasses;
  slkey_pure_t nclasses, bit_mask;
  slkey_pure_t k;

  const slint_t max_nareas = size - 1;
  slint_t nareas, nareas_new;
  elements_t areas0[max_nareas], areas1[max_nareas], *areas, *areas_new;

  double *locals, *globals;
  double *local_counts, *local_weights, *global_counts, *global_weights;

  const slint_t max_nparts = size - 1;
  slint_t parts_low, parts_high, nparts_removed;
  slint_t parts[max_nparts], part_areas[max_nparts];

  double parts_range_[2 * 2 * (1 + max_nparts + 1)];
  double *parts_range = parts_range_ + (2 * 2);
  double parts_minmax_[2 * 4 * (1 + max_nparts + 1)];
  double *parts_minmax = parts_minmax_ + (2 * 4);
  slint_t parts_update_[1 + max_nparts + 1];
  slint_t *parts_update = parts_update_ + 1;

  double parts_minmax_new[2 * 4];
  double current_minmax[2 * 2];
  
  double final_locals[2 * max_nparts];

  slint_t i, j, jp1, jm1, l, lp1, lm1;
  slint_t current_width;

  double minmax[2 * 4 * size];
  
  slint_t last_new_area, last_new_class;

#ifdef HAVENT_MPI_IN_PLACE
  double local_minmax[2 * 4];
#endif

  slint_t lc, lcs, gc, gcs;
  double lw, gw, lws, gws;
  double d, m;

  elements_t xi, end;

  slint_t round = 0;
  slint_t direction = 1;

  slint_t refine, finalize;

#ifdef RCOUNTS_RDISPLS
  int *rcounts, *rdispls;
#endif

#ifdef WEIGHT_STATS
  slint_t total_count = 0, partial_counts[size + 1];
  double total_weight = 0.0, partial_weights[size + 1];
  double vmin, vmax;
# ifdef HAVENT_MPI_IN_PLACE
  slint_t partial_counts2[size + 1];
  double partial_weights2[size + 1];
# endif
#endif

  rti_treset(rti_tid_mpi_partition_radix2_while);                   /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_count);             /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_allreduce);         /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_round1);            /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_round1_allgather);  /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_exscan);            /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check);             /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_pre);         /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_classes);     /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_final);       /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_post);        /* sl_tid */

  rti_tstart(rti_tid_mpi_partition_radix2_sync);
#ifdef SYNC_ON_INIT
  MPI_Barrier(comm);
#endif
  rti_tstop(rti_tid_mpi_partition_radix2_sync);

  rti_tstart(rti_tid_mpi_partition_radix2);

  if (rhigh < 0) rhigh = radix_high;
  if (rlow < 0) rlow = radix_low;
  if (rwidth < 0) rwidth = sort_radix_width_default;
  
  max_nclasses = powof2_typed(rwidth, slkey_pure_t);

  locals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double));
  globals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double));

  areas = areas0;
  areas_new = areas1;

  /* init the first area (all elements) */
  nareas = 1;
  elem_assign(s, &areas[0]);

  /* init all parts */
  parts_low = 0;
  parts_high = max_nparts - 1;
  for (i = parts_low; i <= parts_high; ++i)
  {
    parts[i] = i;
    part_areas[i] = 0;
  }

  /* init sdispls */
  for (i = 0; i < size; ++i) sdispls[i] = 0;

  rti_tstart(rti_tid_mpi_partition_radix2_while);

  while (parts_low <= parts_high)
  {
    ++round;

    /* setup bitmask */
    current_width = xmin(rwidth, rhigh - rlow + 1);
    rhigh -= (current_width > 0)?current_width - 1:rhigh;

    nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1;
    bit_mask = nclasses - 1;
    
    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", rhigh: %" sl_int_type_fmt ", current_width: %" sl_int_type_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses);

    finalize = (current_width <= 0);

    if (!finalize || round == 1)
    {
      /* init counters */
      local_counts = locals;
      global_counts = globals;
      local_weights = locals + (nareas * nclasses) + nareas;
      global_weights = globals + (nareas * nclasses) + nareas;

      /* zero all counter */
      for (i = 0; i < nareas; ++i)
      for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = local_weights[i * nclasses + k] = 0.0;

      rti_tstart(rti_tid_mpi_partition_radix2_while_count);

      /* for every area */
      for (i = 0; i < nareas; ++i)
      {
        elem_assign_at(&areas[i], areas[i].size, &end);

        if (nclasses > 1)
        {
          /* counts and weights in every class */
          for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi))
          {
            k = radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);
            local_counts[i * nclasses + k] += 1;
            local_weights[i * nclasses + k] += elem_weight_one(&xi, 0);
          }

        } else
        {
          /* total counts and weights */
          local_counts[i * nclasses + 0] = areas[i].size;

          for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) local_weights[i * nclasses + 0] += elem_weight_one(&xi, 0);
        }

        /* total counts and weights in this area */
        local_counts[nareas * nclasses + i] = areas[i].size;

        local_weights[nareas * nclasses + i] = 0.0;
        for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += local_weights[i * nclasses + k];
      }

      rti_tstop(rti_tid_mpi_partition_radix2_while_count);

      --rhigh;

      rti_tstart(rti_tid_mpi_partition_radix2_while_allreduce);

      /* create global counts and weights */
#ifdef MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD
      if (size >= MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD)
      {
        MPI_Reduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm);
        MPI_Bcast(globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm);

      } else
#endif
        MPI_Allreduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm);

      rti_tstop(rti_tid_mpi_partition_radix2_while_allreduce);
    }

#ifdef TIMING
    SL_TRACE_IF(DEBUG_OR_NOT, "allreduce: %f, nareas: %" sl_int_type_fmt ", nclasses: %" sl_key_type_fmt ", doubles: %" sl_int_type_fmt, rti_tlast(rti_tid_mpi_partition_radix2_while_allreduce), nareas, nclasses, (1 + 1) * (nareas * nclasses + nareas));
#endif

/*    if (DEBUG_OR_NOT)
    {
      printf("%d: locals\n", rank);
      for (i = 0; i < nareas; ++i)
      {
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", local_counts[i * nclasses + k]);
        printf(" = %f\n", local_counts[nareas * nclasses + i]);
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", local_weights[i * nclasses + k]);
        printf(" = %f\n", local_weights[nareas * nclasses + i]);
      }
      printf("%d: globals\n", rank);
      for (i = 0; i < nareas; ++i)
      {
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", global_counts[i * nclasses + k]);
        printf(" = %f\n", global_counts[nareas * nclasses + i]);
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", global_weights[i * nclasses + k]);
        printf(" = %f\n", global_weights[nareas * nclasses + i]);
      }
    }*/

    /* do some initializations */
    if (round == 1)
    {
      rti_tstart(rti_tid_mpi_partition_radix2_while_round1);
    
      /* distribute min/max counts and weights */
      minmax[rank * 2 * 4 + 0 + 0] = (pc->min_count >= 0)?pc->min_count:(-pc->min_count * global_counts[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 0 + 1] = (pc->max_count >= 0)?pc->max_count:(-pc->max_count * global_counts[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 0 + 2] = (pc->min_cpart >= 0)?pc->min_cpart:(-pc->min_cpart * global_counts[nareas * nclasses + 0]);
      minmax[rank * 2 * 4 + 0 + 3] = (pc->max_cpart >= 0)?pc->max_cpart:(-pc->max_cpart * global_counts[nareas * nclasses + 0]);

      minmax[rank * 2 * 4 + 4 + 0] = (pc->min_weight >= 0)?pc->min_weight:(-pc->min_weight * global_weights[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 4 + 1] = (pc->max_weight >= 0)?pc->max_weight:(-pc->max_weight * global_weights[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 4 + 2] = (pc->min_wpart >= 0)?pc->min_wpart:(-pc->min_wpart * global_weights[nareas * nclasses + 0]);
      minmax[rank * 2 * 4 + 4 + 3] = (pc->max_wpart >= 0)?pc->max_wpart:(-pc->max_wpart * global_weights[nareas * nclasses + 0]);

      rti_tstart(rti_tid_mpi_partition_radix2_while_round1_allgather);
#ifdef HAVENT_MPI_IN_PLACE
      local_minmax[0 + 0] = minmax[rank * 2 * 4 + 0 + 0];
      local_minmax[0 + 1] = minmax[rank * 2 * 4 + 0 + 1];
      local_minmax[0 + 2] = minmax[rank * 2 * 4 + 0 + 2];
      local_minmax[0 + 3] = minmax[rank * 2 * 4 + 0 + 3];
      local_minmax[4 + 0] = minmax[rank * 2 * 4 + 4 + 0];
      local_minmax[4 + 1] = minmax[rank * 2 * 4 + 4 + 1];
      local_minmax[4 + 2] = minmax[rank * 2 * 4 + 4 + 2];
      local_minmax[4 + 3] = minmax[rank * 2 * 4 + 4 + 3];
      MPI_Allgather(local_minmax, 2 * 4, MPI_DOUBLE, minmax, 2 * 4, MPI_DOUBLE, comm);
/*      MPI_Gather(local_minmax_weights, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, 0, comm);
      MPI_Bcast(minmax_weights, 2 * 4 * size, MPI_DOUBLE, 0, comm);*/
#else
      MPI_Allgather(MPI_IN_PLACE, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, comm);
#endif
      rti_tstop(rti_tid_mpi_partition_radix2_while_round1_allgather);

#ifdef WEIGHT_STATS
      total_count = global_counts[nareas * nclasses + 0];
      total_weight = global_weights[nareas * nclasses + 0];
#endif

      parts_minmax[2 * 4 * (parts_low - 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 2] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 3] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 2] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 3] = 0;

      parts_minmax[2 * 4 * (parts_high + 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 2] = 0;
      parts_minmax[2 * 4 * (parts_high + 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 3] = global_counts[nareas * nclasses + 0];
      parts_minmax[2 * 4 * (parts_high + 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 2] = 0;
      parts_minmax[2 * 4 * (parts_high + 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 3] = global_weights[nareas * nclasses + 0];

      parts_range[2 * 2 * (parts_low - 1) + 0 + 0] = parts_range[2 * 2 * (parts_high + 1) + 0 + 0] = 0.0;
      parts_range[2 * 2 * (parts_low - 1) + 0 + 1] = parts_range[2 * 2 * (parts_high + 1) + 0 + 1] = global_counts[nareas * nclasses + 0];
      parts_range[2 * 2 * (parts_low - 1) + 2 + 0] = parts_range[2 * 2 * (parts_high + 1) + 2 + 0] = 0.0;
      parts_range[2 * 2 * (parts_low - 1) + 2 + 1] = parts_range[2 * 2 * (parts_high + 1) + 2 + 1] = global_weights[nareas * nclasses + 0];

      for (i = parts_high; i >= parts_low; --i)
      {
        parts_minmax[2 * 4 * parts[i] + 0 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 1] - minmax[2 * 4 * (parts[i] + 1) + 0 + 0];
        parts_minmax[2 * 4 * parts[i] + 0 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 3] - minmax[2 * 4 * (parts[i] + 1) + 0 + 1];
        parts_minmax[2 * 4 * parts[i] + 4 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 1] - minmax[2 * 4 * (parts[i] + 1) + 4 + 0];
        parts_minmax[2 * 4 * parts[i] + 4 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 3] - minmax[2 * 4 * (parts[i] + 1) + 4 + 1];
        
        parts_minmax[2 * 4 * parts[i] + 0 + 0] = parts_minmax[2 * 4 * parts[i] + 0 + 2] = parts_minmax[2 * 4 * parts[i] + 4 + 0] = parts_minmax[2 * 4 * parts[i] + 4 + 2] = -1;

        parts_range[2 * 2 * parts[i] + 0 + 0] = 0.0;
        parts_range[2 * 2 * parts[i] + 0 + 1] = global_counts[nareas * nclasses + 0];
        parts_range[2 * 2 * parts[i] + 2 + 0] = 0.0;
        parts_range[2 * 2 * parts[i] + 2 + 1] = global_weights[nareas * nclasses + 0];
/*        SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 0 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 0 + 3]);*/
/*        SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 4 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);*/

        parts_update[parts[i]] = 1;

        if (finalize)
        {
          final_locals[2 * i + 0] = local_counts[nareas * nclasses + 0];
          final_locals[2 * i + 1] = local_weights[nareas * nclasses + 0];
        }
      }

      rti_tstop(rti_tid_mpi_partition_radix2_while_round1);
    }

    if (finalize)
    {
      j = parts_high - parts_low + 1;
    
      SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" sl_int_type_fmt " parts", j);

      rti_tstart(rti_tid_mpi_partition_radix2_while_exscan);

      MPI_Exscan(&final_locals[2 * parts_low], &locals[2 * parts_low], 2 * j, MPI_DOUBLE, MPI_SUM, comm);
      if (rank == 0) for (i = parts_low; i <= parts_high; ++i) locals[2 * i + 0] = locals[2 * i + 1] = 0;

      rti_tstop(rti_tid_mpi_partition_radix2_while_exscan);
    }

    nareas_new = 0;
    last_new_area = last_new_class = -1;

    /* check all remaining parts */

    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", %s", round, (direction > 0)?"forward":"backward");

    nparts_removed = 0;

    rti_tstart(rti_tid_mpi_partition_radix2_while_check);

    i = (direction > 0)?parts_low:parts_high;
    while ((direction > 0)?(i <= parts_high):(i >= parts_low))
    {
      rti_tstart(rti_tid_mpi_partition_radix2_while_check_pre);
    
      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": PART: %" sl_int_type_fmt ",%" sl_int_type_fmt, round, i, parts[i]);

      j = 2 * 4 * parts[i];
      jp1 = 2 * 4 * (parts[i] + 1);
      jm1 = 2 * 4 * (parts[i] - 1);
      l = 2 * 2 * parts[i];
      lp1 = 2 * 2 * (parts[i] + 1);
      lm1 = 2 * 2 * (parts[i] - 1);

      if (parts_update[parts[i]])
      {
        if (direction > 0)
        {
          parts_minmax_new[0 + 0] = parts_minmax[jm1 + 0 + 0] + minmax[j + 0 + 0];
          parts_minmax_new[0 + 2] = parts_minmax[jm1 + 0 + 2] + minmax[j + 0 + 1];
          parts_minmax_new[4 + 0] = parts_minmax[jm1 + 4 + 0] + minmax[j + 4 + 0];
          parts_minmax_new[4 + 2] = parts_minmax[jm1 + 4 + 2] + minmax[j + 4 + 1];

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f + %f, %f + %f  /  %f + %f, %f + %f", i, parts[i],
            parts_minmax[jm1 + 0 + 0], minmax[j + 0 + 0],
            parts_minmax[jm1 + 0 + 2], minmax[j + 0 + 1],
            parts_minmax[jm1 + 4 + 0], minmax[j + 4 + 0],
            parts_minmax[jm1 + 4 + 2], minmax[j + 4 + 1]);

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

          if (parts_minmax_new[0 + 0] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 0] = minmax[jp1 + 0 + 2];
          if (parts_minmax_new[0 + 2] > minmax[j   + 0 + 3]) parts_minmax_new[0 + 2] = minmax[j   + 0 + 3];
          if (parts_minmax_new[4 + 0] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 0] = minmax[jp1 + 4 + 2];
          if (parts_minmax_new[4 + 2] > minmax[j   + 4 + 3]) parts_minmax_new[4 + 2] = minmax[j   + 4 + 3];

          parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1];
          parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3];
          parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1];
          parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3];

        } else
        {
          parts_minmax_new[0 + 1] = parts_minmax[jp1 + 0 + 1] - minmax[jp1 + 0 + 0];
          parts_minmax_new[0 + 3] = parts_minmax[jp1 + 0 + 3] - minmax[jp1 + 0 + 1];
          parts_minmax_new[4 + 1] = parts_minmax[jp1 + 4 + 1] - minmax[jp1 + 4 + 0];
          parts_minmax_new[4 + 3] = parts_minmax[jp1 + 4 + 3] - minmax[jp1 + 4 + 1];

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f - %f, %f - %f  /  %f - %f, %f - %f", i, parts[i],
            parts_minmax[jp1 + 0 + 1], minmax[jp1 + 0 + 0],
            parts_minmax[jp1 + 0 + 3], minmax[jp1 + 0 + 1],
            parts_minmax[jp1 + 4 + 1], minmax[jp1 + 4 + 0],
            parts_minmax[jp1 + 4 + 3], minmax[jp1 + 4 + 1]);

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

          if (parts_minmax_new[0 + 3] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 3] = minmax[jp1 + 0 + 2];
          if (parts_minmax_new[0 + 1] > minmax[j   + 0 + 3]) parts_minmax_new[0 + 1] = minmax[j   + 0 + 3];
          if (parts_minmax_new[4 + 3] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 3] = minmax[jp1 + 4 + 2];
          if (parts_minmax_new[4 + 1] > minmax[j   + 4 + 3]) parts_minmax_new[4 + 1] = minmax[j   + 4 + 3];

          parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0];
          parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2];
          parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0];
          parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2];
        }

        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 1. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);
        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": minmax: %f  %f  /  %f  %f", parts[i], minmax[2 * 4 * (parts[i] + 1) + 0 + 2], minmax[2 * 4 * (parts[i] + 0) + 0 + 3], minmax[2 * 4 * (parts[i] + 1) + 4 + 2], minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);

        if (parts_minmax_new[0 + 0] > parts_minmax_new[0 + 1]) parts_minmax_new[0 + 0] = parts_minmax_new[0 + 1] = (parts_minmax_new[0 + 0] + parts_minmax_new[0 + 1]) / 2;
        if (parts_minmax_new[0 + 2] < parts_minmax_new[0 + 3]) parts_minmax_new[0 + 2] = parts_minmax_new[0 + 3] = (parts_minmax_new[0 + 2] + parts_minmax_new[0 + 3]) / 2;

        if (parts_minmax_new[4 + 0] > parts_minmax_new[4 + 1]) parts_minmax_new[4 + 0] = parts_minmax_new[4 + 1] = (parts_minmax_new[4 + 0] + parts_minmax_new[4 + 1]) / 2;
        if (parts_minmax_new[4 + 2] < parts_minmax_new[4 + 3]) parts_minmax_new[4 + 2] = parts_minmax_new[4 + 3] = (parts_minmax_new[4 + 2] + parts_minmax_new[4 + 3]) / 2;

      } else
      {
        parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0];
        parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1];
        parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2];
        parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3];

        parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0];
        parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1];
        parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2];
        parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3];
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 2. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

      current_minmax[0 + 0] = xmax(parts_minmax_new[0 + 0], parts_minmax_new[0 + 3]) - parts_range[l + 0 + 0];
      current_minmax[0 + 1] = xmin(parts_minmax_new[0 + 2], parts_minmax_new[0 + 1]) - parts_range[l + 0 + 0];

      current_minmax[2 + 0] = xmax(parts_minmax_new[4 + 0], parts_minmax_new[4 + 3]) - parts_range[l + 2 + 0];
      current_minmax[2 + 1] = xmin(parts_minmax_new[4 + 2], parts_minmax_new[4 + 1]) - parts_range[l + 2 + 0];

      SL_ASSERT(current_minmax[0 + 0] <= current_minmax[0 + 1]);
      SL_ASSERT(current_minmax[2 + 0] <= current_minmax[2 + 1]);

      rti_tstop(rti_tid_mpi_partition_radix2_while_check_pre);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": current_minmax: %f  %f / %f  %f", parts[i], current_minmax[0 + 0], current_minmax[0 + 1], current_minmax[2 + 0], current_minmax[2 + 1]);

      lcs = gcs = 0;
      lws = gws = 0;

      /* HIT is the default */
      refine = 0;

      if (!finalize)
      {
        rti_tstart(rti_tid_mpi_partition_radix2_while_check_classes);
      
        for (k = 0; k < nclasses; ++k)
        {
          lc = local_counts[part_areas[i] * nclasses + k];
          gc = global_counts[part_areas[i] * nclasses + k];
          lw = local_weights[part_areas[i] * nclasses + k];
          gw = global_weights[part_areas[i] * nclasses + k];

          current_minmax[0 + 0] -= gc;
          current_minmax[0 + 1] -= gc;

          current_minmax[2 + 0] -= gw;
          current_minmax[2 + 1] -= gw;

          SL_TRACE_IF(DEBUG_OR_NOT, "k = %" sl_key_pure_type_fmt ", current_minmax: %f  %f  / %f  %f", k, current_minmax[0], current_minmax[1], current_minmax[2], current_minmax[3]);

          /* stop and refine if max count is skipped OR min count AND max weight is skipped */
          if ((current_minmax[0 + 1] < 0) || (current_minmax[0 + 0] < 0 && current_minmax[2 + 1] < 0))
          {
            refine = 1;
            break;
          }

          lcs += lc;
          gcs += gc;
          lws += lw;
          gws += gw;

          gc = gw = 0.0;

          /* if between min/max counts */
          if (current_minmax[0 + 0] <= 0 && current_minmax[0 + 1] >= 0)
          {
            /* go to next if max count not reached AND min weight not reached */
            if (current_minmax[0 + 1] > 0 && current_minmax[2 + 0] > 0) continue;

            /* look ahead for a better stop */
            if (k + 1 < nclasses && current_minmax[0 + 1] - global_counts[part_areas[i] * nclasses + k + 1] >= 0)
            {
              /* continue if weights will improve */
              if (myabs(current_minmax[2 + 0] + current_minmax[2 + 1]) > myabs(current_minmax[2 + 0] + current_minmax[2 + 1] - 2 * global_weights[part_areas[i] * nclasses + k + 1])) continue;
            }

            /* stop */
            break;
          }
        }

        SL_ASSERT(k < nclasses);

        SL_TRACE_IF(DEBUG_OR_NOT, "%s k = %" sl_key_pure_type_fmt, (refine)?"REFINE":"HIT", k);
      
        rti_tstop(rti_tid_mpi_partition_radix2_while_check_classes);

      } else
      {
        rti_tstart(rti_tid_mpi_partition_radix2_while_check_final);

        /* middle of min/max weight */
        m = (current_minmax[2 + 0] + current_minmax[2 + 1]) / 2;

        /* min. part of weight to contribute */
        d = xmax(0, m - locals[i * 2 + 1]);

        /* contribute all? */
        if (d >= final_locals[i * 2 + 1])
        {
          lc = final_locals[i * 2 + 0];
          lw = final_locals[i * 2 + 1];

        } else
        {
          /* contribute only a part */
          lc = 0;
          lw = 0; /* not required */

          do
          {
            d -= elem_weight_one(s, sdispls[1 + parts[i]] + lc);
            ++lc;

          } while (d >= 0 && lc < final_locals[i * 2 + 0]);

          --lc;
        
          /* if unweighted, then m = middle of min/max count, d = ..., lc = d */
        }

        /* check mc against min/max count borders */
        lc = xminmax(current_minmax[0 + 0] - locals[i * 2 + 0], lc, current_minmax[0 + 1] - locals[i * 2 + 0]);

        /* check agains 0 (don't step back!) and the local contribution */
        lc = xminmax(0, lc, final_locals[i * 2 + 0]);

        /* the exact global counts/weights are unknown (set gc/gw so that parts_range is not changed) */
        gc = 0;
        gw = 0;

        lcs += lc;
        gcs += gc;
        lws += lw;
        gws += gw;
        
        gc = (parts_range[2 * 2 * parts[i] + 0 + 1] - parts_range[2 * 2 * parts[i] + 0 + 0]);
        gw = (parts_range[2 * 2 * parts[i] + 2 + 1] - parts_range[2 * 2 * parts[i] + 2 + 0]);

        rti_tstop(rti_tid_mpi_partition_radix2_while_check_final);
      }      

      rti_tstart(rti_tid_mpi_partition_radix2_while_check_post);
      
      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": sdispls[%" sl_int_type_fmt " + 1] = %d, lcs = %" sl_int_type_fmt, i, parts[i], parts[i], sdispls[parts[i] + 1], lcs);

      sdispls[parts[i] + 1] += lcs;

      if (gcs > 0 || gws > 0)
      {
        parts_range[l + 0 + 0] += gcs;
        parts_range[l + 0 + 1] = parts_range[l + 0 + 0] + gc;
        parts_range[l + 2 + 0] += gws;
        parts_range[l + 2 + 1] = parts_range[l + 2 + 0] + gw;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 3. part_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);
        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": parts_range: %f  %f  /  %f  %f", i, parts[i], parts_range[2 * 2 * parts[i] + 0 + 0], parts_range[2 * 2 * parts[i] + 0 + 1], parts_range[2 * 2 * parts[i] + 2 + 0], parts_range[2 * 2 * parts[i] + 2 + 1]);

        parts_minmax_new[0 + 0] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 0], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 2] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 2], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 1] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 1], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 3] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 3], parts_range[l + 0 + 1]);
      
        parts_minmax_new[4 + 0] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 0], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 2] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 2], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 1] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 1], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 3] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 3], parts_range[l + 2 + 1]);
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 4. part_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

      if (parts_minmax_new[0 + 0] != parts_minmax[j + 0 + 0] || parts_minmax_new[0 + 2] != parts_minmax[j + 0 + 2] || parts_minmax_new[4 + 0] != parts_minmax[j + 4 + 0] || parts_minmax_new[4 + 2] != parts_minmax[j + 4 + 2])
      {
        parts_minmax[j + 0 + 0] = parts_minmax_new[0 + 0];
        parts_minmax[j + 0 + 2] = parts_minmax_new[0 + 2];
        parts_minmax[j + 4 + 0] = parts_minmax_new[4 + 0];
        parts_minmax[j + 4 + 2] = parts_minmax_new[4 + 2];

        parts_update[parts[i] + 1] = 1;
      }

      if (parts_minmax_new[0 + 1] != parts_minmax[j + 0 + 1] || parts_minmax_new[0 + 3] != parts_minmax[j + 0 + 3] || parts_minmax_new[4 + 1] != parts_minmax[j + 4 + 1] || parts_minmax_new[4 + 3] != parts_minmax[j + 4 + 3])
      {
        parts_minmax[j + 0 + 1] = parts_minmax_new[0 + 1];
        parts_minmax[j + 0 + 3] = parts_minmax_new[0 + 3];
        parts_minmax[j + 4 + 1] = parts_minmax_new[4 + 1];
        parts_minmax[j + 4 + 3] = parts_minmax_new[4 + 3];

        parts_update[parts[i] - 1] = 1;
      }

      parts_update[parts[i]] = 0;

      /* refine or remove */
      if (refine)
      {
        /* bits left for partitioning? */
        if (rhigh >= rlow)
        {
          if (last_new_area == part_areas[i] && last_new_class == k) part_areas[i] = nareas_new - 1;
          else
          {
            /* update last_new_... */
            last_new_area = part_areas[i];
            last_new_class = k;

            /* create new area */
            elem_assign_at(&areas[part_areas[i]], lcs, &areas_new[nareas_new]);
            areas_new[nareas_new].size = local_counts[part_areas[i] * nclasses + k];
            part_areas[i] = nareas_new;
            ++nareas_new;
          }

        } else
        {
          /* save local count/weight for the later prefix calculations */
          final_locals[2 * (i - nparts_removed * direction) + 0] = lc;
          final_locals[2 * (i - nparts_removed * direction) + 1] = lw;
        }

        parts[i - nparts_removed * direction] = parts[i];
        part_areas[i - nparts_removed * direction] = part_areas[i];

      } else ++nparts_removed;

      rti_tstop(rti_tid_mpi_partition_radix2_while_check_post);
      
      i += direction;
    }

    if (direction > 0) parts_high -= nparts_removed;
    else parts_low += nparts_removed;

    direction *= -1;

/*    SL_NOTICE_IF(DEBUG_OR_NOT, "nparts = %" sl_int_type_fmt " vs. nareas_new = %" sl_int_type_fmt, nparts, nareas_new);*/

    rti_tstop(rti_tid_mpi_partition_radix2_while_check);
    
    /* switch areas */
    nareas = nareas_new;
    if (areas == areas0)
    {
      areas = areas1;
      areas_new = areas0;
    } else
    {
      areas = areas0;
      areas_new = areas1;
    }
  }

  rti_tstop(rti_tid_mpi_partition_radix2_while);

  /* create scounts */
  for (i = 0; i < size - 1; ++i) scounts[i] = sdispls[i + 1] - sdispls[i];
  scounts[size - 1] = s->size - sdispls[size - 1];

#ifdef SCOUNTS_SDISPLS
  printf("%d: scounts", rank);
  for (i = 0, j = 0; i < size; ++i) { printf("  %d", scounts[i]); j += scounts[i]; }
  printf(" = %" sl_int_type_fmt "\n", j);
  printf("%d: sdispls", rank);
  for (i = 0; i < size; ++i) printf("  %d", sdispls[i]);
  printf("\n");
#endif

#ifdef RCOUNTS_RDISPLS
  rcounts = sl_alloc(size, sizeof(int));
  rdispls = sl_alloc(size, sizeof(int));

  MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm);

  rdispls[0] = 0;
  for (i = 1; i < size; ++i) rdispls[i] = rdispls[i - 1] + rcounts[i - 1];

  printf("%d: rcounts", rank);
  for (i = 0; i < size; ++i) printf("  %d", rcounts[i]);
  printf("\n");
  printf("%d: rdispls", rank);
  for (i = 0; i < size; ++i) printf("  %d", rdispls[i]);
  printf("\n");

  sl_free(rcounts);
  sl_free(rdispls);
#endif

  sl_free(locals);
  sl_free(globals);

#ifdef WEIGHT_STATS
  partial_counts[size] = 0;
  partial_weights[size] = 0.0;
  for (i = 0; i < size; ++i)
  {
    partial_counts[i] = scounts[i];
    partial_weights[i] = 0.0;
    for (j = sdispls[i]; j < sdispls[i] + scounts[i]; ++j) partial_weights[i] += elem_weight_one(s, j);
    
    partial_counts[size] += partial_counts[i];
    partial_weights[size] += partial_weights[i];
  }

#ifdef HAVENT_MPI_IN_PLACE
  MPI_Reduce(partial_counts, partial_counts2, size + 1, int_mpi_datatype, MPI_SUM, 0, comm);
  MPI_Reduce(partial_weights, partial_weights2, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm);
# define partial_counts   partial_counts2
# define partial_weights  partial_weights2
#else
  /* recvbuf requires workaround for an in-place/aliased-buffer-check-bug in mpich2 (fixed with rev 5518) */
  MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_counts, (rank == 0)?partial_counts:NULL, size + 1, int_mpi_datatype, MPI_SUM, 0, comm);
  MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_weights, (rank == 0)?partial_weights:NULL, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm);
#endif

  if (rank == 0)
  {
    printf("%d: total_count: %" sl_int_type_fmt " vs. %" sl_int_type_fmt "\n", rank, total_count, partial_counts[size]);
    d = 0.0;
    vmin = 1.0;
    vmax = 0.0;
    for (i = 0; i < size; ++i)
    {
/*      printf("%d: %" sl_int_type_fmt " %" sl_int_type_fmt " / %f - %" sl_int_type_fmt " / %f\n", rank, i, partial_counts[i], (double) partial_counts[i] / partial_counts[size], (partial_counts[size] / size) - partial_counts[i], fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])));*/
      d += fabs((partial_counts[size] / size) - partial_counts[i]);
      if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) < vmin) vmin = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size]));
      if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) > vmax) vmax = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size]));
    }
    printf("%d: min/max: %f / %f\n", rank, vmin, vmax);
    printf("%d: average_count: %" sl_int_type_fmt " - %f / %f\n", rank, partial_counts[size] / size, d / size, d / partial_counts[size]);

    printf("%d: total_weight: %f vs. %f\n", rank, total_weight, partial_weights[size]);
    d = 0.0;
    vmin = 1.0;
    vmax = 0.0;
    for (i = 0; i < size; ++i)
    {
/*      printf("%d: %" sl_int_type_fmt " %f / %f - %f / %f\n", rank, i, partial_weights[i], partial_weights[i] / partial_weights[size], (partial_weights[size] / size) - partial_weights[i], fabs(1.0 - (partial_weights[i] * size / partial_weights[size])));*/
      d += fabs((partial_weights[size] / size) - partial_weights[i]);
      if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) < vmin) vmin = fabs(1.0 - (partial_weights[i] * size / partial_weights[size]));
      if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) > vmax) vmax = fabs(1.0 - (partial_weights[i] * size / partial_weights[size]));
    }
    printf("%d: min/max: %f / %f\n", rank, vmin, vmax);
    printf("%d: average_weight: %f - %f / %f\n", rank, partial_weights[size] / size, d / size, d / partial_weights[size]);
  }
#endif

  rti_tstop(rti_tid_mpi_partition_radix2);

#if defined(TIMING_STATS) && defined(SL_USE_RTI_TIM)
  if (rank == 0)
  {
    printf("%d: mpi_partition_radix: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2));
    printf("%d: mpi_partition_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_sync));
    printf("%d: mpi_partition_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while));
    printf("%d: mpi_partition_radix:   count: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_count));
    printf("%d: mpi_partition_radix:   allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_allreduce));
    printf("%d: mpi_partition_radix:   round1: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1));
    printf("%d: mpi_partition_radix:     allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1_allgather));
    printf("%d: mpi_partition_radix:   exscan: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_exscan));
    printf("%d: mpi_partition_radix:   check: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_check));
    printf("%d: mpi_partition_radix:     pre: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_pre));
    printf("%d: mpi_partition_radix:     classes: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_classes));
    printf("%d: mpi_partition_radix:     final: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_final));
    printf("%d: mpi_partition_radix:     post: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_post));
  }
#endif

  return 0;
}
Ejemplo n.º 28
0
/*
 * One level of an in-place, most-significant-digit-first radix sort.
 *
 * The elements of `s` are split into classes by the `current width` top bits
 * of the bit range [rlow, rhigh]; each class is then sorted on the remaining
 * lower bits, either by this function again or by rs_rec_ma_db when the class
 * fits into `sx`.
 *
 *   s      elements to sort (permuted in place)
 *   sx     scratch elements; passed to elem_xchange as its third argument
 *          (presumably the exchange temporary -- confirm against the macro)
 *          and used as the second buffer by rs_rec_ma_db
 *   rhigh  highest key bit still to be sorted on
 *   rlow   lowest key bit to be sorted on
 *   rwidth maximum number of bits consumed per level
 *
 * Always returns 0.
 */
slint_t rs_rec_ma(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth) /* sl_func rs_rec_ma */
{
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t nclasses, bit_mask;

  slint_t cls, from, to, width, recurse;
  slint_t counts[max_nclasses];
  elements_t cursor, end, heads[max_nclasses];

  /* this level consumes at most rwidth bits, and never more than are left */
  width = xmin(rwidth, rhigh - rlow + 1);
  rhigh -= width - 1;

  nclasses = powof2_typed(width, slkey_pure_t);
  bit_mask = nclasses - 1;

  /* histogram: clear the counters, then count the elements of every class */
  for (cls = 0; cls < nclasses; ++cls) counts[cls] = 0;

  elem_assign_at(s, s->size, &end);
  elem_assign(s, &cursor);
  while (cursor.keys < end.keys)
  {
    ++counts[key_radix_key2class(key_purify(*cursor.keys), rhigh, bit_mask)];
    elem_inc(&cursor);
  }

  /* heads[c] = position where the next element of class c has to be placed */
  elem_assign(s, &heads[0]);
  for (cls = 1; cls < nclasses; ++cls) elem_assign_at(&heads[cls - 1], counts[cls - 1], &heads[cls]);

  /* permute in place: walk every class region, `end` is the end of the current region */
  elem_assign(s, &end);
  for (cls = 0; cls < nclasses; ++cls)
  {
    elem_add(&end, counts[cls]);

    for (elem_assign(&heads[cls], &cursor); cursor.keys < end.keys; elem_inc(&cursor))
    {
      from = key_radix_key2class(key_purify(*cursor.keys), rhigh, bit_mask);

      /* follow the chain of displaced elements until one belonging to this class arrives */
      while (from != cls)
      {
        to = key_radix_key2class(key_purify(*heads[from].keys), rhigh, bit_mask);

        /* an element that already sits in its own class is only stepped over */
        if (to != from) elem_xchange(&cursor, &heads[from], sx);

        elem_inc(&heads[from]);

        from = to;
      }
    }
  }

  /* the bits of this level are done */
  rhigh -= 1;

  /* no bits left -> nothing to refine */
  if (rhigh < rlow) return 0;

#ifdef SR_MA_INSERTSORT
  /* mask selecting the remaining bits [rlow, rhigh] for the insertion sort */
  bit_mask = 0;
  if (rhigh - rlow + 1 <= key_radix_high) bit_mask = powof2_typed(rhigh - rlow + 1, slkey_pure_t);
  bit_mask = (bit_mask - 1) << rlow;
#endif

  /* sort every class on the remaining bits */
  elem_assign(s, &cursor);
  for (cls = 0; cls < nclasses; ++cls)
  {
    cursor.size = counts[cls];

#ifdef SR_MA_INSERTSORT
    recurse = (cursor.size > sort_radix_threshold_rec);
#else
    recurse = (cursor.size > 1);
#endif

    if (recurse)
    {
      /* use the double-buffer variant as soon as the class fits into sx */
      if (cursor.size > sx->size) rs_rec_ma(&cursor, sx, rhigh, rlow, rwidth);
      else rs_rec_ma_db(&cursor, sx, rhigh, rlow, rwidth, 1);
    }
#ifdef SR_MA_INSERTSORT
    else if (cursor.size > 1) sort_insert_bmask_kernel(&cursor, sx, bit_mask);
#endif

    elem_add(&cursor, counts[cls]);
  }

  return 0;
}
Ejemplo n.º 29
0
// Runs the projected CG iteration (see the status messages below) on the
// system whose right-hand side is rhs_ and overwrites rhs_ with the computed
// iterate [x; y].
//
// rhs_ : in/out. Must be a SimpleVector of length n+m (checked by assert).
//        On every exit path the first n entries receive x and the last m
//        entries receive y.
//
// Side effects visible in this function:
//  - tmpVec1..tmpVec6 are allocated on first use and kept for later calls;
//  - the member maxit is overwritten with n/2+10.
//
// Termination codes held in the local `flag`:
//  0 converged (normr <= tolb), 1 iteration limit reached, 2 negative
//  curvature (p'Hp < 0), 3 rg became negative ("rounding error"),
//  4 stagnation (more than maxstagsteps steps without a new minimal residual).
void PCGSolver::solvefull( OoqpVector& rhs_ )
{
  SimpleVector& b = dynamic_cast<SimpleVector&>(rhs_);
  assert(n+m == b.length());

  int flag, imin; int stagsteps, maxstagsteps;
  double normr, normr_act, normrmin; 
  double alpha, beta, rg, pHp;

  double n2b  = b.twonorm();
  double tolb = n2b*tol;

  //work buffers are created lazily and reused by subsequent calls
  if(tmpVec1==NULL) tmpVec1=new double[n+m];
  if(tmpVec2==NULL) tmpVec2=new double[n+m];
  if(tmpVec3==NULL) tmpVec3=new double[n];
  if(tmpVec4==NULL) tmpVec4=new double[n];
  if(tmpVec5==NULL) tmpVec5=new double[n];
  if(tmpVec6==NULL) tmpVec6=new double[n+m];

  SimpleVector xy(tmpVec1, n+m);      //iterate
  SimpleVector auxnm(tmpVec2,n+m);      //auxiliary
  SimpleVector xmin(tmpVec3,n);   //minimal residual iterate
  SimpleVector g(tmpVec4,n);      //work vectors
  SimpleVector p(tmpVec5,n);
  SimpleVector res(tmpVec6, n+m);

  //the vectors below are views into the buffers above, not copies
  SimpleVector    x(& xy[0],n); //x-part of the iterate (first n entries)
  SimpleVector    y(& xy[n],m); //y-part of the iterate (last m entries)
  SimpleVector   rx(&res[0],n); //residual
  SimpleVector   ry(&res[n],m); //residual corresponding to last m eqn
  SimpleVector auxn(&auxnm[0],n);
  SimpleVector auxm(&auxnm[n],m);
  //////////////////////////////////////////////////////////////////
  // Starting procedure
  /////////////////////////////////////////////////////////////////

  //find starting point x satisfying Ax=b_2
  applyM1(0.0, xy, 1.0, b);

  //compute the x-residual for the starting point rx=Hx-b_1
  rx.copyFromArray(&b[0]);
  applyA(-1.0, rx, 1.0, x);

  //find y such that it minimizes ||r-A'y||_Ginv
  //this is done by a preconditioner solve with rhs=[rx;0]
  ry.setToZero();
  applyM1(0.0, auxnm, 1.0, res);
  y.copyFromArray(&auxnm[n]);

  //remove A'y from residual
  At->doIt(1.0, rx, -1.0, y);

  //initialize projected residual g=Pr and update p=-g
  ry.setToZero();
  applyM1(0.0, auxnm, 1.0, res);
  g.copyFromArray(&auxnm[0]);
  p.copyFrom(g); p.negate();

  normr=rx.twonorm();
  rg = rx.dotProductWith(g);

  xmin.copyFrom(x);
  flag=1; imin=0;

  //NOTE: this overwrites the member maxit on every call
  maxit=n/2+10;
  if(normr<tolb) {
    //initial guess is good enough
    for(int i=0; i<n; i++)   b[i]=x[i];
    for(int i=n; i<n+m; i++) b[i]=y[i-n];
    return;
  }
  stagsteps=0; maxstagsteps = 5; normrmin=normr;

  //////////////////////////////////////////////////////////////////
  // loop over maxit iterations
  //////////////////////////////////////////////////////////////////
  int ii=0; while(ii<maxit) {
    ii++;
    // compute Hp and p'Hp (Hp is a view on the first n entries of auxnm)
    SimpleVector Hp(&auxn[0], n);
    applyA(0.0, Hp, 1.0, p);
    pHp = p.dotProductWith(Hp);

    //check for negative curvature
    //NOTE(review): pHp==0 is not caught here and would make alpha inf/NaN
    if(pHp<0.0) { flag=2; break; }

    alpha = rg/pHp;
    
    //update x=x+alpha*p and r=r+alpha*H*p
    x.axpy(alpha,  p); rx.axpy(alpha,Hp);

    normr=rx.twonorm();
    ///////////////////////////////////////
    //convergence tests
    ///////////////////////////////////////
    if(normr<=tolb) {
      //compute actual residual (only reported, not used as a stopping test)
      SimpleVector rx_act(&auxnm[0], n);
      rx_act.copyFromArray(&b[0]);
      applyA(-1.0, rx_act, 1.0, x);
      normr_act=rx_act.twonorm(); 

      //if(normr_act/n2b<tolb) { flag=0; break; }
      { flag=0; break; }
    }
    
    //remember the iterate with the smallest residual seen so far
    if(normr<normrmin) { imin=ii; xmin.copyFrom(x); normrmin=normr; stagsteps=0;}
    else stagsteps++;
    //check for stagnation!!!
    if(stagsteps>maxstagsteps) { flag=4; break; }
    //------- end convergence tests -------

    //compute y that minimizes ||r-A'y||_Ginv
    ry.setToZero();
    applyM1(0.0, auxnm, 1.0, res);
    y.copyFromArray(&auxnm[n]);
    //subtract A'y from r
    At->doIt(1.0, rx, -1.0, y);

    //projected residual g=Pr
    ry.setToZero();
    applyM1(0.0, auxnm, 1.0, res);
    g.copyFromArray(&auxnm[0]);

    double rpgp = rx.dotProductWith(g);
    beta = rpgp/rg;
    
    //p = -g+beta*p
    p.scale(beta); p.axpy(-1.0, g);

    rg = rpgp;
 
    //rounding error
    if(rg<0.0) { flag=3; break; }
  }


  //////////////////////////////////////////////////////////
  // status/error output
  /////////////////////////////////////////////////////////
  if(flag==0) {
    double relres = normr_act/n2b;
    printf("CG converged: actual normResid=%g relResid=%g iter=%d\n", 
	   normr_act, relres, ii);

    b.setToZero();
    for(int i=0; i<n; i++)  b[i] = x[i];
    for(int i=n; i<m+n;i++) b[i] = y[i-n];
  } else {

    //on stagnation fall back to the iterate with the smallest residual
    if(flag==4) x.copyFrom(xmin);
    //compute actual residual (sign is opposite to the one used in the loop,
    //which does not matter because only its norm is used)
    SimpleVector rx_act(&auxnm[0], n);
    rx_act.copyFromArray(&b[0]);
    applyA(1.0, rx_act, -1.0, x);
    normr_act = rx_act.twonorm(); 

    for(int i=0; i<n; i++)  b[i] = x[i];
    for(int i=n; i<m+n;i++) b[i] = y[i-n];

    if(gOoqpPrintLevel>=1) {
      printf("Projected CG did NOT converge after %d iters. The solution from iter %d was returned.\n", ii, imin);
      //the format has four conversions, so four values must be passed:
      //error code, actual residual, relative actual residual, minimal residual
      printf("\t - Error code %d\n\t - Act res=%g\n\t - Rel res=%g %g\n\n", 
	     flag, normr_act, normr_act/n2b, normrmin);
    }

  }
  //b.copyFrom(x);
}
Ejemplo n.º 30
0
/*
 * One level of a most-significant-digit-first radix sort that distributes
 * the elements between two buffers instead of permuting them in place.
 *
 * The elements of `s` are copied into `sx`, grouped by the top bits of the
 * bit range [rlow, rhigh]. Every class is then sorted on the remaining bits
 * with the roles of the two buffers exchanged.
 *
 *   s         elements to sort
 *   sx        second buffer; the caller in rs_rec_ma only uses this function
 *             when s->size <= sx->size
 *   rhigh     highest key bit still to be sorted on
 *   rlow      lowest key bit to be sorted on
 *   rwidth    maximum number of bits consumed per level
 *   switchdb  non-zero if the sorted elements have to end up in `s`; zero if
 *             they may stay in `sx` (assuming elem_ncopy(src, dst, n) copies
 *             from its first to its second argument -- confirm against the
 *             macro definition)
 *
 * Always returns 0.
 */
slint_t rs_rec_ma_db(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth, slint_t switchdb) /* sl_func rs_rec_ma_db */
{
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t bit_mask, nclasses;

  slint_t i, j, current_width, c[max_nclasses];
  elements_t xi, xj, end, parts[max_nclasses];

  elem_assign_at(s, s->size, &end);

  /* this level consumes at most rwidth bits, and never more than are left */
  current_width = xmin(rwidth, rhigh - rlow + 1);
  rhigh -= current_width - 1;

  nclasses = powof2_typed(current_width, slkey_pure_t);
  bit_mask = nclasses - 1;


  /* zero all counter */
  for (i = 0; i < nclasses; i++) c[i] = 0;

  /* count the number of elements in every class */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_inc(&xi)) ++c[key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask)];

  /* compute the target of every class (the targets are located in sx) */
  elem_assign(sx, &parts[0]);
  for (i = 1; i < nclasses; i++) elem_assign_at(&parts[i - 1], c[i - 1], &parts[i]);

  /* split the elements: copy every element of s to the next free slot of its class in sx */
  elem_assign(s, &xi);
  elem_assign_at(s, s->size, &end);
  while (xi.keys < end.keys)
  {
    j = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);

    elem_copy(&xi, &parts[j]);

    elem_inc(&xi);
    elem_inc(&parts[j]);
  }

  /* the bits of this level are done */
  --rhigh;

  if (rhigh >= rlow)
  {
#ifdef SR_MA_INSERTSORT
    /* mask selecting the remaining bits [rlow, rhigh] for the insertion sort */
    bit_mask = 0;
    if (rhigh - rlow + 1 <= key_radix_high) bit_mask = powof2_typed(rhigh - rlow + 1, slkey_pure_t);
    bit_mask = (bit_mask - 1) << rlow;
#endif

    /* xi walks the classes in s, xj walks the same classes in sx (where the elements are now) */
    elem_assign(s, &xi);
    elem_assign(sx, &xj);
    for (i = 0; i < nclasses; i++)
    {
      xi.size = xj.size = c[i];

#ifdef SR_MA_INSERTSORT
      /* large classes: recurse with exchanged buffers, so the requested target buffer flips as well */
      if (c[i] > sort_radix_threshold_rec) rs_rec_ma_db(&xj, &xi, rhigh, rlow, rwidth, (!switchdb));
      else
      {
        /* small classes: sort inside sx, then copy back to s if the result is wanted there */
        if (c[i] > 1) sort_insert_bmask_kernel(&xj, &xi, bit_mask);
        if (switchdb) elem_ncopy(&xj, &xi, c[i]);
      }
#else
      if (c[i] > 1) rs_rec_ma_db(&xj, &xi, rhigh, rlow, rwidth, (!switchdb));
      /* a class that is not recursed into still has to be copied back, exactly as in the branch above */
      else if (switchdb) elem_ncopy(&xj, &xi, c[i]);
#endif

      /* advance both cursors to the next class (required in both configurations) */
      elem_add(&xi, c[i]);
      elem_add(&xj, c[i]);
    }

  } else elem_ncopy(sx, s, s->size); /* no bits left: copy the distributed elements from sx to s */

  return 0;
}