int n2a_event_host_check (int event_type __attribute__ ((__unused__)), void *data) { //logger(LG_DEBUG, "Event: event_service_check"); nebstruct_host_check_data *c = (nebstruct_host_check_data *) data; if (c->type == NEBTYPE_HOSTCHECK_PROCESSED) { //logger(LG_DEBUG, "HOSTCHECK_PROCESSED: %s", c->host_name); char *buffer = NULL, *key = NULL; size_t l = xstrlen(g_options.connector) + xstrlen(g_options.eventsource_name) + xstrlen(c->host_name) + 20; nebstruct_host_check_data_to_json(&buffer, c); // DO NOT FREE !!! xalloca(key, xmin(g_options.max_size, (int)l) * sizeof(char)); snprintf(key, xmin(g_options.max_size, (int)l), "%s.%s.check.component.%s", g_options.connector, g_options.eventsource_name, c->host_name); if (c_size == -10000 || c_size / 2 == 0) amqp_publish(key, buffer); else n2a_record_cache (key, buffer); xfree(buffer); } return 0; }
/*
 * Radix-sort the elements in s via rs_rec_ma.
 *
 *  s      elements to sort; NULL yields -1, fewer than two elements yields 0
 *  sx     auxiliary element storage; when NULL or of size < 1 a one-element
 *         temporary is allocated here and released before returning
 *  rhigh  highest radix bit; a negative value selects key_radix_high
 *  rlow   lowest radix bit; a negative value selects key_radix_low
 *  rwidth radix width; a value <= 0 selects sort_radix_width_default, and
 *         the width is capped at sort_radix_width_max
 *
 * Returns 0 on success, -1 if s is NULL.
 */
slint_t sort_radix_ma(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth) /* sl_proto, sl_func sort_radix_ma */
{
  elements_t _sx;

  if (s == NULL) return -1;

  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_radix);

  /* Fall back to a private scratch element when the caller supplied none.
     (A former "else if (sx->size < 1) return -1;" branch was unreachable,
     because this condition already covers sx->size < 1; it was removed.) */
  if (sx == NULL || sx->size < 1)
  {
    sx = &_sx;
    elements_alloc(sx, 1, SLCM_ALL);
  }

  /* Substitute defaults for unspecified radix parameters. */
  if (rhigh < 0) rhigh = key_radix_high;
  if (rlow < 0) rlow = key_radix_low;
  if (rwidth <= 0) rwidth = sort_radix_width_default;

  rs_rec_ma(s, sx, rhigh, rlow, xmin(rwidth, sort_radix_width_max));

  /* Only release the scratch element if it is the one allocated above. */
  if (sx == &_sx) elements_free(sx);

  rti_tstop(rti_tid_sort_radix);

  return 0;
}
/*
 * Apply the forward permutation perm to the elements in s.
 *
 *  s        elements to permute; NULL yields -1, fewer than two yields 0
 *  sx       auxiliary element storage; a one-element temporary is allocated
 *           (and freed again) when this is NULL or has size < 1
 *  perm     permutation array handed through to the worker routine
 *  offset   offset handed through to the worker routine
 *  mask_bit negative: use the unmasked worker; otherwise the masked worker
 *           is called with the mask 1L << min(mask_bit, bits(slint) - 1)
 *
 * Returns 0 on success, -1 if s is NULL.
 */
slint sort_permute_forward(elements_t *s, elements_t *sx, slint *perm, slint offset, slint mask_bit) /* sl_proto, sl_func sort_permute_forward */
{
  elements_t scratch;
  int own_scratch;

  if (s == NULL) return -1;

  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_permute_forward);

  /* Remember whether the scratch element is ours, so it can be freed below. */
  own_scratch = (sx == NULL || sx->size < 1);
  if (own_scratch)
  {
    sx = &scratch;
    elements_alloc(sx, 1, SLCM_ALL);
  }

  if (mask_bit >= 0)
    sort_permute_forward_masked(s, sx, perm, offset, 1L << xmin(mask_bit, (sizeof(slint) * 8) - 1));
  else
    sort_permute_forward_(s, sx, perm, offset);

  if (own_scratch) elements_free(sx);

  rti_tstop(rti_tid_sort_permute_forward);

  return 0;
}
bool SmoothConstrainedInterpolator::ProjectVelocity(const Config& x,Config& v) { constraint->PreEval(x); Matrix J; constraint->Jacobian(x,J); if(!xmin.empty()) { //look through active contraints, set that column to 0 for(int i=0;i<x.n;i++) { if(x(i)==xmin(i) || x(i) == xmax(i)) { v(i) = 0; for(int j=0;j<J.m;j++) J(j,i) = 0; } } } RobustSVD<Real> svd; bool res=svd.set(J); if(!res) { fprintf(stderr,"SmoothConstrainedInterpolator: Numerical error projecting velocity?\n"); return false; } Vector temp; svd.nullspaceComponent(v,temp); v -= temp; return true; }
void Instance::transformBoundingBox() { auto b = i->getBoundingBox(); BoundingBox bb; for(int i = 0; i <= RayTracer::getInstance()->maxTime; ++i) { std::set<double> x,y,z; auto m = makeMatrices(i); auto p = Vector(b.xmin(i),0,0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(b.xmax(i),0,0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,b.ymin(i),0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,b.ymax(i),0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,0,b.zmin(i)); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,0,b.zmax(i)); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); bb.xmin.addFrame(i, *x.begin()); bb.xmax.addFrame(i, *x.rbegin()); bb.ymin.addFrame(i, *y.begin()); bb.ymax.addFrame(i, *y.rbegin()); bb.zmin.addFrame(i, *z.begin()); bb.zmax.addFrame(i, *z.rbegin()); } bbox = bb; }
/*
 * Prepare the radix state of bm for the next binning round.
 *
 * The width of the current round (rcurrent) is the smaller of the
 * configured rwidth and the number of bits left between rlow and rhigh.
 * With a positive width, rhigh is lowered by (rcurrent - 1) and the number
 * of bins becomes powof2(rcurrent); otherwise rhigh is reduced by itself
 * and a single bin is used.  bit_mask is always nbins - 1.
 *
 * Always returns 0.
 */
slint_t binning_radix_pre(binning_t *bm) /* sl_proto, sl_func binning_radix_pre */
{
  bm->bd.radix.rcurrent = xmin(bm->bd.radix.rwidth, bm->bd.radix.rhigh - bm->bd.radix.rlow + 1);

  if (bm->bd.radix.rcurrent > 0)
  {
    bm->bd.radix.rhigh -= bm->bd.radix.rcurrent - 1;
    bm->nbins = powof2(bm->bd.radix.rcurrent);
  }
  else
  {
    /* No bits left to inspect. */
    bm->bd.radix.rhigh -= bm->bd.radix.rhigh;
    bm->nbins = 1;
  }

  bm->bd.radix.bit_mask = bm->nbins - 1;

  return 0;
}
int QDeclarativeDrag::qt_metacall(QMetaObject::Call _c, int _id, void **_a) { _id = QObject::qt_metacall(_c, _id, _a); if (_id < 0) return _id; if (_c == QMetaObject::InvokeMetaMethod) { if (_id < 8) qt_static_metacall(this, _c, _id, _a); _id -= 8; } #ifndef QT_NO_PROPERTIES else if (_c == QMetaObject::ReadProperty) { void *_v = _a[0]; switch (_id) { case 0: *reinterpret_cast< QGraphicsObject**>(_v) = target(); break; case 1: *reinterpret_cast< Axis*>(_v) = axis(); break; case 2: *reinterpret_cast< qreal*>(_v) = xmin(); break; case 3: *reinterpret_cast< qreal*>(_v) = xmax(); break; case 4: *reinterpret_cast< qreal*>(_v) = ymin(); break; case 5: *reinterpret_cast< qreal*>(_v) = ymax(); break; case 6: *reinterpret_cast< bool*>(_v) = active(); break; case 7: *reinterpret_cast< bool*>(_v) = filterChildren(); break; } _id -= 8; } else if (_c == QMetaObject::WriteProperty) { void *_v = _a[0]; switch (_id) { case 0: setTarget(*reinterpret_cast< QGraphicsObject**>(_v)); break; case 1: setAxis(*reinterpret_cast< Axis*>(_v)); break; case 2: setXmin(*reinterpret_cast< qreal*>(_v)); break; case 3: setXmax(*reinterpret_cast< qreal*>(_v)); break; case 4: setYmin(*reinterpret_cast< qreal*>(_v)); break; case 5: setYmax(*reinterpret_cast< qreal*>(_v)); break; case 7: setFilterChildren(*reinterpret_cast< bool*>(_v)); break; } _id -= 8; } else if (_c == QMetaObject::ResetProperty) { switch (_id) { case 0: resetTarget(); break; } _id -= 8; } else if (_c == QMetaObject::QueryPropertyDesignable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyScriptable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyStored) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyEditable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyUser) { _id -= 8; } #endif // QT_NO_PROPERTIES return _id; }
/*
 * Return a newly allocated copy of a slice of src.
 *
 *  src    NUL-terminated source string; NULL yields NULL
 *  begin  start index; a negative value counts back from the end of src
 *         (clamped to 0), a value beyond the end is clamped to the end
 *  len    maximum number of characters to copy; -1 (or any other negative
 *         value) or a value larger than strlen(src) means "up to
 *         strlen(src)"
 *
 * The result is obtained from malloc_w() and holds min(strlen(src), len) + 1
 * bytes; it is always NUL-terminated.  Returns NULL if src is NULL or the
 * allocation fails.
 */
static char *xstrsub(const char *src, int begin, int len)
{
	size_t s_full;
	size_t count;
	size_t start;
	char *ret;

	if(!src) return NULL;

	s_full=strlen(src);

	/* Clamp the requested length explicitly.  Negative lengths used to be
	   handled only by accident of the signed -> unsigned conversion in the
	   min comparison; the outcome (whole length) is kept. */
	if(len<0 || (size_t)len>s_full) count=s_full;
	else count=(size_t)len;

	/* Resolve the start index, clamped to [0, s_full]. */
	if(begin<0)
	{
		int from_end=(int)s_full+begin;
		start=from_end>0?(size_t)from_end:0;
	}
	else
	{
		start=(size_t)begin<s_full?(size_t)begin:s_full;
	}

	if(!(ret=(char *)malloc_w((count+1)*sizeof(char), __func__)))
		return NULL;

	/* strncpy stops at the terminator of src and zero-fills the remainder,
	   so this never reads past the end of src. */
	strncpy(ret, src+start, count);
	ret[count]='\0';

	return ret;
}
float size() const { if (xmax() < xmin() || ymax() < ymin()) { // If box is invalid (e.g. xmax < xmin or ymax < ymin), return 0. return 0.0f; } else { return width() * height(); } }
/// Centres the view horizontally on [a, b] and adjusts the zoom level so the
/// visible x-extent (xmax() - xmin()) matches the requested span b - a as
/// closely as the integer zoom steps allow.
///
/// If the view is currently wider than the span, the zoom level is
/// decremented until it no longer is, then bumped back up once if that
/// overshot.  Otherwise the zoom level is incremented until the view is no
/// longer narrower than the span.  Finishes with update().
void CartesianWidget::setXRange(double a, double b)
{
    const double wanted = b - a;

    setCenter((a+b)/2.0, centerY());

    if (!(xmax() - xmin() > wanted)) {
        // View is not wider than requested: step the zoom level up.
        while (xmax() - xmin() < wanted)
            setZoomLevel(zoomLevel() + 1);
    } else {
        // View is wider than requested: step the zoom level down ...
        while (xmax() - xmin() > wanted)
            setZoomLevel(zoomLevel() - 1);

        // ... and undo the last step if the view became too narrow.
        if (xmax() - xmin() < wanted)
            setZoomLevel(zoomLevel() + 1);
    }

    update();
}
void XArrayLinear<T>::Resize( int newSize )
{
	// Preconditions, checked through XBREAK (presumably a debug-break
	// assertion -- confirm against its definition): an array exists, the
	// element count does not exceed the capacity, the new size is positive,
	// and the new size is not at or below both the capacity and the count.
	XBREAK( m_pArray == NULL );
	XBREAK( m_nNum > m_nMax );
	XBREAK( newSize <= 0 );
	XBREAK( newSize <= m_nMax && newSize <= m_nNum );

	T *pResized = new T[ newSize ];

	// Move the contents over to the new buffer: as many slots as fit,
	// counted against the old capacity.
	const int nCopy = xmin( newSize, m_nMax );
	int idx = 0;
	while( idx < nCopy )
	{
		pResized[ idx ] = m_pArray[ idx ];
		++idx;
	}

	// Swap the new buffer in and record the new capacity.
	SAFE_DELETE_ARRAY( m_pArray );
	m_pArray = pResized;
	m_nMax = newSize;
}
/*
 * Sorting-network callback: single even stage.
 *
 *  size   number of participants
 *  rank   rank of the caller
 *  stage  stage being queried; a negative value asks for the stage count
 *  snp    unused here
 *  up     if non-NULL, receives 0
 *
 * Returns -1 ("finished") when rank >= size or stage is past the last
 * stage, the number of stages (1) when stage < 0, and otherwise the partner
 * rank: rank + 1 for odd ranks, rank - 1 for the others, clamped to
 * [0, size - 1].
 */
slint sn_even(slint size, slint rank, slint stage, void *snp, slint *up) /* sl_proto, sl_func sn_even */
{
  const slint stages = 1;
  slint partner;

  /* a rank outside the network is done immediately */
  if (rank >= size) return -1;

  /* negative stage: report how many stages exist */
  if (stage < 0) return stages;

  /* a stage beyond the last one: done */
  if (stage >= stages) return -1;

  if (up != NULL) *up = 0;

  if (rank % 2 == 1) partner = rank + 1;
  else partner = rank - 1;

  return xmax(0, xmin(size - 1, partner));
}
/// Default constructor: zero-initialises every member, then configures the
/// grid through set() as a single cell (1 x 1 x 1) spanning the unit cube
/// from (0,0,0) to (1,1,1).
AccelerationGrid::AccelerationGrid() :
   m_cells(0,0,0),
   m_elementidxs(0),
   m_elementxmins(0),
   m_elementxmaxs(0),
   m_elementquery(0),
   m_lastquery(0),
   m_gridxmin(0,0,0),
   m_gridxmax(0,0,0),
   m_cellsize(0,0,0),
   m_invcellsize(0,0,0)
{
   Vec3st oneCell(1,1,1);
   Vec3d lowerCorner(0,0,0);
   Vec3d upperCorner(1,1,1);

   set(oneCell, lowerCorner, upperCorner);
}
/// Computes the maximum flow from `start` to `finish`.
///
/// @param matrix  n x n capacity matrix; matrix[u][v] > 0 means an edge
///                u -> v with that capacity.  The matrix is modified in
///                place and holds the residual capacities on return.
/// @param n       number of vertices
/// @param start   source vertex index
/// @param finish  sink vertex index
/// @return        the maximum flow value; 0 if the arguments are invalid
///                (null matrix, n <= 0, index out of range) or if
///                start == finish
///
/// Augmenting paths are found with a breadth-first search.  Fixes over the
/// previous version: the bottleneck started at -1, so the minimum over the
/// path was always -1 and every "augmentation" added capacity instead of
/// consuming it (the loop never terminated); start == finish looped forever;
/// the search pushed and popped at the same end of the deque.
int max_flow(int** matrix, int n, int start, int finish)
{
    if (!matrix || n <= 0 ||
        start < 0 || start >= n ||
        finish < 0 || finish >= n ||
        start == finish) {
        return 0;
    }

    int maxFlow = 0;

    // std::deque is used for the bookkeeping arrays because it is already a
    // dependency of this function; this replaces the non-standard
    // variable-length arrays without requiring a new header.
    std::deque<int> from(n, -1);
    std::deque<bool> visited(n, false);

    for (;;) {
        // Breadth-first search for a path with spare capacity.
        visited.assign(n, false);
        std::deque<int> pointQueue;
        pointQueue.push_back(start);
        visited[start] = true;

        while (!pointQueue.empty() && !visited[finish]) {
            const int curPos = pointQueue.front();
            pointQueue.pop_front();
            for (int i = 0; i < n; i++) {
                if (!visited[i] && matrix[curPos][i] > 0) {
                    from[i] = curPos;
                    visited[i] = true;
                    pointQueue.push_back(i);
                }
            }
        }

        // No augmenting path left: the flow is maximal.
        if (!visited[finish]) {
            break;
        }

        // Bottleneck = smallest residual capacity along the path, seeded
        // from the last edge so that no sentinel value is needed.
        int maxPath = matrix[from[finish]][finish];
        for (int v = from[finish]; v != start; v = from[v]) {
            const int cap = matrix[from[v]][v];
            if (cap < maxPath) {
                maxPath = cap;
            }
        }

        // Push the bottleneck along the path and update the residual graph.
        for (int v = finish; v != start; v = from[v]) {
            matrix[from[v]][v] -= maxPath;
            matrix[v][from[v]] += maxPath;
        }

        maxFlow += maxPath;
    }

    return maxFlow;
}
// --------------------------------------------------------------------------- // // --------------------------------------------------------------------------- TInt CSvgStyleElementImpl::GetAttributeFloat( const TInt aNameId, TFloatFixPt& aValue ) { switch ( aNameId ) { case KAtrRefX: { TFloatFixPt xmin( KMAXFLOATFIX ), x; // 0x7fff is the maximum integer in TFixPt CSvgElementImpl*lNewElement = ( CSvgElementImpl* ) FirstChild(); while ( lNewElement != NULL ) { lNewElement->GetAttributeFloat( KAtrRefX, x ); if ( x < xmin ) xmin = x; lNewElement = ( CSvgElementImpl * ) lNewElement->NextSibling(); } aValue = xmin; } break; case KAtrRefY: { TFloatFixPt ymin( KMAXFLOATFIX ), y; // 0x7fff is the maximum integer in TFixPt CSvgElementImpl*lNewElement = ( CSvgElementImpl* ) FirstChild(); while ( lNewElement != NULL ) { lNewElement->GetAttributeFloat( KAtrRefY, y ); if ( y < ymin ) ymin = y; lNewElement = ( CSvgElementImpl * ) lNewElement->NextSibling(); } aValue = ymin; } break; default: return CSvgElementImpl::GetAttributeFloat( aNameId, aValue ); } return KErrNone; }
/*
 * Radix-sort the elements in s via rs_rec_af.
 *
 *  s      elements to sort; NULL yields -1, fewer than two elements yields 0
 *  sx     auxiliary element storage; a one-element temporary is allocated
 *         (and freed again) when this is NULL or has size < 1
 *  rhigh  highest radix bit; a negative value selects key_radix_high
 *  rlow   lowest radix bit; a negative value selects key_radix_low
 *  rwidth radix width; a value <= 0 selects sort_radix_width_default, and
 *         the width is capped at sort_radix_width_max
 *
 * When insertsort_finalize is defined, an insertion-sort pass follows if
 * sort_radix_threshold_rec > 1 and the finalize flag is still set after the
 * recursion (the flag starts at 0 under insertsort_finalize_adaptive, at 1
 * otherwise).
 *
 * Returns 0 on success, -1 if s is NULL.
 */
slint sort_radix_af(elements_t *s, elements_t *sx, slint rhigh, slint rlow, slint rwidth) /* sl_proto, sl_func sort_radix_af */
{
  elements_t scratch;
  int own_scratch;
  slint finalize = 1;

#ifdef insertsort_finalize_adaptive
  finalize = 0;
#endif /* insertsort_finalize_adaptive */

  if (s == NULL) return -1;

  if (s->size < 2) return 0;

  rti_tstart(rti_tid_sort_radix);

  /* Remember whether the scratch element is ours, so it can be freed below. */
  own_scratch = (sx == NULL || sx->size < 1);
  if (own_scratch)
  {
    sx = &scratch;
    elements_alloc(sx, 1, SLCM_ALL);
  }

  /* Substitute defaults for unspecified radix parameters. */
  rhigh = (rhigh < 0)?key_radix_high:rhigh;
  rlow = (rlow < 0)?key_radix_low:rlow;
  rwidth = (rwidth <= 0)?sort_radix_width_default:rwidth;

  rs_rec_af(s, sx, rhigh, rlow, xmin(rwidth, sort_radix_width_max), &finalize);

#ifdef insertsort_finalize
  if (sort_radix_threshold_rec > 1 && finalize) rs_rec_insertsort_af(s, sx, rhigh, rlow);
#endif /* insertsort_finalize */

  if (own_scratch) elements_free(sx);

  rti_tstop(rti_tid_sort_radix);

  return 0;
}
//------------------------------------------------------------------------------------------------------------------------------------ // called when we want to draw the 3D data in our app. //------------------------------------------------------------------------------------------------------------------------------------ void draw3D() { const float DEG_TO_RAD = PI / 180.0f; const Vec3 xAxis(1.0f, 0, 0); const Vec3 yAxis(0, 1.0f, 0); translate(0, 0, -g_zoom); translate(g_tx, g_ty, 0); rotate(g_rotx * DEG_TO_RAD, xAxis); rotate(g_roty * DEG_TO_RAD, yAxis); // draw the grid on the floor setColour(0.25f, 0.25f, 0.25f); for(float i = -10.0f; i <= 10.1f; i += 1.0f) { Vec3 zmin(i, 0, -10); Vec3 zmax(i, 0, 10); Vec3 xmin(-10, 0, i); Vec3 xmax(10, 0, i); drawLine(xmin, xmax); drawLine(zmin, zmax); } }
/// Scales this box by the given image dimensions.
///
/// The minimum corner is (xmin() * width, ymin() * height); the maximum
/// corner is (xmax() * width - 1, ymax() * height - 1).
nervana::boundingbox::box unnormalize(float width, float height)
{
    const auto left   = xmin() * width;
    const auto top    = ymin() * height;
    const auto right  = xmax() * width - 1;
    const auto bottom = ymax() * height - 1;

    return nervana::boundingbox::box(left, top, right, bottom);
}
int main(int argc, char **argv) { int c = 0; long i_start_arg = 1; long i_end_arg = N; int i_start = 1; int i_end = N; mpfr_fn sin_fn = 0; mpfr_fn cos_fn = 0; for (int k = 0; k < argc; ++k) { printf("%s ", argv[k]); } printf("\n"); while ((c = getopt(argc, argv, "i:j:f:")) != -1) { switch (c) { case 'i': errno = 0; i_start_arg = strtoll(optarg, 0, 0); if (errno) { fprintf(stderr, "bad start index %s\n", optarg); return 1; } break; case 'j': errno = 0; i_end_arg = strtoll(optarg, 0, 0); if (errno) { fprintf(stderr, "bad end index %s\n", optarg); return 1; } break; case 'f': if (!strcmp(optarg, "sin")) { sin_fn = mpfr_sin; cos_fn = mpfr_cos; } else if (!strcmp(optarg, "tan")) { sin_fn = mpfr_tan; cos_fn = mpfr_cot; } else { fprintf(stderr, "unknown function %s\n", optarg); return 1; } break; default: usage(); break; } } if (i_start_arg <= 0 || i_end_arg > N) { printf("truncating start to (0, %d]\n", N); i_start_arg = xmin(xmax(i_start_arg, 1), N); } if (i_end_arg <= 0 || i_end_arg > N) { printf("truncating end to (0, %d]\n", N); i_end_arg = xmin(xmax(i_end_arg, 1), N); } i_start = i_start_arg; i_end = i_end_arg; if (!sin_fn || !cos_fn) { fprintf(stderr, "-f required\n"); return 1; } for (int i = i_start; i <= i_end; ++i) { if (find_triple_64(i, 11, 20, sin_fn, cos_fn) < 0) { /* This indicates you should drop the range limitations on r, re-run, and come back in a week. */ printf("CANNOT FIND SUITABLE CANDIDATE FOR i = %03d\n", i); } } return 0; }
/*
 * Decide how many elements of the given bin are assigned to splitter s and
 * account for them.
 *
 *  bm      binning state; bm->doweights selects the weighted path when
 *          elem_weight is compiled in
 *  bin     the bin being finalized (bin->s elements, bin->weight)
 *  dcw     remaining demand: a count in the unweighted path, a weight in
 *          the weighted path
 *  lc_min  in the weighted path, the scan does not stop before this many
 *          elements have been taken
 *  lc_max  in the weighted path, at most this many elements are taken
 *  lcw     accumulators: lcw[0] receives the taken count, lcw[1] the taken
 *          weight (weighted path only)
 *  sp, s   sp->displs[s] is advanced by the taken count
 *
 * Returns r: in the weighted path 1 if the bin was not taken as a whole,
 * else 0; in the unweighted path (lc >= (slint_t) dcw).
 */
slint_t binning_radix_finalize(binning_t *bm, bin_t *bin, slweight_t dcw, slint_t lc_min, slint_t lc_max, slweight_t *lcw, splitter_t *sp, slint_t s) /* sl_proto, sl_func binning_radix_finalize */
{
  slint_t lc, r;
#ifdef elem_weight
  elements_t xi, end;
  slweight_t lw;
#endif

  SL_TRACE_IF(BR_TRACE_IF, "bin size: %" slint_fmt ", dcw = %" slweight_fmt ", lc: %" slint_fmt " - %" slint_fmt ", lcw[0] = %" slweight_fmt, bin->s.size, dcw, lc_min, lc_max, lcw[0]);
#ifdef elem_weight
  if (bm->doweights)
    SL_TRACE_IF(BR_TRACE_IF, "bin weight: %" slweight_fmt ", dcw = %" slweight_fmt ", lc: %" slint_fmt " - %" slint_fmt ", lcw[1] = %" slweight_fmt, bin->weight, dcw, lc_min, lc_max, lcw[1]);
#endif

  r = 0;

#ifdef elem_weight
  if (bm->doweights)
  {
    lc = 0;
    lw = 0.0;

    /* Take the whole bin if it is no larger than lc_min, or if its weight
       fits into the demand and its size does not exceed lc_max. */
    if (bin->s.size <= lc_min || (dcw >= bin->weight && bin->s.size <= lc_max))
    {
      lc = bin->s.size;
      lw = bin->weight;

    } else
    {
      if (0 < lc_max)
      {
        elem_assign_at(&bin->s, bin->s.size, &end);

        /* lw counts down the remaining weight demand while scanning. */
        lw = dcw;

        for (elem_assign(&bin->s, &xi); xi.keys < end.keys; elem_inc(&xi))
        {
          ++lc;
          lw -= elem_weight(&xi, 0);

          /* The first lc_min elements are taken unconditionally. */
          if (lc <= lc_min) continue;

          /* This element overshoots the demand or the count limit:
             undo it and stop. */
          if (lw < 0.0 || lc > lc_max)
          {
            lw += elem_weight(&xi, 0);
            --lc;
            break;
          }
        }

        /* Convert the remaining demand into the weight actually taken. */
        lw = dcw - lw;
      }

      r = 1;
    }

  } else
#endif
  {
    /* Unweighted: take as many elements as demanded, at most the whole bin. */
    lc = xmin(dcw, bin->s.size);

    r = (lc >= (slint_t) dcw);
  }

  lcw[0] += lc;
  SL_TRACE_IF(BR_TRACE_IF, "lcw[0] = %" slweight_fmt " + %" slint_fmt " = %" slweight_fmt, lcw[0] - lc, lc, lcw[0]);
#ifdef elem_weight
  if (bm->doweights)
  {
    lcw[1] += lw;
    SL_TRACE_IF(BR_TRACE_IF, "lcw[1] = %" slweight_fmt " + %" slweight_fmt " = %" slweight_fmt, lcw[1] - lw, lw, lcw[1]);
  }
#endif

  sp->displs[s] += lc;

  SL_TRACE_IF(BR_TRACE_IF, "displs[%" slint_fmt "] += %" slint_fmt " = %d", s, lc, sp->displs[s]);

  return r;
}
// Fills vertices.storage / vertices.sorted from the input polygon, drops
// consecutive points that map to the same fixed-point position, classifies
// the edge before and after every kept vertex, sorts the vertex pointers
// with compareVertex, and returns the bounding rectangle of the points that
// were examined.
//
// points         input polygon.  The loop reads points[i+1] for every
//                i < vertices.nPoints, so the array must hold at least
//                vertices.nPoints + 1 entries (presumably the closing point
//                repeats the first one -- confirm against the caller).
// maxActiveEdges out: reset to 0, then increased by 2 for every vertex that
//                has a starting edge and no ending edge.
//
// vertices.nPoints is overwritten with the number of vertices kept.
QRectF QTessellatorPrivate::collectAndSortVertices(const QPointF *points, int *maxActiveEdges)
{
    *maxActiveEdges = 0;
    Vertex *v = vertices.storage;
    Vertex **vv = vertices.sorted;

    // running bounding box, seeded with the first point
    qreal xmin(points[0].x());
    qreal xmax(points[0].x());
    qreal ymin(points[0].y());
    qreal ymax(points[0].y());

    // collect vertex data
    Q27Dot5 y_prev = FloatToQ27Dot5(points[vertices.nPoints-1].y());
    Q27Dot5 x_next = FloatToQ27Dot5(points[0].x());
    Q27Dot5 y_next = FloatToQ27Dot5(points[0].y());
    int j = 0;   // number of vertices kept so far
    int i = 0;   // index of the input point being processed
    while (i < vertices.nPoints) {
        Q27Dot5 y_curr = y_next;

        *vv = v;

        v->x = x_next;
        v->y = y_next;
        v->flags = 0;

    // re-entered via goto while successive points coincide with *v
    next_point:

        xmin = qMin(xmin, points[i+1].x());
        xmax = qMax(xmax, points[i+1].x());
        ymin = qMin(ymin, points[i+1].y());
        ymax = qMax(ymax, points[i+1].y());

        y_next = FloatToQ27Dot5(points[i+1].y());
        x_next = FloatToQ27Dot5(points[i+1].x());

        // skip vertices on top of each other
        if (v->x == x_next && v->y == y_next) {
            ++i;
            if (i < vertices.nPoints)
                goto next_point;
            // Ran out of input while skipping duplicates: the current vertex
            // is not kept (j is not incremented).  Recompute the "line
            // before" flags of the first stored vertex from y_prev / y_curr.
            Vertex *v0 = vertices.storage;
            v0->flags &= ~(LineBeforeStarts|LineBeforeEnds|LineBeforeHorizontal);
            if (y_prev < y_curr)
                v0->flags |= LineBeforeEnds;
            else if (y_prev > y_curr)
                v0->flags |= LineBeforeStarts;
            else
                v0->flags |= LineBeforeHorizontal;
            if ((v0->flags & (LineBeforeStarts|LineAfterStarts))
                && !(v0->flags & (LineAfterEnds|LineBeforeEnds)))
                *maxActiveEdges += 2;
            break;
        }

        // classify the edge arriving at this vertex ...
        if (y_prev < y_curr)
            v->flags |= LineBeforeEnds;
        else if (y_prev > y_curr)
            v->flags |= LineBeforeStarts;
        else
            v->flags |= LineBeforeHorizontal;

        // ... and the edge leaving it
        if (y_curr < y_next)
            v->flags |= LineAfterStarts;
        else if (y_curr > y_next)
            v->flags |= LineAfterEnds;
        else
            v->flags |= LineAfterHorizontal;
        // ### could probably get better limit by looping over sorted list and counting down on ending edges
        if ((v->flags & (LineBeforeStarts|LineAfterStarts))
            && !(v->flags & (LineAfterEnds|LineBeforeEnds)))
            *maxActiveEdges += 2;
        y_prev = y_curr;
        ++v;
        ++vv;
        ++j;
        ++i;
    }
    vertices.nPoints = j;

    QDEBUG() << "maxActiveEdges=" << *maxActiveEdges;
    vv = vertices.sorted;
    qSort(vv, vv + vertices.nPoints, compareVertex);

    return QRectF(xmin, ymin, xmax-xmin, ymax-ymin);
}
/*
 * Search for a double xi near pi*i/(4N) whose cos_fn(xi) and sin_fn(xi)
 * are both unusually close to a double, and print the best triple found.
 *
 *  i               table index; the search starts at pi*i/(4N)
 *  min_leeway      minimum number of extra bits (beyond the 52 mantissa
 *                  bits) by which the exponent of the rounding residual
 *                  must sit below the exponent of the rounded value, for
 *                  both functions
 *  perfect_leeway  stop searching as soon as a candidate reaches this many
 *                  extra bits
 *  sin_fn, cos_fn  MPFR functions to evaluate
 *
 * Candidates are visited by stepping the bit pattern of the starting double
 * by +r / -r alternately with growing r.  The search gives up once r
 * exceeds 1 << 29 or the candidate passes pi*(i+1)/(4N).
 *
 * Returns >= zero iff successful: 0 after printing a triple whose leeway
 * exceeds min_leeway, -1 otherwise (including when a residual exponent of
 * -1024 is encountered).
 *
 * The MPFR temporaries are released on every exit path; previously they
 * were initialised but never cleared.
 */
static int find_triple_64(int i, int min_leeway, int perfect_leeway, mpfr_fn
                          sin_fn, mpfr_fn cos_fn)
{
        /*
           Using mpfr is not entirely overkill for this; [Lut95]
           includes PASCAL fragments that use almost entirely integer
           arithmetic... but the error term in that only handles
           up to 13 extra bits of zeroes or so. We proudly boast
           at least 16 bits of extra zeroes in all cases.
         */
        mpfr_t xi;
        mpfr_t xip1;
        mpfr_t cos;
        mpfr_t sin;
        double xip1_d;
        double t;
        uint64_t sin_u;
        uint64_t cos_u;
        int e1;
        int e2;
        uint64_t xip1_u;
        double xi_initial;
        uint64_t xi_initial_u;
        double xi_current;
        uint64_t xi_current_u;
        long int r = 0;
        long int best_r = 0;
        int sgn = 1;
        int ml = min_leeway;
        int best_l = 0;
        uint64_t best_xi_u = 0;
        uint64_t best_sin_u = 0;
        uint64_t best_cos_u = 0;
        time_t start;
        time_t end;
        int ret = -1;

        start = time(0);
        mpfr_init2(xi, 100);
        mpfr_init2(xip1, 100);
        mpfr_init2(cos, 100);
        mpfr_init2(sin, 100);

        /* start out at xi = πi/(4N) */
        mpfr_const_pi(xi, MPFR_RNDN);
        mpfr_mul_si(xip1, xi, (long int) (i + 1), MPFR_RNDN);
        mpfr_mul_si(xi, xi, (long int) i, MPFR_RNDN);
        mpfr_div_si(xi, xi, (long int) 4 * N, MPFR_RNDN);
        mpfr_div_si(xip1, xip1, (long int) 4 * N, MPFR_RNDN);
        xip1_d = mpfr_get_d(xip1, MPFR_RNDN);
        xip1_u = FLT64_TO_UINT64(xip1_d);
        xi_initial = mpfr_get_d(xi, MPFR_RNDN);
        xi_initial_u = FLT64_TO_UINT64(xi_initial);

        while (1) {
                xi_current_u = xi_initial_u + (sgn * r);
                xi_current = UINT64_TO_FLT64(xi_current_u);
                mpfr_set_d(xi, xi_current, MPFR_RNDN);

                /* Test if cos(xi) has enough zeroes */
                cos_fn(cos, xi, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                cos_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                mpfr_sub_d(cos, cos, t, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {
                        /* Too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                ml = xmax(min_leeway, e1 - e2 - 52);

                /* Test if sin(xi) has enough zeroes */
                sin_fn(sin, xi, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                sin_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                mpfr_sub_d(sin, sin, t, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {
                        /* Too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                /* The leeway of the pair is the weaker of the two. */
                ml = xmin(ml, e1 - e2 - 52);

                /* Hurrah, this is valid */
                if (ml > best_l) {
                        best_l = ml;
                        best_xi_u = xi_current_u;
                        best_cos_u = cos_u;
                        best_sin_u = sin_u;
                        best_r = sgn * r;

                        /* If this is super-good, don't bother finding more */
                        if (best_l >= perfect_leeway) {
                                break;
                        }
                }

inc:

                /* Increment: flip the direction, widening after each pair. */
                sgn *= -1;

                if (sgn < 0) {
                        r++;
                } else if (r > (1 << 29) ||
                           xi_current_u > xip1_u) {
                        /*
                           This is taking too long, give up looking
                           for perfection and take the best we've
                           got. A sweep of 1 << 28 finishes in ~60
                           hrs on my personal machine as I write
                           this.
                         */
                        break;
                }
        }

        end = time(0);

        if (best_l > min_leeway) {
                printf(
                        "(%#018lx, %#018lx, %#018lx), /* i = %03d, l = %02d, r = %010ld, t = %ld */ \n",
                        best_xi_u, best_cos_u, best_sin_u, i, best_l, best_r,
                        end - start);
                ret = 0;
        }

done:
        /* Release the MPFR temporaries on every exit path. */
        mpfr_clear(xi);
        mpfr_clear(xip1);
        mpfr_clear(cos);
        mpfr_clear(sin);

        return ret;
}
//--------------------------------------------------------- DVec& CS_PCG::solve(const DVec& rhs, double tol, int maxit) //--------------------------------------------------------- { // Use a preconditioned Conjugate Gradient method // to return an iterative solution to: x = A\rhs. // // 1. permute rhs // 2. solve using pcg // 3. unpermute result #if (APPLY_PERM) m_permute = true; #else m_permute = false; #endif // check system if (!m_factor || !L.ok()) { umERROR("CS_PCG::solve", "cholinc factor not ready."); } // store user args m_tol=tol; m_maxit=maxit; // store permuted rhs in pb int n=rhs.size(); pb.resize(n); if (m_permute) { CS_ipvec(this->pinv, rhs, pb, n); // pb = P*rhs } else { pb = rhs; // pb = rhs } if (!pb.ok()) { umERROR("CS_PCG::solve", "failed to permute rhs"); } if (this->L.n != n) { umERROR("CS_PCG::solve", "rhs not compatible"); } //--------------------------------------------- // When used during time-dependent simulations, // set the initial solution vector to zero, but // reuse previous solution on subsequent calls. 
//--------------------------------------------- // work with permuted px = P(x), // return unpermuted x = P(px), px.resize(n, false); // false -> don't bother initialising if (!m_oldsol) { px.fill(0.0); // initial guess is zero vector x.resize(n); // allocate return vector } else { if (m_permute) { // reapply permutation and use old solution as inital guess CS_ipvec(this->pinv, x, px, n); // px = P*x } else { px = x; // px = x } } if (!px.ok() || !x.ok()) { umERROR("CS_PCG::solve", "out of memory"); } // check parameters if (m_tol<=0.0) { m_tol = 1e-6; umWARNING("pcg", "resetting tol to %g (was %g).", m_tol, tol); } if (m_maxit>n) { m_maxit=std::min(n,20); umWARNING("pcg", "setting maxit to %d (was %d).", m_maxit, maxit); } // Check for all zero right hand side vector => all zero solution double n2b = pb.norm2(); // Norm of rhs vector, b if (0.0 == n2b) { // if rhs vector is all zeros x.resize(n,true,0.0); // then solution is all zeros m_flag = 0; // a valid solution has been obtained m_relres = 0; // the relative residual is actually 0/0 m_iter = 0; // no iterations need be performed m_resvec = 0; // resvec(1) = norm(b-A*x) = norm(0) //if (m_verbose) {itermsg("pcg", m_tol,m_maxit,0,m_flag,m_iter,NaN);} return x; } // local variables DVec xmin("xmin"), r("r"), z("z"), p("p"), q("q"), b_Ax("b-Ax"); double tolb=0.0,normr=0.0,normrmin=0.0,rho=0.0,rho1=0.0,pq=0.0; double alpha=0.0,beta=0.0; int i=0, imin=0; // IVec stagtest(n, "stagtest"), ind("ind"); //------------------------------------------------------- // Set up for pcg method //------------------------------------------------------- m_flag = 1; imin = 0; // iteration at which xmin was computed xmin = px; // iterate which has minimal residual so far tolb = m_tol * n2b; // relative tolerance r = pb - A*px; normr = r.norm2(); // norm of residual if (normr <= tolb) { m_flag = 0; // initial guess "x0" was good enough. 
m_relres = normr / n2b; // since we have made no changes to x, m_iter = 0; // just return old x without permuting m_resvec = normr; //if (m_verbose) {itermsg("pcg", m_tol,m_maxit,0,m_flag,m_iter,relres);} //CS_pvec(this->pinv, px, x, n); // unpermute solution m_oldsol = true; return x; } m_resvec.resize(m_maxit+1); // Preallocate vector for norm of residuals m_resvec(1) = normr; // resvec(1) = norm(b-A*x0) normrmin = normr; // Norm of minimum residual rho = 1.0; bool stag = false; // stagnation: flag failure to converge bool bOk = true; // stagnation: flag failure to converge //------------------------------------------------------- // loop for maxit iters, unless convergence or failure: //------------------------------------------------------- for (i=1; i<=m_maxit; ++i) { // apply cholinc preconditioner z = solve_LLT(r); // z = LLT\r //bOk = solve_LLT(r,z); // z = LLT\r if (isInf(z)) //if (!bOk) { m_flag = 2; break; } rho1=rho; rho=inner(r,z); if ((0.0==rho) || isinf(rho)) { m_flag = 4; break; } if (1 == i) { p = z; } else { beta = rho / rho1; if ((0.0 == beta) || isinf(beta)) { m_flag = 4; break; } //p = z + beta * p; p*=beta; p+=z; } q = A*p; pq = inner(p,q); if ((pq <= 0) || isinf(pq)) { m_flag = 4; break; } else { alpha = rho / pq; } if (isinf(alpha)) { m_flag = 4; break; } // Check for stagnation of the method if (0.0 == alpha) { stag = true; } #if (0) //##################################################### // TODO: Check for stagnation of the method //##################################################### if (!stag) { stagtest.fill(0); ind = find(x, '!', 0.0); stagtest(ind) = dd(p(ind), x(ind)); stagtest(~ind & p ~= 0) = Inf; if (abs(alpha)*norm(stagtest,inf) < eps) {stag = true;} } //##################################################### #endif // form new iterate px += alpha * p; b_Ax = pb - A*px; normr = b_Ax.norm2(); m_resvec(i+1) = normr; // check for convergence if (normr <= tolb) { m_flag = 0; m_iter = i; #if 1 umLOG(1, " ==> CS_PCG sol: %3d 
%15.12lf\n", i, normr); #endif break; } // check for stagnation if (stag) { m_flag = 3; break; } // update minimal norm quantities if (normr < normrmin) { normrmin = normr; xmin = px; imin = i; } r -= alpha * q; #if (SHOW_ITER_CONVERG) umLOG(1, " ==> CS_PCG sol: %3d %15.12lf\n", i, normr); #endif } // for i=1:m_maxit
slint_t mpi_select_exact_radix_fixed(elements_t *s, slint_t nelements, slint_t nparts, partcond_t *pconds, slint_t rhigh, slint_t rlow, slint_t rwidth, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_select_exact_radix_fixed */ { slkey_pure_t max_nclasses, nclasses, bit_mask; slkey_pure_t k, l; typedef struct { slint_t count_min, count_max; slint_t count_low, count_hig; #ifdef elem_weight double weight_min, weight_max; double weight_low, weight_hig; #endif } mmlh_t; mmlh_t mmlh[nparts]; const slint_t max_nborders = nparts - 1; slint_t border_lo, border_hi, nborders_removed; slint_t borders[max_nborders], border_areas[max_nborders]; #define MIN_LE 0 #define MIN_RI 1 #define MAX_LE 2 #define MAX_RI 3 struct { slint_t update; slint_t crange[2], cmmlr[4]; #ifdef elem_weight double wrange[2], wmmlr[4]; #endif } border_infos_[1 + max_nborders + 1], *border_infos = border_infos_ + 1, border_info_old; const slint_t max_nareas = max_nborders; slint_t nareas, nareas_new; elements_t areas0[max_nareas * nelements], areas1[max_nareas * nelements], *areas, *areas_new; slint_t *area_counts, *current_counts; double *local_counts, *global_counts; #ifdef elem_weight double *local_weights, *global_weights, *current_weights; #endif slint_t current_cmm[2]; #ifdef elem_weight double current_wmm[2]; #endif slint_t final_areas[max_nborders * nelements]; double final_locals[NCONDS * max_nborders], *final_globals; slint_t current_width; slint_t round, direction, refine, finalize; slint_t last_new_area, last_new_class; slint_t lc, lcs, gc, gcs, lcv[nelements], lcsv[nelements]; #ifdef elem_weight double lw, gw, lws, gws; double mw, dw; double mcw[4]; #else slint_t mc, dc; #endif slint_t i, j; elements_t xi, end; #ifdef VERIFY slint_t v; #endif SL_TRACE_IF(DEBUG_OR_NOT, "starting mpi_select_exact_radix"); /* sl_tid rti_tid_mpi_select_exact_radix rti_tid_mpi_select_exact_radix_sync */ rti_treset(rti_tid_mpi_select_exact_radix_while); /* sl_tid */ 
rti_treset(rti_tid_mpi_select_exact_radix_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_select_exact_radix_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_select_exact_radix_sync); #ifdef VERIFY v = elements_validate_order(s, 1); SL_TRACE_IF(DEBUG_OR_NOT, "elements order: %s (%" slint_fmt ")", (v > 0)?"FAILED":"SUCCESS", v); #endif rti_tstart(rti_tid_mpi_select_exact_radix); if (rhigh < 0) rhigh = key_radix_high; if (rlow < 0) rlow = key_radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); /* SL_TRACE_IF(DEBUG_OR_NOT, "alloc area_counts: %" slint_fmt " * %d", max_nareas * nelements * max_nclasses, sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc local_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc global_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));*/ area_counts = sl_alloc(max_nareas * nelements * max_nclasses, sizeof(slint_t)); local_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); global_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); /* init areas (first area = all elements) */ areas = areas0; areas_new = areas1; 
nareas = 1; for (j = 0; j < nelements; ++j) elem_assign(&s[j], &areas[0 * nelements + j]); /* init parts */ border_lo = 0; border_hi = max_nborders - 1; for (i = border_lo; i <= border_hi; ++i) { borders[i] = i; border_areas[i] = 0; } /* init sdispls */ for (i = 0; i < nparts; ++i) for (j = 0; j < nelements; ++j) sdispls[i * nelements + j] = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while); round = 0; while (border_lo <= border_hi) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", rhigh: %" slint_fmt ", current_width: %" slint_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { #ifdef elem_weight /* init weight counters */ local_weights = local_counts + (nareas * nclasses) + nareas; global_weights = global_counts + (nareas * nclasses) + nareas; #endif /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = #ifdef elem_weight local_weights[i * nclasses + k] = #endif 0.0; rti_tstart(rti_tid_mpi_select_exact_radix_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { local_counts[nareas * nclasses + i] = 0; #ifdef elem_weight local_weights[nareas * nclasses + i] = 0.0; #endif /* for every list of elements */ for (j = 0; j < nelements; ++j) { SL_TRACE_IF(DEBUG_OR_NOT, "area %" slint_fmt ",%" slint_fmt ": size = %" slint_fmt, i, j, areas[i * nelements + j].size); elem_assign_at(&areas[i * nelements + j], areas[i * nelements + j].size, &end); current_counts = area_counts + ((i * nelements + j) * nclasses); #ifdef elem_weight current_weights = local_weights + (i * nclasses); #endif for (k = 0; k < nclasses; ++k) current_counts[k] = 0; if (nclasses > 1) { /* counts and weights 
in every class */ for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); current_counts[k] += 1; /* SL_TRACE_IF(DEBUG_OR_NOT, "key %" sl_key_pure_type_fmt " goes to bin %" sl_key_pure_type_fmt, key_purify(*xi.keys), k);*/ #ifdef elem_weight current_weights[k] += elem_weight(&xi, 0); #endif } } else { /* total counts and weights */ current_counts[0] = areas[i * nelements + j].size; #ifdef elem_weight for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) current_weights[0] += elem_weight(&xi, 0); #endif } for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] += current_counts[k]; /* total counts and weights in this area */ local_counts[nareas * nclasses + i] += areas[i * nelements + j].size; #ifdef elem_weight for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += current_weights[k]; #endif } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ": counts =", " %f", k, nclasses, (&local_counts[i * nclasses]), i); } rti_tstop(rti_tid_mpi_select_exact_radix_while_count); --rhigh; SL_TRACE_IF(DEBUG_OR_NOT, "all-reducing %" slint_fmt " doubles", (slint_t) (NCONDS * (nareas * nclasses + nareas))); rti_tstart(rti_tid_mpi_select_exact_radix_while_allreduce); /* create global counts and weights */ #ifdef MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_select_exact_radix_while_allreduce); } /* do initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_select_exact_radix_while_round1); for (i = 0; i < nparts; 
++i) { /* truncate counts, set default values and determine local (count/weight) limits */ init_partconds(1, &pconds[i], nparts, global_counts[nareas * nclasses + 0], #ifdef elem_weight global_weights[nareas * nclasses + 0] #else 0 #endif ); mmlh[i].count_min = pconds[i].count_min; mmlh[i].count_max = pconds[i].count_max; mmlh[i].count_low = pconds[i].count_low; mmlh[i].count_hig = pconds[i].count_high; #ifdef elem_weight mmlh[i].weight_min = pconds[i].weight_min; mmlh[i].weight_max = pconds[i].weight_max; mmlh[i].weight_low = pconds[i].weight_low; mmlh[i].weight_hig = pconds[i].weight_high; #endif } /* init lowest and highest part (sentinels) */ border_infos[border_lo - 1].update = 0; border_infos[border_lo - 1].crange[0] = 0; border_infos[border_lo - 1].crange[1] = 0; border_infos[border_lo - 1].cmmlr[MIN_LE] = border_infos[border_lo - 1].cmmlr[MAX_LE] = 0; border_infos[border_lo - 1].cmmlr[MIN_RI] = border_infos[border_lo - 1].cmmlr[MAX_RI] = 0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_lo - 1, border_infos[border_lo - 1].cmmlr[MIN_LE], border_infos[border_lo - 1].cmmlr[MAX_LE], border_infos[border_lo - 1].cmmlr[MIN_RI], border_infos[border_lo - 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_lo - 1].wrange[0] = 0.0; border_infos[border_lo - 1].wrange[1] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_LE] = border_infos[border_lo - 1].wmmlr[MAX_LE] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_RI] = border_infos[border_lo - 1].wmmlr[MAX_RI] = 0.0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_lo - 1, border_infos[border_lo - 1].wmmlr[MIN_LE], border_infos[border_lo - 1].wmmlr[MAX_LE], border_infos[border_lo - 1].wmmlr[MIN_RI], border_infos[border_lo - 1].wmmlr[MAX_RI]); #endif /* init highest part (sentinel) */ border_infos[border_hi + 1].update = 0; border_infos[border_hi + 
1].crange[0] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].cmmlr[MIN_LE] = border_infos[border_hi + 1].cmmlr[MAX_LE] = 0; border_infos[border_hi + 1].cmmlr[MIN_RI] = border_infos[border_hi + 1].cmmlr[MAX_RI] = global_counts[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_hi + 1, border_infos[border_hi + 1].cmmlr[MIN_LE], border_infos[border_hi + 1].cmmlr[MAX_LE], border_infos[border_hi + 1].cmmlr[MIN_RI], border_infos[border_hi + 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_hi + 1].wrange[0] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wmmlr[MIN_LE] = border_infos[border_hi + 1].wmmlr[MAX_LE] = 0.0; border_infos[border_hi + 1].wmmlr[MIN_RI] = border_infos[border_hi + 1].wmmlr[MAX_RI] = global_weights[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_hi + 1, border_infos[border_hi + 1].wmmlr[MIN_LE], border_infos[border_hi + 1].wmmlr[MAX_LE], border_infos[border_hi + 1].wmmlr[MIN_RI], border_infos[border_hi + 1].wmmlr[MAX_RI]); #endif /* init regular parts (backwards) */ for (i = border_hi; i >= border_lo; --i) { border_infos[borders[i]].update = 1; border_infos[borders[i]].crange[0] = 0; border_infos[borders[i]].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[borders[i]].cmmlr[MIN_LE] = -1; border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = -1; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt 
": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] = 0.0; border_infos[borders[i]].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[borders[i]].wmmlr[MIN_LE] = -1.0; border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = -1.0; border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif /* prepare for finalization in the 1st round */ if (finalize) { for (j = 0; j < nelements; ++j) final_areas[i * nelements + j] = area_counts[(0 * nelements + j) * nclasses + 0]; final_locals[NCONDS * i + 0] = local_counts[nareas * nclasses + 0]; #ifdef elem_weight final_locals[NCONDS * i + 1] = local_weights[nareas * nclasses + 0]; #endif } } /* first direction: forward */ direction = 1; rti_tstop(rti_tid_mpi_select_exact_radix_while_round1); } /* compute prefixes for finalization */ if (finalize) { /* determine number of parts to finalize */ j = border_hi - border_lo + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" slint_fmt " parts", j); rti_tstart(rti_tid_mpi_select_exact_radix_while_exscan); /* use local_counts to store the global prefix sums */ final_globals = local_counts; /* create global prefix sums (set rank 0 to zero) */ MPI_Exscan(&final_locals[NCONDS * border_lo], &final_globals[NCONDS * border_lo], NCONDS * j, MPI_DOUBLE, 
MPI_SUM, comm); if (rank == 0) for (i = border_lo; i <= border_hi; ++i) final_globals[NCONDS * i + 0] = #ifdef elem_weight final_globals[NCONDS * i + 1] = #endif 0.0; rti_tstop(rti_tid_mpi_select_exact_radix_while_exscan); } /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", %s", round, (direction > 0)?"forward":"backward"); nareas_new = 0; last_new_area = last_new_class = -1; nborders_removed = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while_check); i = (direction > 0)?border_lo:border_hi; while ((direction > 0)?(i <= border_hi):(i >= border_lo)) { /* check partition borders[i] */ SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ": PART: %" slint_fmt ",%" slint_fmt, round, i, borders[i]); rti_tstart(rti_tid_mpi_select_exact_radix_while_check_pre); /* save to old limits */ border_info_old = border_infos[borders[i]]; /* is an update required? */ if (border_infos[borders[i]].update) { /* forward */ if (direction > 0) { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i] - 1].cmmlr[MIN_LE] + mmlh[borders[i]].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i] - 1].cmmlr[MAX_LE] + mmlh[borders[i]].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left]: %" slint_fmt " + %" slint_fmt ", %" slint_fmt " + %" slint_fmt "", i, borders[i], border_infos[borders[i] - 1].cmmlr[MIN_LE], mmlh[borders[i]].count_min, border_infos[borders[i] - 1].cmmlr[MAX_LE], mmlh[borders[i]].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MIN_LE] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MIN_LE] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MAX_LE] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MAX_LE] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_LE] = 
border_infos[borders[i] - 1].wmmlr[MIN_LE] + mmlh[borders[i]].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i] - 1].wmmlr[MAX_LE] + mmlh[borders[i]].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left]: %f + %f, %f + %f", i, borders[i], border_infos[borders[i] - 1].wmmlr[MIN_LE], mmlh[borders[i]].weight_min, border_infos[borders[i] - 1].wmmlr[MAX_LE], mmlh[borders[i]].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MIN_LE] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MIN_LE] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MAX_LE] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MAX_LE] = mmlh[borders[i] ].weight_hig; } #endif } else /* backward */ { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-right]: %" slint_fmt " - %" slint_fmt ", %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i] + 1].cmmlr[MIN_RI], mmlh[borders[i] + 1].count_min, border_infos[borders[i] + 1].cmmlr[MAX_RI], mmlh[borders[i] + 1].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MAX_RI] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MAX_RI] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MIN_RI] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MIN_RI] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; 
border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-right]: %f - %f, %f - %f", i, borders[i], border_infos[borders[i] + 1].wmmlr[MIN_RI], mmlh[borders[i] + 1].weight_min, border_infos[borders[i] + 1].wmmlr[MAX_RI], mmlh[borders[i] + 1].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MAX_RI] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MAX_RI] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MIN_RI] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MIN_RI] = mmlh[borders[i] ].weight_hig; } #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); /* check against inconsistence */ if (border_infos[borders[i]].cmmlr[MIN_LE] > border_infos[borders[i]].cmmlr[MIN_RI]) border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i]].cmmlr[MIN_RI] = (border_infos[borders[i]].cmmlr[MIN_LE] + border_infos[borders[i]].cmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].cmmlr[MAX_LE] < border_infos[borders[i]].cmmlr[MAX_RI]) border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i]].cmmlr[MAX_RI] = (border_infos[borders[i]].cmmlr[MAX_LE] + border_infos[borders[i]].cmmlr[MAX_RI]) / 2; #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); /* check against inconsistence */ if 
(border_infos[borders[i]].wmmlr[MIN_LE] > border_infos[borders[i]].wmmlr[MIN_RI]) border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i]].wmmlr[MIN_RI] = (border_infos[borders[i]].wmmlr[MIN_LE] + border_infos[borders[i]].wmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].wmmlr[MAX_LE] < border_infos[borders[i]].wmmlr[MAX_RI]) border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i]].wmmlr[MAX_RI] = (border_infos[borders[i]].wmmlr[MAX_LE] + border_infos[borders[i]].wmmlr[MAX_RI]) / 2; #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": crange: %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); /* select highest min and lowest max */ current_cmm[0] = xmax(border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_RI]) - border_infos[borders[i]].crange[0]; current_cmm[1] = xmin(border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI]) - border_infos[borders[i]].crange[0]; if (rank == 0) SL_ASSERT(current_cmm[0] <= current_cmm[1]); if (rank == 0) SL_ASSERT(0 <= current_cmm[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_count: %" slint_fmt " - %" slint_fmt "", i, borders[i], current_cmm[0], current_cmm[1]); #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": wrange: %f - %f", i, 
borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); /* select highest min and lowest max */ current_wmm[0] = xmax(border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_RI]) - border_infos[borders[i]].wrange[0]; current_wmm[1] = xmin(border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI]) - border_infos[borders[i]].wrange[0]; if (rank == 0) SL_ASSERT(current_wmm[0] <= current_wmm[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_weight: %f - %f", i, borders[i], current_wmm[0], current_wmm[1]); #endif rti_tstop(rti_tid_mpi_select_exact_radix_while_check_pre); /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_classes); lcs = gcs = 0; #ifdef elem_weight lws = gws = 0.0; #endif for (k = 0; k < nclasses; ++k) { lc = local_counts[border_areas[i] * nclasses + k]; gc = global_counts[border_areas[i] * nclasses + k]; current_cmm[0] -= gc; current_cmm[1] -= gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_count: %" slint_fmt " - %" slint_fmt ", lc = %" slint_fmt ", lcs = %" slint_fmt ", gc = %" slint_fmt ", gcs = %" slint_fmt, i, borders[i], k, current_cmm[0], current_cmm[1], lc, lcs, gc, gcs); #ifdef elem_weight lw = local_weights[border_areas[i] * nclasses + k]; gw = global_weights[border_areas[i] * nclasses + k]; current_wmm[0] -= gw; current_wmm[1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_weight: %e - %e", i, borders[i], k, current_wmm[0], current_wmm[1]); #endif /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_cmm[1] < 0) #ifdef elem_weight || (current_cmm[0] < 0 && current_wmm[1] < 0.0) #endif ) { refine = 1; break; } lcs += lc; gcs += gc; gc = 0; #ifdef elem_weight lws += lw; gws += gw; gw = 0.0; #endif /* if between min/max counts */ if (current_cmm[0] 
<= 0 && current_cmm[1] >= 0) { #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "got to next: %d && %d", (current_cmm[1] > 0), (current_wmm[0] > 0)); /* go to next if max count not reached AND min weight not reached */ if (current_cmm[1] > 0 && current_wmm[0] > 0) continue; #endif /* look ahead for a better stop */ if (k + 1 < nclasses && current_cmm[1] - global_counts[border_areas[i] * nclasses + k + 1] >= 0) { #ifdef elem_weight /* continue if weights will improve */ if (myabs(current_wmm[0] + current_wmm[1]) > myabs(current_wmm[0] + current_wmm[1] - 2 * global_weights[border_areas[i] * nclasses + k + 1])) continue; #else /* continue if counts will improve */ if (myabs(current_cmm[0] + current_cmm[1]) > myabs(current_cmm[0] + current_cmm[1] - 2 * global_counts[border_areas[i] * nclasses + k + 1])) continue; #endif } /* stop */ break; } } SL_ASSERT_IF((rank == 0), k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": %s k = %" sl_key_pure_type_fmt ", lcs = %" slint_fmt, i, borders[i], (refine)?"REFINE":"HIT", k, lcs); /* make sure k is safe (it is used as index later) */ if (k >= nclasses) k = nclasses - 1; /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcsv[j] = 0; for (l = 0; l < k; ++l) lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + l]; if (refine) lcv[j] = area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; else { lcv[j] = 0; lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; } lcs -= lcsv[j]; } rti_tstop(rti_tid_mpi_select_exact_radix_while_check_classes); } else { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_final); k = 0; #ifdef elem_weight /* middle of min/max weight */ mw = (current_wmm[0] + current_wmm[1]) / 2.0; /* min. 
part of weight to contribute */ dw = xmax(0, mw - final_globals[NCONDS * i + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mw = %e, dw = %e", i, borders[i], mw, dw); #else /* middle of min/max count */ mc = (current_cmm[0] + current_cmm[1]) / 2; /* min. part of count to contribute */ dc = xmax(0, mc - final_globals[NCONDS * i + 0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mc = %" slint_fmt ", dc = %" slint_fmt, i, borders[i], mc, dc); #endif /* contribute all? */ if ( #ifdef elem_weight dw >= final_locals[NCONDS * i + 1] #else dc >= final_locals[NCONDS * i + 0] #endif ) { lc = final_locals[NCONDS * i + 0]; #ifdef elem_weight lw = final_locals[NCONDS * i + 1]; #endif } else { /* contribute only a part */ #ifdef elem_weight lc = 0; for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], areas[border_areas[i] * nelements + j].size, &end); for (elem_assign(&areas[border_areas[i] * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { dw -= elem_weight(&xi, 0); ++lc; if (dw < 0.0 || lc >= final_locals[NCONDS * i + 0]) { dw += elem_weight(&xi, 0); --lc; break; } } } lw = dw; #else lc = dc; #endif } /* check mc against min/max count borders */ lc = xminmax(current_cmm[0] - final_globals[NCONDS * i + 0], lc, current_cmm[1] - final_globals[NCONDS * i + 0]); /* check agains 0 (don't step back!) 
and the local contribution */ lc = xminmax(0, lc, final_locals[NCONDS * i + 0]); lcs = lc; #ifdef elem_weight lws = lw; #endif #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " <= %" slint_fmt " + %" slint_fmt " <= %" slint_fmt, i, borders[i], border_lo, i, direction, border_hi); if (border_lo <= i + direction && i + direction <= border_hi) SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " == %" slint_fmt " + %" slint_fmt, i, borders[i], borders[i + direction], borders[i], direction); /* FIXME: finalize geht auch rückwärts!!! */ /* if the next open border is really the _next_ border */ if (border_lo <= i + direction && i + direction <= border_hi && borders[i + direction] == borders[i] + direction) { /* determine the exact global counts/weights (damn, this is expensive) */ mcw[0] = lcs; mcw[1] = lws; MPI_Allreduce(&mcw[0], &mcw[2], 2, MPI_DOUBLE, MPI_SUM, comm); } else { /* the exact global counts/weights are not required */ mcw[2] = 0.0; mcw[3] = 0.0; } gc = 0; gcs = mcw[2]; gw = 0.0; gws = mcw[3]; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt ", gws = %f", i, borders[i], gcs, gws); #else /* the global count is simply mc */ gc = 0; gcs = mc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt, i, borders[i], gcs); #endif SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcs = %" slint_fmt, i, borders[i], lcs); /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcv[j] = 0; lcsv[j] = xmin(lcs, final_areas[i * nelements + j]); lcs -= lcsv[j]; } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcsv = ", "%" slint_fmt, j, nelements, lcsv, i, borders[i]); rti_tstop(rti_tid_mpi_select_exact_radix_while_check_final); } SL_ASSERT(lcs == 0); /* accept local contributions */ for (j = 0; j < nelements; ++j) sdispls[(borders[i] + 1) * nelements + j] 
+= lcsv[j]; rti_tstart(rti_tid_mpi_select_exact_radix_while_check_post); /* this is wrong, e.g., even if gc == 0 and gcs == 0 then crange[1] is set to crange[0]! */ /* if (gc > 0 || gcs > 0 #ifdef elem_weight || gw != 0.0 || gws != 0.0 #endif )*/ { border_infos[borders[i]].crange[0] += gcs; border_infos[borders[i]].crange[1] = border_infos[borders[i]].crange[0] + gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": counts_range: %" slint_fmt " %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_RI], border_infos[borders[i]].crange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] += gws; border_infos[borders[i]].wrange[1] = border_infos[borders[i]].wrange[0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weights_range: %f %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_LE], 
border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_RI], border_infos[borders[i]].wrange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 0: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[0], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 1: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[1], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[1]); if (border_infos[borders[i]].cmmlr[MIN_LE] != border_info_old.cmmlr[MIN_LE] || border_infos[borders[i]].cmmlr[MAX_LE] != border_info_old.cmmlr[MAX_LE] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_LE] != border_info_old.wmmlr[MIN_LE] || 
border_infos[borders[i]].wmmlr[MAX_LE] != border_info_old.wmmlr[MAX_LE] #endif ) border_infos[borders[i] + 1].update = 1; if (border_infos[borders[i]].cmmlr[MIN_RI] != border_info_old.cmmlr[MIN_RI] || border_infos[borders[i]].cmmlr[MAX_RI] != border_info_old.cmmlr[MAX_RI] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_RI] != border_info_old.wmmlr[MIN_RI] || border_infos[borders[i]].wmmlr[MAX_RI] != border_info_old.wmmlr[MAX_RI] #endif ) border_infos[borders[i] - 1].update = 1; border_infos[borders[i]].update = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == border_areas[i] && last_new_class == k) border_areas[i] = nareas_new - 1; else { /* update last_new_... */ last_new_area = border_areas[i]; last_new_class = k; /* create new area */ for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], lcsv[j], &areas_new[nareas_new * nelements + j]); areas_new[nareas_new * nelements + j].size = lcv[j]; } border_areas[i] = nareas_new; ++nareas_new; } } else { for (j = 0; j < nelements; ++j) final_areas[(i - nborders_removed * direction) * nelements + j] = lcv[j]; /* save local count/weight for the later prefix calculations */ final_locals[NCONDS * (i - nborders_removed * direction) + 0] = lc; #ifdef elem_weight final_locals[NCONDS * (i - nborders_removed * direction) + 1] = lw; #endif } borders[i - nborders_removed * direction] = borders[i]; border_areas[i - nborders_removed * direction] = border_areas[i]; } else ++nborders_removed; rti_tstop(rti_tid_mpi_select_exact_radix_while_check_post); i += direction; } /* restrict the parts */ if (direction > 0) border_hi -= nborders_removed; else border_lo += nborders_removed; /* change direction */ direction *= -1; rti_tstop(rti_tid_mpi_select_exact_radix_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } 
rti_tstop(rti_tid_mpi_select_exact_radix_while); sl_free(area_counts); sl_free(local_counts); sl_free(global_counts); rti_tstop(rti_tid_mpi_select_exact_radix); #ifdef VERIFY v = mpi_post_check_partconds(s, nelements, nparts, pconds, sdispls, size, rank, comm); SL_ASSERT_IF(rank == 0, v < 0); SL_NOTICE_IF(rank == 0, "post_check_partconds: %s (%" slint_fmt ")", (v >= 0)?"FAILED":"SUCCESS", v); #endif #ifdef PRINT_SDISPLS printf("%d: sdispls:", rank); for (i = 0; i < nparts; ++i) printf(" %d ", sdispls[i]); printf("\n"); #endif #ifdef PRINT_STATS mpi_select_stats(s, nparts, sdispls, size, rank, comm); #endif #if defined(PRINT_TIMINGS) && defined(SL_USE_RTI_TIM) if (rank == PRINT_TIMINGS) { printf("%d: mpi_select_exact_radix: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix)); printf("%d: mpi_select_exact_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_sync)); printf("%d: mpi_select_exact_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while)); printf("%d: mpi_select_exact_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_count)); printf("%d: mpi_select_exact_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_allreduce)); printf("%d: mpi_select_exact_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1)); printf("%d: mpi_select_exact_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1_allgather)); printf("%d: mpi_select_exact_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_exscan)); printf("%d: mpi_select_exact_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_check)); printf("%d: mpi_select_exact_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_pre)); printf("%d: mpi_select_exact_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_classes)); printf("%d: mpi_select_exact_radix: final: %f\n", rank, 
rti_tlast(rti_tid_mpi_select_exact_radix_while_check_final)); printf("%d: mpi_select_exact_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_post)); printf("%d: mpi_select_exact_radix: rounds: %" slint_fmt "\n", rank, round); } #endif return 0; }
/*
 * One recursive step of a most-significant-bits-first radix sort that
 * rearranges the elements of `s` inside their own storage.
 *
 * The step looks at up to `rwidth` key bits ending at bit position `rhigh`
 * (never going below `rlow`), counts how many elements fall into each of the
 * resulting classes, exchanges the elements into their class regions and then
 * handles every class with more than one element on the remaining lower bits.
 *
 * s        - elements to sort (s->size elements are processed)
 * sx       - scratch element(s) handed to elem_xchange and to the sub-sorts
 * rhigh    - highest key bit still to be considered
 * rlow     - lowest key bit to be considered
 * rwidth   - maximum number of bits consumed per step
 *            (NOTE(review): must not exceed sort_radix_width_max, otherwise
 *            c[] / parts[] are too small -- callers appear to clamp it)
 * finalize - only written when insertsort, insertsort_finalize and
 *            insertsort_finalize_adaptive are all defined: set to 1 if a
 *            class with more than one element was left unsorted here;
 *            otherwise it is just forwarded to the recursive calls
 *
 * Always returns 0.
 */
slint rs_rec_af(elements_t *s, elements_t *sx, slint rhigh, slint rlow, slint rwidth, slint *finalize) /* sl_func rs_rec_af */
{
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t bit_mask, nclasses;

  slint i, current_width, c[max_nclasses];
  elements_t xi, end, parts[max_nclasses];

  elem_assign_at(s, s->size, &end);

  /* use at most rwidth bits, but not more than are left between rhigh and rlow */
  current_width = xmin(rwidth, rhigh - rlow + 1);
  rhigh -= current_width - 1;

  nclasses = powof2_typed(current_width, slkey_pure_t);
  bit_mask = nclasses - 1;

  /* zero all counter */
  for (i = 0; i < nclasses; i++) c[i] = 0;

  /* count the number of elements in every class */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_inc(&xi)) ++c[key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask)];

  /* compute the target of every class: parts[i] starts one past the last slot of class i */
  elem_assign_at(s, c[0], &parts[0]);
  parts[0].size = c[0];
  for (i = 1; i < nclasses; i++)
  {
    elem_assign_at(&parts[i - 1], c[i], &parts[i]);
    parts[i].size = c[i];
  }

  /* permute the keys home: every class region is filled from its back; once
     the slot pointer of the class of the element at xi has come down to xi,
     that region is complete and xi skips over its c[i] elements */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_add(&xi, c[i]))
  {
    while (1)
    {
      i = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);

      elem_dec(&parts[i]);

      if (xi.keys >= parts[i].keys) break;

      elem_xchange(&parts[i], &xi, sx);
    }
  }

  --rhigh;

  /* bits left? then sort every class on the remaining lower bits */
  if (rhigh >= rlow)
  {
    elem_assign(s, &xi);
    for (i = 0; i < nclasses; i++)
    {
      xi.size = c[i];

#ifdef insertsort
      if (xi.size > sort_radix_threshold_rec) rs_rec_af(&xi, sx, rhigh, rlow, rwidth, finalize);
#ifdef insertsort_finalize
#ifdef insertsort_finalize_adaptive
      else if (xi.size > 1) *finalize = 1;
#endif /* insertsort_finalize_adaptive */
#else /* insertsort_finalize */
      else if (xi.size > 1) rs_rec_insertsort_af(&xi, sx, rhigh, rlow);
#endif /* insertsort_finalize */
#else /* insertsort */
      /* recurse into this routine (as the insertsort-enabled configuration
         above does) instead of the unrelated rs_rec, so that both build
         configurations run the same algorithm with the same use of sx */
      if (xi.size > 1) rs_rec_af(&xi, sx, rhigh, rlow, rwidth, finalize);
#endif /* insertsort */

      elem_add(&xi, c[i]);
    }
  }

  return 0;
}
slint_t mpi_merge2(elements_t *s, slint_t other_rank, slint_t high_rank, slint_t *dst_size, merge2x_f m2, elements_t *xs, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_merge2 */ { const int tag = 1; slint_t ex_start, ex_sizes[2], nx_move, ex_size; elements_t s0, s1; MPI_Status status; #ifdef CHECK_ORDER slint_t check_order; #endif SL_TRACE_IF(MM2_TRACE_IF, "starting mpi_merge2"); /* sl_tid rti_tid_mpi_merge2 */ rti_treset(rti_tid_mpi_merge2_find); /* sl_tid */ rti_treset(rti_tid_mpi_merge2_moveright); /* sl_tid */ rti_treset(rti_tid_mpi_merge2_exchange); /* sl_tid */ rti_treset(rti_tid_mpi_merge2_moveleft); /* sl_tid */ rti_treset(rti_tid_mpi_merge2_local); /* sl_tid */ rti_tclear(rti_tid_mpi_merge2); if (other_rank < 0 || other_rank >= size) return -1; if (rank == other_rank) return 0; rti_tstart(rti_tid_mpi_merge2); #ifdef CHECK_ORDER check_order = elements_validate_order(s, 1); if (check_order) SL_ERROR("input order failed at %" slint_fmt "", check_order); #endif SL_TRACE_IF(MM2_TRACE_IF, "find_exact: s->size = %" slint_fmt ", other_rank / high_rank = %" slint_fmt " / %" slint_fmt, s->size, other_rank, high_rank); rti_tstart(rti_tid_mpi_merge2_find); mpi_find_exact(s, other_rank, high_rank, dst_size, &ex_start, ex_sizes, &nx_move, size, rank, comm); rti_tstop(rti_tid_mpi_merge2_find); SL_TRACE_IF(MM2_TRACE_IF, "find_exact: ex_start = %" slint_fmt ", ex_sizes = { %" slint_fmt ", %" slint_fmt " }, nx_move = %" slint_fmt, ex_start, ex_sizes[0], ex_sizes[1], nx_move); /* move the nx-block to the right (before exchange) */ rti_tstart(rti_tid_mpi_merge2_moveright); if (nx_move > 0 && s->size - ex_sizes[0] > 0) { SL_TRACE_IF(MM2_TRACE_IF, "moving right %" slint_fmt "", nx_move); if (rank != high_rank) elem_nmove_at(s, 0, s, nx_move, s->size - ex_sizes[0]); else elem_nmove_at(s, ex_sizes[0], s, ex_sizes[0] + nx_move, s->size - ex_sizes[0]); } rti_tstop(rti_tid_mpi_merge2_moveright); /* exchange elements */ rti_tstart(rti_tid_mpi_merge2_exchange); 
elem_assign_at(s, ex_start, &s0); ex_size = xmin(ex_sizes[0], ex_sizes[1]); if (ex_size > 0) { SL_TRACE_IF(MM2_TRACE_IF, "exchanging %" slint_fmt " elements at %" slint_fmt "", ex_size, ex_start); #ifdef MM2_ELEMENTS_SENDRECV_REPLACE mpi_elements_sendrecv_replace(&s0, ex_size, other_rank, tag, other_rank, tag, size, rank, comm); #else #define xelem_call \ MPI_Sendrecv_replace(xelem_buf(&s0), ex_size, xelem_mpi_datatype, other_rank, tag, other_rank, tag, comm, &status); #include "sl_xelem_call.h" #endif } elem_add(&s0, ex_size); if (ex_size < ex_sizes[0]) { ex_size = ex_sizes[0] - ex_size; SL_TRACE_IF(MM2_TRACE_IF, "sending %" slint_fmt " at %" slint_fmt "", ex_size, (slint_t) (s0.keys - s->keys)); #define xelem_call \ MPI_Send(xelem_buf(&s0), ex_size, xelem_mpi_datatype, other_rank, tag, comm); #include "sl_xelem_call.h" } else if (ex_size < ex_sizes[1]) { ex_size = ex_sizes[1] - ex_size; SL_TRACE_IF(MM2_TRACE_IF, "receiving %" slint_fmt " at %" slint_fmt "", ex_size, (slint_t) (s0.keys - s->keys)); #define xelem_call \ MPI_Recv(xelem_buf(&s0), ex_size, xelem_mpi_datatype, other_rank, tag, comm, &status); #include "sl_xelem_call.h" } rti_tstop(rti_tid_mpi_merge2_exchange); /* move the nx-block to the left (after exchange) */ rti_tstart(rti_tid_mpi_merge2_moveleft); if (nx_move < 0 && s->size - ex_sizes[0] > 0) { SL_TRACE_IF(MM2_TRACE_IF, "moving left %" slint_fmt "", nx_move); if (rank != high_rank) elem_nmove_at(s, 0, s, nx_move, s->size - ex_sizes[0]); else elem_nmove_at(s, ex_sizes[0], s, ex_sizes[0] + nx_move, s->size - ex_sizes[0]); } rti_tstop(rti_tid_mpi_merge2_moveleft); /* prepare the local merge2 */ if (rank != high_rank) { elem_assign_at(s, 0, &s0); s0.size = s->size - ex_sizes[0]; elem_assign_at(s, s0.size, &s1); s1.size = ex_sizes[1]; } else { elem_assign_at(s, 0, &s0); s0.size = ex_sizes[1]; elem_assign_at(s, s0.size, &s1); s1.size = s->size - ex_sizes[0]; } #ifdef CHECK_ORDER check_order = elements_validate_order(&s0, 1); if (check_order) 
SL_ERROR("intermediate lower order failed at %" slint_fmt "", check_order); check_order = elements_validate_order(&s1, 1); if (check_order) SL_ERROR("intermediate higher order failed at %" slint_fmt "", check_order); #endif s->size = s0.size + s1.size; /* local merge */ rti_tstart(rti_tid_mpi_merge2_local); if (s0.size > 0 && s1.size > 0 && m2 != NULL) { SL_TRACE_IF(MM2_TRACE_IF, "local merge2 %" slint_fmt " with %" slint_fmt "", s0.size, s1.size); m2(&s0, &s1, xs); } rti_tstop(rti_tid_mpi_merge2_local); #ifdef CHECK_ORDER check_order = elements_validate_order(s, 1); if (check_order) SL_ERROR("output order failed at %" slint_fmt "", check_order); #endif rti_tstop(rti_tid_mpi_merge2); #if defined(MM2_PRINT_TIMINGS) && defined(SL_USE_RTI_TIM) if (MM2_PRINT_TIMINGS) { printf("%d: mpi_merge2: %f\n", rank, rti_tlast(rti_tid_mpi_merge2)); printf("%d: mpi_merge2: find: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_find)); printf("%d: mpi_merge2: move-right: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_moveright)); printf("%d: mpi_merge2: exchange: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_exchange)); printf("%d: mpi_merge2: move-left: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_moveleft)); printf("%d: mpi_merge2: local: %f\n", rank, rti_tlast(rti_tid_mpi_merge2_local)); } #endif return 0; }
slint_t mpi_partition_radix2(elements_t *s, partcond2_t *pc, slint_t rhigh, slint_t rlow, slint_t rwidth, int *scounts, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_partition_radix2 */ { slkey_pure_t max_nclasses; slkey_pure_t nclasses, bit_mask; slkey_pure_t k; const slint_t max_nareas = size - 1; slint_t nareas, nareas_new; elements_t areas0[max_nareas], areas1[max_nareas], *areas, *areas_new; double *locals, *globals; double *local_counts, *local_weights, *global_counts, *global_weights; const slint_t max_nparts = size - 1; slint_t parts_low, parts_high, nparts_removed; slint_t parts[max_nparts], part_areas[max_nparts]; double parts_range_[2 * 2 * (1 + max_nparts + 1)]; double *parts_range = parts_range_ + (2 * 2); double parts_minmax_[2 * 4 * (1 + max_nparts + 1)]; double *parts_minmax = parts_minmax_ + (2 * 4); slint_t parts_update_[1 + max_nparts + 1]; slint_t *parts_update = parts_update_ + 1; double parts_minmax_new[2 * 4]; double current_minmax[2 * 2]; double final_locals[2 * max_nparts]; slint_t i, j, jp1, jm1, l, lp1, lm1; slint_t current_width; double minmax[2 * 4 * size]; slint_t last_new_area, last_new_class; #ifdef HAVENT_MPI_IN_PLACE double local_minmax[2 * 4]; #endif slint_t lc, lcs, gc, gcs; double lw, gw, lws, gws; double d, m; elements_t xi, end; slint_t round = 0; slint_t direction = 1; slint_t refine, finalize; #ifdef RCOUNTS_RDISPLS int *rcounts, *rdispls; #endif #ifdef WEIGHT_STATS slint_t total_count = 0, partial_counts[size + 1]; double total_weight = 0.0, partial_weights[size + 1]; double vmin, vmax; # ifdef HAVENT_MPI_IN_PLACE slint_t partial_counts2[size + 1]; double partial_weights2[size + 1]; # endif #endif rti_treset(rti_tid_mpi_partition_radix2_while); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_round1); /* sl_tid */ 
rti_treset(rti_tid_mpi_partition_radix2_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_partition_radix2_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_sync); rti_tstart(rti_tid_mpi_partition_radix2); if (rhigh < 0) rhigh = radix_high; if (rlow < 0) rlow = radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); locals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); globals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); areas = areas0; areas_new = areas1; /* init the first area (all elements) */ nareas = 1; elem_assign(s, &areas[0]); /* init all parts */ parts_low = 0; parts_high = max_nparts - 1; for (i = parts_low; i <= parts_high; ++i) { parts[i] = i; part_areas[i] = 0; } /* init sdispls */ for (i = 0; i < size; ++i) sdispls[i] = 0; rti_tstart(rti_tid_mpi_partition_radix2_while); while (parts_low <= parts_high) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", rhigh: %" sl_int_type_fmt ", current_width: %" sl_int_type_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { /* init counters */ local_counts = locals; global_counts = globals; local_weights = 
locals + (nareas * nclasses) + nareas; global_weights = globals + (nareas * nclasses) + nareas; /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = local_weights[i * nclasses + k] = 0.0; rti_tstart(rti_tid_mpi_partition_radix2_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { elem_assign_at(&areas[i], areas[i].size, &end); if (nclasses > 1) { /* counts and weights in every class */ for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); local_counts[i * nclasses + k] += 1; local_weights[i * nclasses + k] += elem_weight_one(&xi, 0); } } else { /* total counts and weights */ local_counts[i * nclasses + 0] = areas[i].size; for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) local_weights[i * nclasses + 0] += elem_weight_one(&xi, 0); } /* total counts and weights in this area */ local_counts[nareas * nclasses + i] = areas[i].size; local_weights[nareas * nclasses + i] = 0.0; for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += local_weights[i * nclasses + k]; } rti_tstop(rti_tid_mpi_partition_radix2_while_count); --rhigh; rti_tstart(rti_tid_mpi_partition_radix2_while_allreduce); /* create global counts and weights */ #ifdef MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_partition_radix2_while_allreduce); } #ifdef TIMING SL_TRACE_IF(DEBUG_OR_NOT, "allreduce: %f, nareas: %" sl_int_type_fmt ", nclasses: %" sl_key_type_fmt ", doubles: %" sl_int_type_fmt, 
rti_tlast(rti_tid_mpi_partition_radix2_while_allreduce), nareas, nclasses, (1 + 1) * (nareas * nclasses + nareas)); #endif /* if (DEBUG_OR_NOT) { printf("%d: locals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_counts[i * nclasses + k]); printf(" = %f\n", local_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_weights[i * nclasses + k]); printf(" = %f\n", local_weights[nareas * nclasses + i]); } printf("%d: globals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_counts[i * nclasses + k]); printf(" = %f\n", global_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_weights[i * nclasses + k]); printf(" = %f\n", global_weights[nareas * nclasses + i]); } }*/ /* do some initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_partition_radix2_while_round1); /* distribute min/max counts and weights */ minmax[rank * 2 * 4 + 0 + 0] = (pc->min_count >= 0)?pc->min_count:(-pc->min_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 1] = (pc->max_count >= 0)?pc->max_count:(-pc->max_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 2] = (pc->min_cpart >= 0)?pc->min_cpart:(-pc->min_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 0 + 3] = (pc->max_cpart >= 0)?pc->max_cpart:(-pc->max_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 0] = (pc->min_weight >= 0)?pc->min_weight:(-pc->min_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 1] = (pc->max_weight >= 0)?pc->max_weight:(-pc->max_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 2] = (pc->min_wpart >= 
0)?pc->min_wpart:(-pc->min_wpart * global_weights[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 3] = (pc->max_wpart >= 0)?pc->max_wpart:(-pc->max_wpart * global_weights[nareas * nclasses + 0]); rti_tstart(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef HAVENT_MPI_IN_PLACE local_minmax[0 + 0] = minmax[rank * 2 * 4 + 0 + 0]; local_minmax[0 + 1] = minmax[rank * 2 * 4 + 0 + 1]; local_minmax[0 + 2] = minmax[rank * 2 * 4 + 0 + 2]; local_minmax[0 + 3] = minmax[rank * 2 * 4 + 0 + 3]; local_minmax[4 + 0] = minmax[rank * 2 * 4 + 4 + 0]; local_minmax[4 + 1] = minmax[rank * 2 * 4 + 4 + 1]; local_minmax[4 + 2] = minmax[rank * 2 * 4 + 4 + 2]; local_minmax[4 + 3] = minmax[rank * 2 * 4 + 4 + 3]; MPI_Allgather(local_minmax, 2 * 4, MPI_DOUBLE, minmax, 2 * 4, MPI_DOUBLE, comm); /* MPI_Gather(local_minmax_weights, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, 0, comm); MPI_Bcast(minmax_weights, 2 * 4 * size, MPI_DOUBLE, 0, comm);*/ #else MPI_Allgather(MPI_IN_PLACE, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef WEIGHT_STATS total_count = global_counts[nareas * nclasses + 0]; total_weight = global_weights[nareas * nclasses + 0]; #endif parts_minmax[2 * 4 * (parts_low - 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 3] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 3] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 3] = global_counts[nareas * nclasses + 0]; parts_minmax[2 * 4 * (parts_high + 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 2] = 0; parts_minmax[2 
* 4 * (parts_high + 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 3] = global_weights[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 0 + 0] = parts_range[2 * 2 * (parts_high + 1) + 0 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 0 + 1] = parts_range[2 * 2 * (parts_high + 1) + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 2 + 0] = parts_range[2 * 2 * (parts_high + 1) + 2 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 2 + 1] = parts_range[2 * 2 * (parts_high + 1) + 2 + 1] = global_weights[nareas * nclasses + 0]; for (i = parts_high; i >= parts_low; --i) { parts_minmax[2 * 4 * parts[i] + 0 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 1] - minmax[2 * 4 * (parts[i] + 1) + 0 + 0]; parts_minmax[2 * 4 * parts[i] + 0 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 3] - minmax[2 * 4 * (parts[i] + 1) + 0 + 1]; parts_minmax[2 * 4 * parts[i] + 4 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 1] - minmax[2 * 4 * (parts[i] + 1) + 4 + 0]; parts_minmax[2 * 4 * parts[i] + 4 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 3] - minmax[2 * 4 * (parts[i] + 1) + 4 + 1]; parts_minmax[2 * 4 * parts[i] + 0 + 0] = parts_minmax[2 * 4 * parts[i] + 0 + 2] = parts_minmax[2 * 4 * parts[i] + 4 + 0] = parts_minmax[2 * 4 * parts[i] + 4 + 2] = -1; parts_range[2 * 2 * parts[i] + 0 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * parts[i] + 2 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 2 + 1] = global_weights[nareas * nclasses + 0]; /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 0 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 0 + 3]);*/ /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 4 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);*/ parts_update[parts[i]] = 1; if (finalize) { final_locals[2 * i + 0] = local_counts[nareas * nclasses + 0]; final_locals[2 * i + 1] = local_weights[nareas * nclasses + 0]; } } rti_tstop(rti_tid_mpi_partition_radix2_while_round1); } if 
(finalize) { j = parts_high - parts_low + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" sl_int_type_fmt " parts", j); rti_tstart(rti_tid_mpi_partition_radix2_while_exscan); MPI_Exscan(&final_locals[2 * parts_low], &locals[2 * parts_low], 2 * j, MPI_DOUBLE, MPI_SUM, comm); if (rank == 0) for (i = parts_low; i <= parts_high; ++i) locals[2 * i + 0] = locals[2 * i + 1] = 0; rti_tstop(rti_tid_mpi_partition_radix2_while_exscan); } nareas_new = 0; last_new_area = last_new_class = -1; /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", %s", round, (direction > 0)?"forward":"backward"); nparts_removed = 0; rti_tstart(rti_tid_mpi_partition_radix2_while_check); i = (direction > 0)?parts_low:parts_high; while ((direction > 0)?(i <= parts_high):(i >= parts_low)) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": PART: %" sl_int_type_fmt ",%" sl_int_type_fmt, round, i, parts[i]); j = 2 * 4 * parts[i]; jp1 = 2 * 4 * (parts[i] + 1); jm1 = 2 * 4 * (parts[i] - 1); l = 2 * 2 * parts[i]; lp1 = 2 * 2 * (parts[i] + 1); lm1 = 2 * 2 * (parts[i] - 1); if (parts_update[parts[i]]) { if (direction > 0) { parts_minmax_new[0 + 0] = parts_minmax[jm1 + 0 + 0] + minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[jm1 + 0 + 2] + minmax[j + 0 + 1]; parts_minmax_new[4 + 0] = parts_minmax[jm1 + 4 + 0] + minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[jm1 + 4 + 2] + minmax[j + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f + %f, %f + %f / %f + %f, %f + %f", i, parts[i], parts_minmax[jm1 + 0 + 0], minmax[j + 0 + 0], parts_minmax[jm1 + 0 + 2], minmax[j + 0 + 1], parts_minmax[jm1 + 4 + 0], minmax[j + 4 + 0], parts_minmax[jm1 + 4 + 2], minmax[j + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 0] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 2] > minmax[j + 0 + 3]) parts_minmax_new[0 + 2] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 0] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 0] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 2] > minmax[j + 4 + 3]) parts_minmax_new[4 + 2] = minmax[j + 4 + 3]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } else { parts_minmax_new[0 + 1] = parts_minmax[jp1 + 0 + 1] - minmax[jp1 + 0 + 0]; parts_minmax_new[0 + 3] = parts_minmax[jp1 + 0 + 3] - minmax[jp1 + 0 + 1]; parts_minmax_new[4 + 1] = parts_minmax[jp1 + 4 + 1] - minmax[jp1 + 4 + 0]; parts_minmax_new[4 + 3] = parts_minmax[jp1 + 4 + 3] - minmax[jp1 + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f - %f, %f - %f / %f - %f, %f - %f", i, parts[i], parts_minmax[jp1 + 0 + 1], minmax[jp1 + 0 + 0], parts_minmax[jp1 + 0 + 3], minmax[jp1 + 0 + 1], parts_minmax[jp1 + 4 + 1], minmax[jp1 + 4 + 0], parts_minmax[jp1 + 4 + 3], minmax[jp1 + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 3] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 3] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 1] > minmax[j + 0 + 3]) parts_minmax_new[0 + 1] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 3] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 3] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 1] > minmax[j + 4 + 3]) parts_minmax_new[4 + 1] = minmax[j + 4 + 3]; parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 1. parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": minmax: %f %f / %f %f", parts[i], minmax[2 * 4 * (parts[i] + 1) + 0 + 2], minmax[2 * 4 * (parts[i] + 0) + 0 + 3], minmax[2 * 4 * (parts[i] + 1) + 4 + 2], minmax[2 * 4 * (parts[i] + 0) + 4 + 3]); if (parts_minmax_new[0 + 0] > parts_minmax_new[0 + 1]) parts_minmax_new[0 + 0] = parts_minmax_new[0 + 1] = (parts_minmax_new[0 + 0] + parts_minmax_new[0 + 1]) / 2; if (parts_minmax_new[0 + 2] < parts_minmax_new[0 + 3]) parts_minmax_new[0 + 2] = parts_minmax_new[0 + 3] = (parts_minmax_new[0 + 2] + parts_minmax_new[0 + 3]) / 2; if (parts_minmax_new[4 + 0] > parts_minmax_new[4 + 1]) parts_minmax_new[4 + 0] = parts_minmax_new[4 + 1] = (parts_minmax_new[4 + 0] + parts_minmax_new[4 + 1]) / 2; if (parts_minmax_new[4 + 2] < parts_minmax_new[4 + 3]) parts_minmax_new[4 + 2] = parts_minmax_new[4 + 3] = 
(parts_minmax_new[4 + 2] + parts_minmax_new[4 + 3]) / 2; } else { parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 2. parts_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); current_minmax[0 + 0] = xmax(parts_minmax_new[0 + 0], parts_minmax_new[0 + 3]) - parts_range[l + 0 + 0]; current_minmax[0 + 1] = xmin(parts_minmax_new[0 + 2], parts_minmax_new[0 + 1]) - parts_range[l + 0 + 0]; current_minmax[2 + 0] = xmax(parts_minmax_new[4 + 0], parts_minmax_new[4 + 3]) - parts_range[l + 2 + 0]; current_minmax[2 + 1] = xmin(parts_minmax_new[4 + 2], parts_minmax_new[4 + 1]) - parts_range[l + 2 + 0]; SL_ASSERT(current_minmax[0 + 0] <= current_minmax[0 + 1]); SL_ASSERT(current_minmax[2 + 0] <= current_minmax[2 + 1]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": current_minmax: %f %f / %f %f", parts[i], current_minmax[0 + 0], current_minmax[0 + 1], current_minmax[2 + 0], current_minmax[2 + 1]); lcs = gcs = 0; lws = gws = 0; /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_classes); for (k = 0; k < nclasses; ++k) { lc = local_counts[part_areas[i] * nclasses + k]; gc = global_counts[part_areas[i] * nclasses + k]; lw = local_weights[part_areas[i] * nclasses + k]; gw = global_weights[part_areas[i] * nclasses + k]; current_minmax[0 + 0] -= gc; 
current_minmax[0 + 1] -= gc; current_minmax[2 + 0] -= gw; current_minmax[2 + 1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "k = %" sl_key_pure_type_fmt ", current_minmax: %f %f / %f %f", k, current_minmax[0], current_minmax[1], current_minmax[2], current_minmax[3]); /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_minmax[0 + 1] < 0) || (current_minmax[0 + 0] < 0 && current_minmax[2 + 1] < 0)) { refine = 1; break; } lcs += lc; gcs += gc; lws += lw; gws += gw; gc = gw = 0.0; /* if between min/max counts */ if (current_minmax[0 + 0] <= 0 && current_minmax[0 + 1] >= 0) { /* go to next if max count not reached AND min weight not reached */ if (current_minmax[0 + 1] > 0 && current_minmax[2 + 0] > 0) continue; /* look ahead for a better stop */ if (k + 1 < nclasses && current_minmax[0 + 1] - global_counts[part_areas[i] * nclasses + k + 1] >= 0) { /* continue if weights will improve */ if (myabs(current_minmax[2 + 0] + current_minmax[2 + 1]) > myabs(current_minmax[2 + 0] + current_minmax[2 + 1] - 2 * global_weights[part_areas[i] * nclasses + k + 1])) continue; } /* stop */ break; } } SL_ASSERT(k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%s k = %" sl_key_pure_type_fmt, (refine)?"REFINE":"HIT", k); rti_tstop(rti_tid_mpi_partition_radix2_while_check_classes); } else { rti_tstart(rti_tid_mpi_partition_radix2_while_check_final); /* middle of min/max weight */ m = (current_minmax[2 + 0] + current_minmax[2 + 1]) / 2; /* min. part of weight to contribute */ d = xmax(0, m - locals[i * 2 + 1]); /* contribute all? 
*/ if (d >= final_locals[i * 2 + 1]) { lc = final_locals[i * 2 + 0]; lw = final_locals[i * 2 + 1]; } else { /* contribute only a part */ lc = 0; lw = 0; /* not required */ do { d -= elem_weight_one(s, sdispls[1 + parts[i]] + lc); ++lc; } while (d >= 0 && lc < final_locals[i * 2 + 0]); --lc; /* if unweighted, then m = middle of min/max count, d = ..., lc = d */ } /* check mc against min/max count borders */ lc = xminmax(current_minmax[0 + 0] - locals[i * 2 + 0], lc, current_minmax[0 + 1] - locals[i * 2 + 0]); /* check agains 0 (don't step back!) and the local contribution */ lc = xminmax(0, lc, final_locals[i * 2 + 0]); /* the exact global counts/weights are unknown (set gc/gw so that parts_range is not changed) */ gc = 0; gw = 0; lcs += lc; gcs += gc; lws += lw; gws += gw; gc = (parts_range[2 * 2 * parts[i] + 0 + 1] - parts_range[2 * 2 * parts[i] + 0 + 0]); gw = (parts_range[2 * 2 * parts[i] + 2 + 1] - parts_range[2 * 2 * parts[i] + 2 + 0]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_final); } rti_tstart(rti_tid_mpi_partition_radix2_while_check_post); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": sdispls[%" sl_int_type_fmt " + 1] = %d, lcs = %" sl_int_type_fmt, i, parts[i], parts[i], sdispls[parts[i] + 1], lcs); sdispls[parts[i] + 1] += lcs; if (gcs > 0 || gws > 0) { parts_range[l + 0 + 0] += gcs; parts_range[l + 0 + 1] = parts_range[l + 0 + 0] + gc; parts_range[l + 2 + 0] += gws; parts_range[l + 2 + 1] = parts_range[l + 2 + 0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 3. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": parts_range: %f %f / %f %f", i, parts[i], parts_range[2 * 2 * parts[i] + 0 + 0], parts_range[2 * 2 * parts[i] + 0 + 1], parts_range[2 * 2 * parts[i] + 2 + 0], parts_range[2 * 2 * parts[i] + 2 + 1]); parts_minmax_new[0 + 0] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 0], parts_range[l + 0 + 1]); parts_minmax_new[0 + 2] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 2], parts_range[l + 0 + 1]); parts_minmax_new[0 + 1] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 1], parts_range[l + 0 + 1]); parts_minmax_new[0 + 3] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 3], parts_range[l + 0 + 1]); parts_minmax_new[4 + 0] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 0], parts_range[l + 2 + 1]); parts_minmax_new[4 + 2] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 2], parts_range[l + 2 + 1]); parts_minmax_new[4 + 1] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 1], parts_range[l + 2 + 1]); parts_minmax_new[4 + 3] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 3], parts_range[l + 2 + 1]); } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 4. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] != parts_minmax[j + 0 + 0] || parts_minmax_new[0 + 2] != parts_minmax[j + 0 + 2] || parts_minmax_new[4 + 0] != parts_minmax[j + 4 + 0] || parts_minmax_new[4 + 2] != parts_minmax[j + 4 + 2]) { parts_minmax[j + 0 + 0] = parts_minmax_new[0 + 0]; parts_minmax[j + 0 + 2] = parts_minmax_new[0 + 2]; parts_minmax[j + 4 + 0] = parts_minmax_new[4 + 0]; parts_minmax[j + 4 + 2] = parts_minmax_new[4 + 2]; parts_update[parts[i] + 1] = 1; } if (parts_minmax_new[0 + 1] != parts_minmax[j + 0 + 1] || parts_minmax_new[0 + 3] != parts_minmax[j + 0 + 3] || parts_minmax_new[4 + 1] != parts_minmax[j + 4 + 1] || parts_minmax_new[4 + 3] != parts_minmax[j + 4 + 3]) { parts_minmax[j + 0 + 1] = parts_minmax_new[0 + 1]; parts_minmax[j + 0 + 3] = parts_minmax_new[0 + 3]; parts_minmax[j + 4 + 1] = parts_minmax_new[4 + 1]; parts_minmax[j + 4 + 3] = parts_minmax_new[4 + 3]; parts_update[parts[i] - 1] = 1; } parts_update[parts[i]] = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == part_areas[i] && last_new_class == k) part_areas[i] = nareas_new - 1; else { /* update last_new_... 
*/ last_new_area = part_areas[i]; last_new_class = k; /* create new area */ elem_assign_at(&areas[part_areas[i]], lcs, &areas_new[nareas_new]); areas_new[nareas_new].size = local_counts[part_areas[i] * nclasses + k]; part_areas[i] = nareas_new; ++nareas_new; } } else { /* save local count/weight for the later prefix calculations */ final_locals[2 * (i - nparts_removed * direction) + 0] = lc; final_locals[2 * (i - nparts_removed * direction) + 1] = lw; } parts[i - nparts_removed * direction] = parts[i]; part_areas[i - nparts_removed * direction] = part_areas[i]; } else ++nparts_removed; rti_tstop(rti_tid_mpi_partition_radix2_while_check_post); i += direction; } if (direction > 0) parts_high -= nparts_removed; else parts_low += nparts_removed; direction *= -1; /* SL_NOTICE_IF(DEBUG_OR_NOT, "nparts = %" sl_int_type_fmt " vs. nareas_new = %" sl_int_type_fmt, nparts, nareas_new);*/ rti_tstop(rti_tid_mpi_partition_radix2_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } rti_tstop(rti_tid_mpi_partition_radix2_while); /* create scounts */ for (i = 0; i < size - 1; ++i) scounts[i] = sdispls[i + 1] - sdispls[i]; scounts[size - 1] = s->size - sdispls[size - 1]; #ifdef SCOUNTS_SDISPLS printf("%d: scounts", rank); for (i = 0, j = 0; i < size; ++i) { printf(" %d", scounts[i]); j += scounts[i]; } printf(" = %" sl_int_type_fmt "\n", j); printf("%d: sdispls", rank); for (i = 0; i < size; ++i) printf(" %d", sdispls[i]); printf("\n"); #endif #ifdef RCOUNTS_RDISPLS rcounts = sl_alloc(size, sizeof(int)); rdispls = sl_alloc(size, sizeof(int)); MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm); rdispls[0] = 0; for (i = 1; i < size; ++i) rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; printf("%d: rcounts", rank); for (i = 0; i < size; ++i) printf(" %d", rcounts[i]); printf("\n"); printf("%d: rdispls", rank); for (i = 0; i < size; ++i) printf(" %d", rdispls[i]); 
printf("\n"); sl_free(rcounts); sl_free(rdispls); #endif sl_free(locals); sl_free(globals); #ifdef WEIGHT_STATS partial_counts[size] = 0; partial_weights[size] = 0.0; for (i = 0; i < size; ++i) { partial_counts[i] = scounts[i]; partial_weights[i] = 0.0; for (j = sdispls[i]; j < sdispls[i] + scounts[i]; ++j) partial_weights[i] += elem_weight_one(s, j); partial_counts[size] += partial_counts[i]; partial_weights[size] += partial_weights[i]; } #ifdef HAVENT_MPI_IN_PLACE MPI_Reduce(partial_counts, partial_counts2, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce(partial_weights, partial_weights2, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); # define partial_counts partial_counts2 # define partial_weights partial_weights2 #else /* recvbuf requires workaround for an in-place/aliased-buffer-check-bug in mpich2 (fixed with rev 5518) */ MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_counts, (rank == 0)?partial_counts:NULL, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_weights, (rank == 0)?partial_weights:NULL, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); #endif if (rank == 0) { printf("%d: total_count: %" sl_int_type_fmt " vs. 
%" sl_int_type_fmt "\n", rank, total_count, partial_counts[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %" sl_int_type_fmt " / %f - %" sl_int_type_fmt " / %f\n", rank, i, partial_counts[i], (double) partial_counts[i] / partial_counts[size], (partial_counts[size] / size) - partial_counts[i], fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])));*/ d += fabs((partial_counts[size] / size) - partial_counts[i]); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) < vmin) vmin = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) > vmax) vmax = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_count: %" sl_int_type_fmt " - %f / %f\n", rank, partial_counts[size] / size, d / size, d / partial_counts[size]); printf("%d: total_weight: %f vs. 
%f\n", rank, total_weight, partial_weights[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %f / %f - %f / %f\n", rank, i, partial_weights[i], partial_weights[i] / partial_weights[size], (partial_weights[size] / size) - partial_weights[i], fabs(1.0 - (partial_weights[i] * size / partial_weights[size])));*/ d += fabs((partial_weights[size] / size) - partial_weights[i]); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) < vmin) vmin = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) > vmax) vmax = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_weight: %f - %f / %f\n", rank, partial_weights[size] / size, d / size, d / partial_weights[size]); } #endif rti_tstop(rti_tid_mpi_partition_radix2); #if defined(TIMING_STATS) && defined(SL_USE_RTI_TIM) if (rank == 0) { printf("%d: mpi_partition_radix: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2)); printf("%d: mpi_partition_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_sync)); printf("%d: mpi_partition_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while)); printf("%d: mpi_partition_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_count)); printf("%d: mpi_partition_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_allreduce)); printf("%d: mpi_partition_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1)); printf("%d: mpi_partition_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1_allgather)); printf("%d: mpi_partition_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_exscan)); printf("%d: mpi_partition_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_check)); printf("%d: 
mpi_partition_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_pre)); printf("%d: mpi_partition_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_classes)); printf("%d: mpi_partition_radix: final: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_final)); printf("%d: mpi_partition_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_post)); } #endif return 0; }
/* One recursion level of the radix sort on `s` that rearranges the elements
 * inside `s` itself.
 *
 * The level handles the topmost `current_width` bits of the bit range
 * [rlow, rhigh]: it counts the elements per class, derives where each class
 * starts inside `s`, and moves every element into its class by exchanging
 * elements (`sx` is handed to elem_xchange as its third argument -- presumably
 * the scratch element for the swap; confirm against the macro definition).
 * Afterwards every class is sorted on the remaining lower bits, either by this
 * routine again or, if the class is not larger than `sx`, by rs_rec_ma_db with
 * `sx` as second buffer.
 *
 * s       elements to sort
 * sx      auxiliary elements
 * rhigh   highest bit still to be sorted
 * rlow    lowest bit to be sorted
 * rwidth  maximum number of bits handled per level (assumed to be at most
 *         sort_radix_width_max, since the class arrays are sized for that)
 *
 * Always returns 0.
 */
slint_t rs_rec_ma(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth) /* sl_func rs_rec_ma */
{
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t bit_mask, nclasses;

  slint_t cls, from, to, current_width, counts[max_nclasses];

  elements_t cur, limit, heads[max_nclasses];

  /* bits handled on this level; rhigh becomes the lowest bit of this digit */
  current_width = xmin(rwidth, rhigh - rlow + 1);
  rhigh -= current_width - 1;

  nclasses = powof2_typed(current_width, slkey_pure_t);
  bit_mask = nclasses - 1;

  /* start with an empty histogram */
  cls = 0;
  while (cls < nclasses)
  {
    counts[cls] = 0;
    ++cls;
  }

  /* histogram: how many elements fall into each class */
  elem_assign_at(s, s->size, &limit);
  elem_assign(s, &cur);
  while (cur.keys < limit.keys)
  {
    ++counts[key_radix_key2class(key_purify(*cur.keys), rhigh, bit_mask)];
    elem_inc(&cur);
  }

  /* heads[k] = first slot of class k inside s */
  elem_assign(s, &heads[0]);
  for (cls = 0; cls + 1 < nclasses; ++cls)
    elem_assign_at(&heads[cls], counts[cls], &heads[cls + 1]);

  /* move the elements into their classes; `limit` trails the end of the class being filled */
  elem_assign(s, &limit);
  for (cls = 0; cls < nclasses; ++cls)
  {
    elem_add(&limit, counts[cls]);

    /* heads[cls] may already have been advanced by earlier exchanges */
    elem_assign(&heads[cls], &cur);

    for (; cur.keys < limit.keys; elem_inc(&cur))
    {
      from = key_radix_key2class(key_purify(*cur.keys), rhigh, bit_mask);

      /* keep exchanging until the element sitting at `cur` belongs to class `cls` */
      while (from != cls)
      {
        to = key_radix_key2class(key_purify(*heads[from].keys), rhigh, bit_mask);

        if (to != from) elem_xchange(&cur, &heads[from], sx);

        elem_inc(&heads[from]);

        from = to;
      }
    }
  }

  --rhigh;

  /* no lower bits left: the classes are final */
  if (rhigh < rlow) return 0;

#ifdef SR_MA_INSERTSORT
  /* mask covering the remaining bits [rlow, rhigh] for the insertion sort */
  bit_mask = 0;
  if (rhigh - rlow + 1 <= key_radix_high) bit_mask = powof2_typed(rhigh - rlow + 1, slkey_pure_t);
  bit_mask = (bit_mask - 1) << rlow;
#endif

  /* sort every class on the remaining bits */
  elem_assign(s, &cur);
  for (cls = 0; cls < nclasses; ++cls)
  {
    cur.size = counts[cls];

#ifdef SR_MA_INSERTSORT
    if (cur.size > sort_radix_threshold_rec)
#else
    if (cur.size > 1)
#endif
    {
      /* a class that fits into sx is handed to the double-buffer variant */
      if (cur.size > sx->size) rs_rec_ma(&cur, sx, rhigh, rlow, rwidth);
      else rs_rec_ma_db(&cur, sx, rhigh, rlow, rwidth, 1);
    }
#ifdef SR_MA_INSERTSORT
    else
    {
      if (cur.size > 1) sort_insert_bmask_kernel(&cur, sx, bit_mask);
    }
#endif

    elem_add(&cur, counts[cls]);
  }

  return 0;
}
/**
 * Projected conjugate-gradient solve of the (n+m)-dimensional system whose
 * right-hand side is passed in rhs_.
 *
 * @param rhs_  On entry the right-hand side [b_1; b_2] (length n+m, must be a
 *              SimpleVector). On exit it is overwritten with the computed
 *              iterate [x; y] (x of length n, y of length m).
 *
 * Internal status codes (variable `flag`):
 *   0  recursive residual dropped to tol*||b||
 *   1  iteration limit reached
 *   2  negative curvature detected (p'Hp < 0)
 *   3  r'g became negative (attributed to rounding error)
 *   4  stagnation: more than maxstagsteps iterations without a new minimal
 *      residual; the minimal-residual iterate is returned in that case
 *
 * Side effects: allocates the tmpVec1..tmpVec6 work buffers on first use,
 * overwrites `maxit` with n/2+10, and prints status information.
 */
void PCGSolver::solvefull( OoqpVector& rhs_ )
{
  SimpleVector& b = dynamic_cast<SimpleVector&>(rhs_);
  assert(n+m == b.length());

  int flag, imin;
  int stagsteps, maxstagsteps;
  double normr, normr_act, normrmin;
  double alpha, beta, rg, pHp;

  double n2b = b.twonorm();
  double tolb = n2b*tol;

  // work buffers are allocated once and kept for later calls
  if(tmpVec1==NULL) tmpVec1=new double[n+m];
  if(tmpVec2==NULL) tmpVec2=new double[n+m];
  if(tmpVec3==NULL) tmpVec3=new double[n];
  if(tmpVec4==NULL) tmpVec4=new double[n];
  if(tmpVec5==NULL) tmpVec5=new double[n];
  if(tmpVec6==NULL) tmpVec6=new double[n+m];

  SimpleVector xy(tmpVec1, n+m);     //iterate
  SimpleVector auxnm(tmpVec2,n+m);   //auxiliary
  SimpleVector xmin(tmpVec3,n);      //minimal residual iterate
  SimpleVector g(tmpVec4,n);         //work vectors
  SimpleVector p(tmpVec5,n);
  SimpleVector res(tmpVec6, n+m);

  // views into the buffers above (no extra storage)
  SimpleVector x(& xy[0],n);         //x-part of the iterate
  SimpleVector y(& xy[n],m);         //y-part of the iterate
  SimpleVector rx(&res[0],n);        //residual
  SimpleVector ry(&res[n],m);        //residual corresponding to last m eqn
  SimpleVector auxn(&auxnm[0],n);
  SimpleVector auxm(&auxnm[n],m);

  //////////////////////////////////////////////////////////////////
  // Starting procedure
  /////////////////////////////////////////////////////////////////

  //find starting point x satisfying Ax=b_2
  applyM1(0.0, xy, 1.0, b);

  //compute the x-residual for the starting point rx=Hx-b_1
  rx.copyFromArray(&b[0]);
  applyA(-1.0, rx, 1.0, x);

  //find y such that it minimizes ||r-A'y||_Ginv
  //this is done by a preconditioner solve with rhs=[rx;0]
  ry.setToZero();
  applyM1(0.0, auxnm, 1.0, res);
  y.copyFromArray(&auxnm[n]);

  //remove A'y from residual
  At->doIt(1.0, rx, -1.0, y);

  //initialize projected residual g=Pr and update p=-g
  ry.setToZero();
  applyM1(0.0, auxnm, 1.0, res);
  g.copyFromArray(&auxnm[0]);
  p.copyFrom(g);
  p.negate();

  normr=rx.twonorm();
  rg = rx.dotProductWith(g);
  xmin.copyFrom(x);
  flag=1; imin=0;
  maxit=n/2+10;

  if(normr<tolb) {
    //initial guess is good enough
    for(int i=0; i<n; i++)   b[i]=x[i];
    for(int i=n; i<n+m; i++) b[i]=y[i-n];
    return;
  }

  stagsteps=0; maxstagsteps = 5;
  normrmin=normr;

  //////////////////////////////////////////////////////////////////
  // loop over maxit iterations
  //////////////////////////////////////////////////////////////////
  int ii=0;
  while(ii<maxit) {
    ii++;

    // compute Hp and p'Hp
    SimpleVector Hp(&auxn[0], n);
    applyA(0.0, Hp, 1.0, p);
    pHp = p.dotProductWith(Hp);

    //check for negative curvature
    if(pHp<0.0) { flag=2; break; }

    alpha = rg/pHp;

    //update x=x+alpha*p and r=r+alpha*H*p
    x.axpy(alpha, p);
    rx.axpy(alpha,Hp);
    normr=rx.twonorm();

    ///////////////////////////////////////
    //convergence tests
    ///////////////////////////////////////
    if(normr<=tolb) {
      //compute actual residual
      SimpleVector rx_act(&auxnm[0], n);
      rx_act.copyFromArray(&b[0]);
      applyA(-1.0, rx_act, 1.0, x);
      normr_act=rx_act.twonorm();

      //the actual residual is only reported, it is not re-tested:
      //if(normr_act/n2b<tolb) { flag=0; break; }
      { flag=0; break; }
    }

    if(normr<normrmin) {
      imin=ii; xmin.copyFrom(x); normrmin=normr; stagsteps=0;
    } else stagsteps++;

    //check for stagnation!!!
    if(stagsteps>maxstagsteps) { flag=4; break; }
    //------- end convergence tests -------

    //compute y that minimizes ||r-A'y||_Ginv
    ry.setToZero();
    applyM1(0.0, auxnm, 1.0, res);
    y.copyFromArray(&auxnm[n]);

    //subtract A'y from r
    At->doIt(1.0, rx, -1.0, y);

    //projected residual g=Pr
    ry.setToZero();
    applyM1(0.0, auxnm, 1.0, res);
    g.copyFromArray(&auxnm[0]);

    double rpgp = rx.dotProductWith(g);
    beta = rpgp/rg;

    //p = -g+beta*p
    p.scale(beta);
    p.axpy(-1.0, g);

    rg = rpgp;

    //rounding error
    if(rg<0.0) { flag=3; break; }
  }

  //////////////////////////////////////////////////////////
  // status/error output
  /////////////////////////////////////////////////////////
  if(flag==0) {
    double relres = normr_act/n2b;
    printf("CG converged: actual normResid=%g relResid=%g iter=%d\n",
           normr_act, relres, ii);

    b.setToZero();
    for(int i=0; i<n; i++)   b[i] = x[i];
    for(int i=n; i<m+n;i++)  b[i] = y[i-n];
  } else {
    //on stagnation fall back to the iterate with the smallest residual
    if(flag==4) x.copyFrom(xmin);

    //compute actual residual (b still holds the right-hand side here);
    //a separate name avoids shadowing the recursive residual rx
    SimpleVector rx_act(&auxnm[0], n);
    rx_act.copyFromArray(&b[0]);
    applyA(1.0, rx_act, -1.0, x);
    normr_act = rx_act.twonorm();

    for(int i=0; i<n; i++)   b[i] = x[i];
    for(int i=n; i<m+n;i++)  b[i] = y[i-n];

    if(gOoqpPrintLevel>=1) {
      printf("Projected CG did NOT converge after %d iters. The solution from iter %d was returned.\n",
             ii, imin);
      //"Rel res" shows two values: the actual residual and the smallest
      //recursive residual, both relative to ||b||; every conversion in the
      //format string needs a matching argument
      printf("\t - Error code %d\n\t - Act res=%g\n\t - Rel res=%g %g\n\n",
             flag, normr_act, normr_act/n2b, normrmin/n2b);
    }
  }
  //b.copyFrom(x);
}
/* One recursion level of the radix sort that distributes between two buffers.
 *
 * The elements of `s` are counted per class for the topmost `current_width`
 * bits of [rlow, rhigh] and then copied, class by class, into `sx`. The
 * classes (now located in `sx`) are sorted on the remaining lower bits by
 * recursing with the roles of the two buffers swapped and `switchdb` negated.
 *
 * s         elements to sort (source of this level's distribution)
 * sx        second buffer receiving the distributed elements; the code
 *           assumes it can hold at least s->size elements
 * rhigh     highest bit still to be sorted
 * rlow      lowest bit to be sorted
 * rwidth    maximum number of bits handled per level (assumed to be at most
 *           sort_radix_width_max, since the class arrays are sized for that)
 * switchdb  non-zero if a class that is not processed any further on this
 *           level has to be copied from `sx` back into `s`
 *
 * Always returns 0.
 */
slint_t rs_rec_ma_db(elements_t *s, elements_t *sx, slint_t rhigh, slint_t rlow, slint_t rwidth, slint_t switchdb) /* sl_func rs_rec_ma_db */
{
#define max_nclasses (powof2_typed(sort_radix_width_max, slkey_pure_t))

  slkey_pure_t bit_mask, nclasses;

  slint_t i, j, current_width, c[max_nclasses];

  elements_t xi, xj, end, parts[max_nclasses];

  elem_assign_at(s, s->size, &end);

  /* bits handled on this level; rhigh becomes the lowest bit of this digit */
  current_width = xmin(rwidth, rhigh - rlow + 1);
  rhigh -= current_width - 1;

  nclasses = powof2_typed(current_width, slkey_pure_t);
  bit_mask = nclasses - 1;

  /* zero all counter */
  for (i = 0; i < nclasses; i++) c[i] = 0;

  /* count the number of elements in every class */
  for (elem_assign(s, &xi); xi.keys < end.keys; elem_inc(&xi)) ++c[key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask)];

  /* compute the target of every class (inside sx) */
  elem_assign(sx, &parts[0]);
  for (i = 1; i < nclasses; i++) elem_assign_at(&parts[i - 1], c[i - 1], &parts[i]);

  /* split the elements: copy each element of s to the next free slot of its class in sx
     (`end` still marks the end of s) */
  elem_assign(s, &xi);
  while (xi.keys < end.keys)
  {
    j = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);

    elem_copy(&xi, &parts[j]);

    elem_inc(&xi);
    elem_inc(&parts[j]);
  }

  --rhigh;

  if (rhigh >= rlow)
  {
#ifdef SR_MA_INSERTSORT
    /* mask covering the remaining bits [rlow, rhigh] for the insertion sort */
    bit_mask = 0;
    if (rhigh - rlow + 1 <= key_radix_high) bit_mask = powof2_typed(rhigh - rlow + 1, slkey_pure_t);
    bit_mask = (bit_mask - 1) << rlow;
#endif

    /* xi walks over the classes in s, xj over the same classes in sx */
    elem_assign(s, &xi);
    elem_assign(sx, &xj);
    for (i = 0; i < nclasses; i++)
    {
      xi.size = xj.size = c[i];

      /* Only the recursion threshold depends on SR_MA_INSERTSORT. The loop
         body, the copy back and the advance of xi/xj are shared by both
         configurations, so the braces stay balanced either way. */
#ifdef SR_MA_INSERTSORT
      if (c[i] > sort_radix_threshold_rec)
#else
      if (c[i] > 1)
#endif
      {
        /* sort this class from sx back towards s with the buffers swapped */
        rs_rec_ma_db(&xj, &xi, rhigh, rlow, rwidth, (!switchdb));
      }
      else
      {
#ifdef SR_MA_INSERTSORT
        if (c[i] > 1) sort_insert_bmask_kernel(&xj, &xi, bit_mask);
#endif
        /* the class stays where it is in sx; copy it to s if requested */
        if (switchdb) elem_ncopy(&xj, &xi, c[i]);
      }

      elem_add(&xi, c[i]);
      elem_add(&xj, c[i]);
    }
  }
  else elem_ncopy(sx, s, s->size); /* no bits left: copy the distributed elements from sx to s */

  return 0;
}