bool SmoothConstrainedInterpolator::ProjectVelocity(const Config& x,Config& v) { constraint->PreEval(x); Matrix J; constraint->Jacobian(x,J); if(!xmin.empty()) { //look through active contraints, set that column to 0 for(int i=0;i<x.n;i++) { if(x(i)==xmin(i) || x(i) == xmax(i)) { v(i) = 0; for(int j=0;j<J.m;j++) J(j,i) = 0; } } } RobustSVD<Real> svd; bool res=svd.set(J); if(!res) { fprintf(stderr,"SmoothConstrainedInterpolator: Numerical error projecting velocity?\n"); return false; } Vector temp; svd.nullspaceComponent(v,temp); v -= temp; return true; }
void Instance::transformBoundingBox() { auto b = i->getBoundingBox(); BoundingBox bb; for(int i = 0; i <= RayTracer::getInstance()->maxTime; ++i) { std::set<double> x,y,z; auto m = makeMatrices(i); auto p = Vector(b.xmin(i),0,0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(b.xmax(i),0,0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,b.ymin(i),0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,b.ymax(i),0); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,0,b.zmin(i)); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); p = Vector(0,0,b.zmax(i)); p = transformLoc(m.first, p); x.insert(p.x); y.insert(p.y); z.insert(p.z); bb.xmin.addFrame(i, *x.begin()); bb.xmax.addFrame(i, *x.rbegin()); bb.ymin.addFrame(i, *y.begin()); bb.ymax.addFrame(i, *y.rbegin()); bb.zmin.addFrame(i, *z.begin()); bb.zmax.addFrame(i, *z.rbegin()); } bbox = bb; }
int QDeclarativeDrag::qt_metacall(QMetaObject::Call _c, int _id, void **_a) { _id = QObject::qt_metacall(_c, _id, _a); if (_id < 0) return _id; if (_c == QMetaObject::InvokeMetaMethod) { if (_id < 8) qt_static_metacall(this, _c, _id, _a); _id -= 8; } #ifndef QT_NO_PROPERTIES else if (_c == QMetaObject::ReadProperty) { void *_v = _a[0]; switch (_id) { case 0: *reinterpret_cast< QGraphicsObject**>(_v) = target(); break; case 1: *reinterpret_cast< Axis*>(_v) = axis(); break; case 2: *reinterpret_cast< qreal*>(_v) = xmin(); break; case 3: *reinterpret_cast< qreal*>(_v) = xmax(); break; case 4: *reinterpret_cast< qreal*>(_v) = ymin(); break; case 5: *reinterpret_cast< qreal*>(_v) = ymax(); break; case 6: *reinterpret_cast< bool*>(_v) = active(); break; case 7: *reinterpret_cast< bool*>(_v) = filterChildren(); break; } _id -= 8; } else if (_c == QMetaObject::WriteProperty) { void *_v = _a[0]; switch (_id) { case 0: setTarget(*reinterpret_cast< QGraphicsObject**>(_v)); break; case 1: setAxis(*reinterpret_cast< Axis*>(_v)); break; case 2: setXmin(*reinterpret_cast< qreal*>(_v)); break; case 3: setXmax(*reinterpret_cast< qreal*>(_v)); break; case 4: setYmin(*reinterpret_cast< qreal*>(_v)); break; case 5: setYmax(*reinterpret_cast< qreal*>(_v)); break; case 7: setFilterChildren(*reinterpret_cast< bool*>(_v)); break; } _id -= 8; } else if (_c == QMetaObject::ResetProperty) { switch (_id) { case 0: resetTarget(); break; } _id -= 8; } else if (_c == QMetaObject::QueryPropertyDesignable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyScriptable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyStored) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyEditable) { _id -= 8; } else if (_c == QMetaObject::QueryPropertyUser) { _id -= 8; } #endif // QT_NO_PROPERTIES return _id; }
float size() const { if (xmax() < xmin() || ymax() < ymin()) { // If box is invalid (e.g. xmax < xmin or ymax < ymin), return 0. return 0.0f; } else { return width() * height(); } }
// For every sample index iboot in [0, y.nboot] (inclusive), finds the element
// of y with the largest value at that index and stores the matching abscissa
// x[...] in the result. Ties are resolved in favour of the last element,
// because the comparison is >=.
//
// NOTE(review): the inclusive upper bound writes nboot+1 entries of
// result.data; presumably the extra slot is intentional for this type -
// confirm against the boot definition.
boot find_xmax(double *x,bvec y)
{
  boot result(y.nboot,y.njack);

  int iboot=0;
  while(iboot<=y.nboot)
    {
      int best=0;
      for(int iel=0;iel<y.nel;iel++)
        {
          if(y[iel][iboot]>=y[best][iboot])
            best=iel;
        }
      result.data[iboot]=x[best];
      iboot++;
    }

  return result;
}
// Centers the view horizontally on the middle of [a, b] and then steps the
// zoom level until the visible x-range (xmax() - xmin()) is the smallest one
// that is not narrower than (b - a).
//
//  - If the view is currently wider than requested, the zoom level is
//    decreased until the view is no longer wider, then increased once if
//    that overshoot made it narrower than requested.
//  - Otherwise the zoom level is increased until the view is no longer
//    narrower than requested.
//
// Robustness fixes relative to the previous version:
//  - A non-positive requested width (b <= a) is not zoomed to at all; the
//    old loop could never reach such a width and would spin.
//  - Each loop stops as soon as a zoom step no longer changes the visible
//    range (e.g. the zoom level is clamped), instead of looping forever.
void CartesianWidget::setXRange(double a, double b)
{
    setCenter((a+b)/2.0, centerY());

    const double wanted = b - a;

    if ( wanted > 0.0 ) {
        if ( xmax()-xmin() > wanted ) {
            // Zoom in until the view is no wider than requested.
            while ( xmax()-xmin() > wanted ) {
                const double before = xmax()-xmin();
                setZoomLevel(zoomLevel()-1);
                if ( xmax()-xmin() == before ) {
                    break;  // zoom step had no effect: cannot get any closer
                }
            }
            // Went one step too far: back off so [a, b] stays visible.
            if ( xmax()-xmin() < wanted ) {
                setZoomLevel(zoomLevel()+1);
            }
        } else {
            // Zoom out until the view is at least as wide as requested.
            while ( xmax()-xmin() < wanted ) {
                const double before = xmax()-xmin();
                setZoomLevel(zoomLevel()+1);
                if ( xmax()-xmin() == before ) {
                    break;  // zoom step had no effect: cannot get any closer
                }
            }
        }
    }

    update();
}
/* single even-stage
 *
 * Sorting-network callback with exactly one stage.
 *   - rank outside [.., size-1]  -> returns -1 ('finished')
 *   - stage < 0                  -> returns the number of stages (1)
 *   - stage >= number of stages  -> returns -1 ('finished')
 * Otherwise *up (if given) is set to 0 and the partner rank is returned:
 * rank + 1 when rank % 2 == 1, else rank - 1, clamped to [0, size - 1].
 * The snp argument is not used.
 */
slint sn_even(slint size, slint rank, slint stage, void *snp, slint *up) /* sl_proto, sl_func sn_even */
{
  const slint nstages = 1;
  slint partner;

  /* a rank that is out of range has nothing to do */
  if (rank >= size) return -1;

  /* a negative stage is a query for the number of stages */
  if (stage < 0) return nstages;

  /* a stage that is too large means the network is done */
  if (stage >= nstages) return -1;

  if (up != NULL) *up = 0;

  if (1 == rank % 2) partner = rank + 1;
  else partner = rank - 1;

  return xmax(0, xmin(size - 1, partner));
}
// Default constructor: zero-initializes every member, then calls set() with
// 1x1x1 cell dimensions over the box from (0,0,0) to (1,1,1).
AccelerationGrid::AccelerationGrid()
   : m_cells(0,0,0),
     m_elementidxs(0),
     m_elementxmins(0),
     m_elementxmaxs(0),
     m_elementquery(0),
     m_lastquery(0),
     m_gridxmin(0,0,0),
     m_gridxmax(0,0,0),
     m_cellsize(0,0,0),
     m_invcellsize(0,0,0)
{
   Vec3st unitDims(1,1,1);
   Vec3d lowCorner(0,0,0);
   Vec3d highCorner(1,1,1);
   set(unitDims, lowCorner, highCorner);
}
/*
 * Returns a newly allocated copy of part of src.
 *
 *   begin  start offset; a negative value counts back from the end of src
 *          (clamped to 0), a non-negative value is clamped to strlen(src).
 *   len    number of characters requested; -1 means "the length of src".
 *
 * The buffer holds min(strlen(src), len) characters plus a terminating NUL;
 * strncpy() stops at the end of src and zero-fills the remainder.
 * Returns NULL if malloc_w() fails. The caller owns the returned buffer.
 */
static char *xstrsub(const char *src, int begin, int len)
{
	size_t src_len = strlen(src);
	int wanted = (len == -1) ? (int)src_len : len;
	size_t ncopy = xmin(src_len, wanted);
	int start;
	char *out;

	out = (char *)malloc_w((ncopy + 1) * sizeof(char), __func__);
	if (!out)
		return NULL;

	if (begin < 0)
		start = xmax((int)src_len + begin, 0);
	else
		start = xmin(src_len, begin);

	strncpy(out, src + start, ncopy);
	out[ncopy] = '\0';
	return out;
}
/* Appends LEN bytes starting at DATA to the in-memory buffer of STREAM.
   When the free space is too small, the buffer is reallocated to the larger
   of (buflen + len) and (2 * allocated); if that size computation overflows,
   the program terminates through error().  A zero LEN is a no-op.  */
static void
memory_ostream::write_mem (memory_ostream_t stream, const void *data, size_t len)
{
  if (len == 0)
    return;

  if (len > stream->allocated - stream->buflen)
    {
      size_t needed = xsum (stream->buflen, len);
      size_t doubled = xsum (stream->allocated, stream->allocated);
      size_t new_allocated = xmax (needed, doubled);

      if (size_overflow_p (new_allocated))
        error (EXIT_FAILURE, 0,
               _("%s: too much output, buffer size overflow"),
               "memory_ostream");

      stream->buffer = (char *) xrealloc (stream->buffer, new_allocated);
      stream->allocated = new_allocated;
    }

  memcpy (stream->buffer + stream->buflen, data, len);
  stream->buflen += len;
}
//------------------------------------------------------------------------------------------------------------------------------------ // called when we want to draw the 3D data in our app. //------------------------------------------------------------------------------------------------------------------------------------ void draw3D() { const float DEG_TO_RAD = PI / 180.0f; const Vec3 xAxis(1.0f, 0, 0); const Vec3 yAxis(0, 1.0f, 0); translate(0, 0, -g_zoom); translate(g_tx, g_ty, 0); rotate(g_rotx * DEG_TO_RAD, xAxis); rotate(g_roty * DEG_TO_RAD, yAxis); // draw the grid on the floor setColour(0.25f, 0.25f, 0.25f); for(float i = -10.0f; i <= 10.1f; i += 1.0f) { Vec3 zmin(i, 0, -10); Vec3 zmax(i, 0, 10); Vec3 xmin(-10, 0, i); Vec3 xmax(10, 0, i); drawLine(xmin, xmax); drawLine(zmin, zmax); } }
int main(int argc, char **argv) { int c = 0; long i_start_arg = 1; long i_end_arg = N; int i_start = 1; int i_end = N; mpfr_fn sin_fn = 0; mpfr_fn cos_fn = 0; for (int k = 0; k < argc; ++k) { printf("%s ", argv[k]); } printf("\n"); while ((c = getopt(argc, argv, "i:j:f:")) != -1) { switch (c) { case 'i': errno = 0; i_start_arg = strtoll(optarg, 0, 0); if (errno) { fprintf(stderr, "bad start index %s\n", optarg); return 1; } break; case 'j': errno = 0; i_end_arg = strtoll(optarg, 0, 0); if (errno) { fprintf(stderr, "bad end index %s\n", optarg); return 1; } break; case 'f': if (!strcmp(optarg, "sin")) { sin_fn = mpfr_sin; cos_fn = mpfr_cos; } else if (!strcmp(optarg, "tan")) { sin_fn = mpfr_tan; cos_fn = mpfr_cot; } else { fprintf(stderr, "unknown function %s\n", optarg); return 1; } break; default: usage(); break; } } if (i_start_arg <= 0 || i_end_arg > N) { printf("truncating start to (0, %d]\n", N); i_start_arg = xmin(xmax(i_start_arg, 1), N); } if (i_end_arg <= 0 || i_end_arg > N) { printf("truncating end to (0, %d]\n", N); i_end_arg = xmin(xmax(i_end_arg, 1), N); } i_start = i_start_arg; i_end = i_end_arg; if (!sin_fn || !cos_fn) { fprintf(stderr, "-f required\n"); return 1; } for (int i = i_start; i <= i_end; ++i) { if (find_triple_64(i, 11, 20, sin_fn, cos_fn) < 0) { /* This indicates you should drop the range limitations on r, re-run, and come back in a week. */ printf("CANNOT FIND SUITABLE CANDIDATE FOR i = %03d\n", i); } } return 0; }
// Sets up the NPSOL work arrays from the function-evaluation object (_feval)
// and calls npsol_() to run the optimization.
//
// Design variables and their bounds come from _feval->init_dvar(). There are
// no linear constraints (NCLIN = 0); all n_eq() + n_ineq() constraints are
// passed as nonlinear constraints of the form g_i(x) <= 0, i.e. with bounds
// [-1e20, 0].
//
// The whole body is compiled only when MAST_ENABLE_NPSOL == 1; otherwise this
// function does nothing.
//
// Fix: C (the nonlinear constraint values) used to be sized NCNLN, so for a
// problem without constraints &C[0] indexed an empty vector, which is
// undefined behaviour. It is now sized max(NCNLN, 1), matching how LDJ/CJAC
// are already handled. Loop counters are int so they are no longer compared
// against the signed N / NCNLN as unsigned values.
void
MAST::NPSOLOptimizationInterface::optimize() {

#if MAST_ENABLE_NPSOL == 1
    // make sure that functions have been provided
    libmesh_assert(_funobj);
    libmesh_assert(_funcon);

    int
    N      =  _feval->n_vars(),
    NCLIN  =  0,
    NCNLN  =  _feval->n_eq()+_feval->n_ineq(),
    NCTOTL =  N+NCLIN+NCNLN,
    LDA    =  std::max(NCLIN, 1),
    LDJ    =  std::max(NCNLN, 1),
    LDR    =  N,
    INFORM =  0,           // on exit: Reports result of call to NPSOL
                           // < 0 either funobj or funcon has set this to -ve
                           // 0 => converged to point x
                           // 1 => x satisfies optimality conditions, but sequence of iterates has not converged
                           // 2 => Linear constraints and bounds cannot be satisfied. No feasible solution
                           // 3 => Nonlinear constraints and bounds cannot be satisfied. No feasible solution
                           // 4 => Major iter limit was reached
                           // 6 => x does not satisfy first-order optimality to required accuracy
                           // 7 => function derivatives seem to be incorrect
                           // 9 => input parameter invalid
    ITER   =  0,           // iter count
    LENIW  =  3*N + NCLIN + 2*NCNLN,
    LENW   =  2*N*N + N*NCLIN + 2*N*NCNLN + 20*N + 11*NCLIN + 21*NCNLN;

    Real
    F      =  0.;          // on exit: final objective

    std::vector<int>
    IW      (LENIW,  0),
    ISTATE  (NCTOTL, 0);   // status of constraints l <= r(x) <= u,
                           // -2 => lower bound is violated by more than delta
                           // -1 => upper bound is violated by more than delta
                           // 0  => both bounds are satisfied by more than delta
                           // 1  => lower bound is active (to within delta)
                           // 2  => upper bound is active (to within delta)
                           // 3  => bounds are equal and equality constraint is satisfied

    std::vector<Real>
    A       (LDA,    0.),  // this is used for linear constraints, not currently handled
    BL      (NCTOTL, 0.),
    BU      (NCTOTL, 0.),
    C       (std::max(NCNLN, 1), 0.),  // on exit: nonlinear constraints
                                       // (at least one entry so that &C[0] is always valid)
    CJAC    (LDJ* N, 0.),  // on exit: CJAC(i,j) is the partial derivative of ith nonlinear constraint
    CLAMBDA (NCTOTL, 0.),  // on entry: need not be initialized for cold start
                           // on exit: QP multiplier from the QP subproblem, >=0 if istate(j)=1, <0 if istate(j)=2
    G       (N,      0.),  // on exit: objective gradient
    R       (LDR*N,  0.),  // on entry: need not be initialized if called with Cold Start
                           // on exit: information about Hessian, if Hessian=Yes, R is upper Cholesky factor of approx H
    X       (N,      0.),  // on entry: initial point
                           // on exit: final estimate of solution
    W       (LENW,   0.),  // workspace
    xmin    (N,      0.),
    xmax    (N,      0.);

    // now setup the lower and upper limits for the variables and constraints
    _feval->init_dvar(X, xmin, xmax);
    for (int i=0; i<N; i++) {
        BL[i] = xmin[i];
        BU[i] = xmax[i];
    }

    // all constraints are assumed to be g_i(x) <= 0, so that the upper
    // bound is 0 and lower bound is -infinity
    for (int i=0; i<NCNLN; i++) {
        BL[i+N] = -1.e20;
        BU[i+N] =     0.;
    }

    // kept for the (currently disabled) NPSOL option calls below
    std::string nm;
//    nm = "List";
//    npoptn_(nm.c_str(), (int)nm.length());
//    nm = "Verify level 3";
//    npoptn_(nm.c_str(), (int)nm.length());

    npsol_(&N,
           &NCLIN,
           &NCNLN,
           &LDA,
           &LDJ,
           &LDR,
           &A[0],
           &BL[0],
           &BU[0],
           _funcon,
           _funobj,
           &INFORM,
           &ITER,
           &ISTATE[0],
           &C[0],
           &CJAC[0],
           &CLAMBDA[0],
           &F,
           &G[0],
           &R[0],
           &X[0],
           &IW[0],
           &LENIW,
           &W[0],
           &LENW);

#endif // MAST_ENABLE_NPSOL 1
}
/* VASNPRINTF: formats FORMAT with the arguments in ARGS into a buffer and returns it. If RESULTBUF is non-NULL it is used as the initial buffer, with *LENGTHP giving its size in CHAR_T units; once more room is needed the result is moved to malloc'ed memory (see ENSURE_ALLOCATION), so the return value may differ from RESULTBUF. On success the result is NUL-terminated, *LENGTHP receives its length without the trailing NUL, and any malloc'ed result has been shrunk with realloc where possible. On failure NULL is returned with errno set to EINVAL (format parsing, argument fetching or a directive's sprintf/snprintf call failed) or ENOMEM (allocation failed or a size computation overflowed), and a malloc'ed partial result is freed. Each directive is re-emitted into the small format buffer BUF and executed with SNPRINTF (when USE_SNPRINTF) or with sprintf into a temporary buffer sized from the conversion, width and precision. */
CHAR_T * VASNPRINTF (CHAR_T *resultbuf, size_t *lengthp, const CHAR_T *format, va_list args) { DIRECTIVES d; arguments a; if (PRINTF_PARSE (format, &d, &a) < 0) { errno = EINVAL; return NULL; } #define CLEANUP() \ free (d.dir); \ if (a.arg) \ free (a.arg); if (printf_fetchargs (args, &a) < 0) { CLEANUP (); errno = EINVAL; return NULL; } { size_t buf_neededlength; CHAR_T *buf; CHAR_T *buf_malloced; const CHAR_T *cp; size_t i; DIRECTIVE *dp; /* Output string accumulator. */ CHAR_T *result; size_t allocated; size_t length; /* Allocate a small buffer that will hold a directive passed to sprintf or snprintf. */ buf_neededlength = xsum4 (7, d.max_width_length, d.max_precision_length, 6); #if HAVE_ALLOCA if (buf_neededlength < 4000 / sizeof (CHAR_T)) { buf = (CHAR_T *) alloca (buf_neededlength * sizeof (CHAR_T)); buf_malloced = NULL; } else #endif { size_t buf_memsize = xtimes (buf_neededlength, sizeof (CHAR_T)); if (size_overflow_p (buf_memsize)) goto out_of_memory_1; buf = (CHAR_T *) malloc (buf_memsize); if (buf == NULL) goto out_of_memory_1; buf_malloced = buf; } if (resultbuf != NULL) { result = resultbuf; allocated = *lengthp; } else { result = NULL; allocated = 0; } length = 0; /* Invariants: result is either == resultbuf or == NULL or malloc-allocated. If length > 0, then result != NULL. */ /* Ensures that allocated >= needed. Aborts through a jump to out_of_memory if needed is SIZE_MAX or otherwise too big. */ #define ENSURE_ALLOCATION(needed) \ if ((needed) > allocated) \ { \ size_t memory_size; \ CHAR_T *memory; \ \ allocated = (allocated > 0 ? 
xtimes (allocated, 2) : 12); \ if ((needed) > allocated) \ allocated = (needed); \ memory_size = xtimes (allocated, sizeof (CHAR_T)); \ if (size_overflow_p (memory_size)) \ goto out_of_memory; \ if (result == resultbuf || result == NULL) \ memory = (CHAR_T *) malloc (memory_size); \ else \ memory = (CHAR_T *) realloc (result, memory_size); \ if (memory == NULL) \ goto out_of_memory; \ if (result == resultbuf && length > 0) \ memcpy (memory, result, length * sizeof (CHAR_T)); \ result = memory; \ } for (cp = format, i = 0, dp = &d.dir[0]; ; cp = dp->dir_end, i++, dp++) { if (cp != dp->dir_start) { size_t n = dp->dir_start - cp; size_t augmented_length = xsum (length, n); ENSURE_ALLOCATION (augmented_length); memcpy (result + length, cp, n * sizeof (CHAR_T)); length = augmented_length; } if (i == d.count) break; /* Execute a single directive. */ if (dp->conversion == '%') { size_t augmented_length; if (!(dp->arg_index == ARG_NONE)) abort (); augmented_length = xsum (length, 1); ENSURE_ALLOCATION (augmented_length); result[length] = '%'; length = augmented_length; } else { if (!(dp->arg_index != ARG_NONE)) abort (); if (dp->conversion == 'n') { switch (a.arg[dp->arg_index].type) { case TYPE_COUNT_SCHAR_POINTER: *a.arg[dp->arg_index].a.a_count_schar_pointer = length; break; case TYPE_COUNT_SHORT_POINTER: *a.arg[dp->arg_index].a.a_count_short_pointer = length; break; case TYPE_COUNT_INT_POINTER: *a.arg[dp->arg_index].a.a_count_int_pointer = length; break; case TYPE_COUNT_LONGINT_POINTER: *a.arg[dp->arg_index].a.a_count_longint_pointer = length; break; #ifdef HAVE_LONG_LONG case TYPE_COUNT_LONGLONGINT_POINTER: *a.arg[dp->arg_index].a.a_count_longlongint_pointer = length; break; #endif default: abort (); } } else { arg_type type = a.arg[dp->arg_index].type; CHAR_T *p; unsigned int prefix_count; int prefixes[2]; #if !USE_SNPRINTF size_t tmp_length; CHAR_T tmpbuf[700]; CHAR_T *tmp; /* Allocate a temporary buffer of sufficient size for calling sprintf. 
*/ { size_t width; size_t precision; width = 0; if (dp->width_start != dp->width_end) { if (dp->width_arg_index != ARG_NONE) { int arg; if (!(a.arg[dp->width_arg_index].type == TYPE_INT)) abort (); arg = a.arg[dp->width_arg_index].a.a_int; width = (arg < 0 ? (unsigned int) (-arg) : arg); } else { const CHAR_T *digitp = dp->width_start; do width = xsum (xtimes (width, 10), *digitp++ - '0'); while (digitp != dp->width_end); } } precision = 6; if (dp->precision_start != dp->precision_end) { if (dp->precision_arg_index != ARG_NONE) { int arg; if (!(a.arg[dp->precision_arg_index].type == TYPE_INT)) abort (); arg = a.arg[dp->precision_arg_index].a.a_int; precision = (arg < 0 ? 0 : arg); } else { const CHAR_T *digitp = dp->precision_start + 1; precision = 0; do precision = xsum (xtimes (precision, 10), *digitp++ - '0'); while (digitp != dp->precision_end); } } switch (dp->conversion) { case 'd': case 'i': case 'u': # ifdef HAVE_LONG_LONG if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT) tmp_length = (unsigned int) (sizeof (unsigned long long) * CHAR_BIT * 0.30103 /* binary -> decimal */ * 2 /* estimate for FLAG_GROUP */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ else # endif if (type == TYPE_LONGINT || type == TYPE_ULONGINT) tmp_length = (unsigned int) (sizeof (unsigned long) * CHAR_BIT * 0.30103 /* binary -> decimal */ * 2 /* estimate for FLAG_GROUP */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ else tmp_length = (unsigned int) (sizeof (unsigned int) * CHAR_BIT * 0.30103 /* binary -> decimal */ * 2 /* estimate for FLAG_GROUP */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ break; case 'o': # ifdef HAVE_LONG_LONG if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT) tmp_length = (unsigned int) (sizeof (unsigned long long) * CHAR_BIT * 0.333334 /* binary -> octal */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ else # endif if (type == TYPE_LONGINT || type == 
TYPE_ULONGINT) tmp_length = (unsigned int) (sizeof (unsigned long) * CHAR_BIT * 0.333334 /* binary -> octal */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ else tmp_length = (unsigned int) (sizeof (unsigned int) * CHAR_BIT * 0.333334 /* binary -> octal */ ) + 1 /* turn floor into ceil */ + 1; /* account for leading sign */ break; case 'x': case 'X': # ifdef HAVE_LONG_LONG if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT) tmp_length = (unsigned int) (sizeof (unsigned long long) * CHAR_BIT * 0.25 /* binary -> hexadecimal */ ) + 1 /* turn floor into ceil */ + 2; /* account for leading sign or alternate form */ else # endif if (type == TYPE_LONGINT || type == TYPE_ULONGINT) tmp_length = (unsigned int) (sizeof (unsigned long) * CHAR_BIT * 0.25 /* binary -> hexadecimal */ ) + 1 /* turn floor into ceil */ + 2; /* account for leading sign or alternate form */ else tmp_length = (unsigned int) (sizeof (unsigned int) * CHAR_BIT * 0.25 /* binary -> hexadecimal */ ) + 1 /* turn floor into ceil */ + 2; /* account for leading sign or alternate form */ break; case 'f': case 'F': # ifdef HAVE_LONG_DOUBLE if (type == TYPE_LONGDOUBLE) tmp_length = (unsigned int) (LDBL_MAX_EXP * 0.30103 /* binary -> decimal */ * 2 /* estimate for FLAG_GROUP */ ) + 1 /* turn floor into ceil */ + 10; /* sign, decimal point etc. */ else # endif tmp_length = (unsigned int) (DBL_MAX_EXP * 0.30103 /* binary -> decimal */ * 2 /* estimate for FLAG_GROUP */ ) + 1 /* turn floor into ceil */ + 10; /* sign, decimal point etc. */ tmp_length = xsum (tmp_length, precision); break; case 'e': case 'E': case 'g': case 'G': case 'a': case 'A': tmp_length = 12; /* sign, decimal point, exponent etc. 
*/ tmp_length = xsum (tmp_length, precision); break; case 'c': # if defined HAVE_WINT_T && !WIDE_CHAR_VERSION if (type == TYPE_WIDE_CHAR) tmp_length = MB_CUR_MAX; else # endif tmp_length = 1; break; case 's': # ifdef HAVE_WCHAR_T if (type == TYPE_WIDE_STRING) { tmp_length = local_wcslen (a.arg[dp->arg_index].a.a_wide_string); # if !WIDE_CHAR_VERSION tmp_length = xtimes (tmp_length, MB_CUR_MAX); # endif } else # endif tmp_length = strlen (a.arg[dp->arg_index].a.a_string); break; case 'p': tmp_length = (unsigned int) (sizeof (void *) * CHAR_BIT * 0.25 /* binary -> hexadecimal */ ) + 1 /* turn floor into ceil */ + 2; /* account for leading 0x */ break; default: abort (); } if (tmp_length < width) tmp_length = width; tmp_length = xsum (tmp_length, 1); /* account for trailing NUL */ } if (tmp_length <= sizeof (tmpbuf) / sizeof (CHAR_T)) tmp = tmpbuf; else { size_t tmp_memsize = xtimes (tmp_length, sizeof (CHAR_T)); if (size_overflow_p (tmp_memsize)) /* Overflow, would lead to out of memory. */ goto out_of_memory; tmp = (CHAR_T *) malloc (tmp_memsize); if (tmp == NULL) /* Out of memory. */ goto out_of_memory; } #endif /* Construct the format string for calling snprintf or sprintf. 
*/ p = buf; *p++ = '%'; if (dp->flags & FLAG_GROUP) *p++ = '\''; if (dp->flags & FLAG_LEFT) *p++ = '-'; if (dp->flags & FLAG_SHOWSIGN) *p++ = '+'; if (dp->flags & FLAG_SPACE) *p++ = ' '; if (dp->flags & FLAG_ALT) *p++ = '#'; if (dp->flags & FLAG_ZERO) *p++ = '0'; if (dp->width_start != dp->width_end) { size_t n = dp->width_end - dp->width_start; memcpy (p, dp->width_start, n * sizeof (CHAR_T)); p += n; } if (dp->precision_start != dp->precision_end) { size_t n = dp->precision_end - dp->precision_start; memcpy (p, dp->precision_start, n * sizeof (CHAR_T)); p += n; } switch (type) { #ifdef HAVE_LONG_LONG case TYPE_LONGLONGINT: case TYPE_ULONGLONGINT: *p++ = 'l'; /*FALLTHROUGH*/ #endif case TYPE_LONGINT: case TYPE_ULONGINT: #ifdef HAVE_WINT_T case TYPE_WIDE_CHAR: #endif #ifdef HAVE_WCHAR_T case TYPE_WIDE_STRING: #endif *p++ = 'l'; break; #ifdef HAVE_LONG_DOUBLE case TYPE_LONGDOUBLE: *p++ = 'L'; break; #endif default: break; } *p = dp->conversion; #if USE_SNPRINTF p[1] = '%'; p[2] = 'n'; p[3] = '\0'; #else p[1] = '\0'; #endif /* Construct the arguments for calling snprintf or sprintf. */ prefix_count = 0; if (dp->width_arg_index != ARG_NONE) { if (!(a.arg[dp->width_arg_index].type == TYPE_INT)) abort (); prefixes[prefix_count++] = a.arg[dp->width_arg_index].a.a_int; } if (dp->precision_arg_index != ARG_NONE) { if (!(a.arg[dp->precision_arg_index].type == TYPE_INT)) abort (); prefixes[prefix_count++] = a.arg[dp->precision_arg_index].a.a_int; } #if USE_SNPRINTF /* Prepare checking whether snprintf returns the count via %n. 
*/ ENSURE_ALLOCATION (xsum (length, 1)); result[length] = '\0'; #endif for (;;) { size_t maxlen; int count; int retcount; maxlen = allocated - length; count = -1; retcount = 0; #if USE_SNPRINTF # define SNPRINTF_BUF(arg) \ switch (prefix_count) \ { \ case 0: \ retcount = SNPRINTF (result + length, maxlen, buf, \ arg, &count); \ break; \ case 1: \ retcount = SNPRINTF (result + length, maxlen, buf, \ prefixes[0], arg, &count); \ break; \ case 2: \ retcount = SNPRINTF (result + length, maxlen, buf, \ prefixes[0], prefixes[1], arg, \ &count); \ break; \ default: \ abort (); \ } #else # define SNPRINTF_BUF(arg) \ switch (prefix_count) \ { \ case 0: \ count = sprintf (tmp, buf, arg); \ break; \ case 1: \ count = sprintf (tmp, buf, prefixes[0], arg); \ break; \ case 2: \ count = sprintf (tmp, buf, prefixes[0], prefixes[1],\ arg); \ break; \ default: \ abort (); \ } #endif switch (type) { case TYPE_SCHAR: { int arg = a.arg[dp->arg_index].a.a_schar; SNPRINTF_BUF (arg); } break; case TYPE_UCHAR: { unsigned int arg = a.arg[dp->arg_index].a.a_uchar; SNPRINTF_BUF (arg); } break; case TYPE_SHORT: { int arg = a.arg[dp->arg_index].a.a_short; SNPRINTF_BUF (arg); } break; case TYPE_USHORT: { unsigned int arg = a.arg[dp->arg_index].a.a_ushort; SNPRINTF_BUF (arg); } break; case TYPE_INT: { int arg = a.arg[dp->arg_index].a.a_int; SNPRINTF_BUF (arg); } break; case TYPE_UINT: { unsigned int arg = a.arg[dp->arg_index].a.a_uint; SNPRINTF_BUF (arg); } break; case TYPE_LONGINT: { long int arg = a.arg[dp->arg_index].a.a_longint; SNPRINTF_BUF (arg); } break; case TYPE_ULONGINT: { unsigned long int arg = a.arg[dp->arg_index].a.a_ulongint; SNPRINTF_BUF (arg); } break; #ifdef HAVE_LONG_LONG case TYPE_LONGLONGINT: { long long int arg = a.arg[dp->arg_index].a.a_longlongint; SNPRINTF_BUF (arg); } break; case TYPE_ULONGLONGINT: { unsigned long long int arg = a.arg[dp->arg_index].a.a_ulonglongint; SNPRINTF_BUF (arg); } break; #endif case TYPE_DOUBLE: { double arg = a.arg[dp->arg_index].a.a_double; 
SNPRINTF_BUF (arg); } break; #ifdef HAVE_LONG_DOUBLE case TYPE_LONGDOUBLE: { long double arg = a.arg[dp->arg_index].a.a_longdouble; SNPRINTF_BUF (arg); } break; #endif case TYPE_CHAR: { int arg = a.arg[dp->arg_index].a.a_char; SNPRINTF_BUF (arg); } break; #ifdef HAVE_WINT_T case TYPE_WIDE_CHAR: { wint_t arg = a.arg[dp->arg_index].a.a_wide_char; SNPRINTF_BUF (arg); } break; #endif case TYPE_STRING: { const char *arg = a.arg[dp->arg_index].a.a_string; SNPRINTF_BUF (arg); } break; #ifdef HAVE_WCHAR_T case TYPE_WIDE_STRING: { const wchar_t *arg = a.arg[dp->arg_index].a.a_wide_string; SNPRINTF_BUF (arg); } break; #endif case TYPE_POINTER: { void *arg = a.arg[dp->arg_index].a.a_pointer; SNPRINTF_BUF (arg); } break; default: abort (); } #if USE_SNPRINTF /* Portability: Not all implementations of snprintf() are ISO C 99 compliant. Determine the number of bytes that snprintf() has produced or would have produced. */ if (count >= 0) { /* Verify that snprintf() has NUL-terminated its result. */ if (count < maxlen && result[length + count] != '\0') abort (); /* Portability hack. */ if (retcount > count) count = retcount; } else { /* snprintf() doesn't understand the '%n' directive. */ if (p[1] != '\0') { /* Don't use the '%n' directive; instead, look at the snprintf() return value. */ p[1] = '\0'; continue; } else { /* Look at the snprintf() return value. */ if (retcount < 0) { /* HP-UX 10.20 snprintf() is doubly deficient: It doesn't understand the '%n' directive, *and* it returns -1 (rather than the length that would have been required) when the buffer is too small. */ size_t bigger_need = xsum (xtimes (allocated, 2), 12); ENSURE_ALLOCATION (bigger_need); continue; } else count = retcount; } } #endif /* Attempt to handle failure. 
*/ if (count < 0) { if (!(result == resultbuf || result == NULL)) free (result); if (buf_malloced != NULL) free (buf_malloced); CLEANUP (); errno = EINVAL; return NULL; } #if !USE_SNPRINTF if (count >= tmp_length) /* tmp_length was incorrectly calculated - fix the code above! */ abort (); #endif /* Make room for the result. */ if (count >= maxlen) { /* Need at least count bytes. But allocate proportionally, to avoid looping eternally if snprintf() reports a too small count. */ size_t n = xmax (xsum (length, count), xtimes (allocated, 2)); ENSURE_ALLOCATION (n); #if USE_SNPRINTF continue; #endif } #if USE_SNPRINTF /* The snprintf() result did fit. */ #else /* Append the sprintf() result. */ memcpy (result + length, tmp, count * sizeof (CHAR_T)); if (tmp != tmpbuf) free (tmp); #endif length += count; break; } } } } /* Add the final NUL. */ ENSURE_ALLOCATION (xsum (length, 1)); result[length] = '\0'; if (result != resultbuf && length + 1 < allocated) { /* Shrink the allocated memory if possible. */ CHAR_T *memory; memory = (CHAR_T *) realloc (result, (length + 1) * sizeof (CHAR_T)); if (memory != NULL) result = memory; } if (buf_malloced != NULL) free (buf_malloced); CLEANUP (); *lengthp = length; return result; out_of_memory: if (!(result == resultbuf || result == NULL)) free (result); if (buf_malloced != NULL) free (buf_malloced); out_of_memory_1: CLEANUP (); errno = ENOMEM; return NULL; } }
//------------------------------------------------------------------------------------------------------------------------------------ // called when we want to draw the 3D data in our app. //------------------------------------------------------------------------------------------------------------------------------------ void draw3D() { // draw the grid on the floor setColour(0.25f, 0.25f, 0.25f); for(float i = -10.0f; i <= 10.1f; i += 1.0f) { Vec3 zmin(i, 0, -10); Vec3 zmax(i, 0, 10); Vec3 xmin(-10, 0, i); Vec3 xmax(10, 0, i); drawLine(xmin, xmax); drawLine(zmin, zmax); } // If using the GPU to compute the lighting (which is what you want to do!) if(!g_manualLighting) { // turn on lighting enableLighting(); // set the diffuse material colour (this will be modulated by the effect of the lighting) setColour(1.0f, 1.0f, 1.0f); // draw the cube geometry drawPrimitives(g_pointsVN, 24, kQuads); // turn off lighting disableLighting(); } else { // otherwise, compute the lighting manually. Don't ever do this in practice! It's insane! (But it may be useful for educational purposes) // The direction from the vertex to the light (effectively sunlight in this case!). Vec3 L(-0.6f, 1.0f, -0.2f); // make sure L has been normalised! L = normalize(L); // start drawing some quads begin(kQuads); // loop through each vertex normal for(int i = 0; i < 24; ++i) { // compute N.L // Make sure we clamp this to zero (so that we ignore any negative values). 
float N_dot_L = std::max(dot(L, g_pointsVN[i].n), 0.0f); // the ambient material colour (always gets added to the final colour) Vec3 Ka(0.2f, 0.2f, 0.2f); // the diffuse material colour Vec3 Kd(1.0f, 1.0f, 1.0f); // Compute the final colour Vec3 colour = Ka + (Kd * N_dot_L); // set the vertex colour setColour(colour); // specify the vertex addVertex(g_pointsVN[i].v); } // finish drawing our quads end(); } // if we are displaying normals if(g_displayNormals) { // make colour pink setColour(1.0f, 0.0f, 1.0f); // loop through each vertex for(int i = 0; i < 24; ++i) { // compute an offset (along the normal direction) from the vertex Vec3 pn = g_pointsVN[i].v + (g_pointsVN[i].n * 0.2f); // draw a line to show the normal drawLine(g_pointsVN[i].v, pn); } } }
slint_t mpi_partition_radix2(elements_t *s, partcond2_t *pc, slint_t rhigh, slint_t rlow, slint_t rwidth, int *scounts, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_partition_radix2 */ { slkey_pure_t max_nclasses; slkey_pure_t nclasses, bit_mask; slkey_pure_t k; const slint_t max_nareas = size - 1; slint_t nareas, nareas_new; elements_t areas0[max_nareas], areas1[max_nareas], *areas, *areas_new; double *locals, *globals; double *local_counts, *local_weights, *global_counts, *global_weights; const slint_t max_nparts = size - 1; slint_t parts_low, parts_high, nparts_removed; slint_t parts[max_nparts], part_areas[max_nparts]; double parts_range_[2 * 2 * (1 + max_nparts + 1)]; double *parts_range = parts_range_ + (2 * 2); double parts_minmax_[2 * 4 * (1 + max_nparts + 1)]; double *parts_minmax = parts_minmax_ + (2 * 4); slint_t parts_update_[1 + max_nparts + 1]; slint_t *parts_update = parts_update_ + 1; double parts_minmax_new[2 * 4]; double current_minmax[2 * 2]; double final_locals[2 * max_nparts]; slint_t i, j, jp1, jm1, l, lp1, lm1; slint_t current_width; double minmax[2 * 4 * size]; slint_t last_new_area, last_new_class; #ifdef HAVENT_MPI_IN_PLACE double local_minmax[2 * 4]; #endif slint_t lc, lcs, gc, gcs; double lw, gw, lws, gws; double d, m; elements_t xi, end; slint_t round = 0; slint_t direction = 1; slint_t refine, finalize; #ifdef RCOUNTS_RDISPLS int *rcounts, *rdispls; #endif #ifdef WEIGHT_STATS slint_t total_count = 0, partial_counts[size + 1]; double total_weight = 0.0, partial_weights[size + 1]; double vmin, vmax; # ifdef HAVENT_MPI_IN_PLACE slint_t partial_counts2[size + 1]; double partial_weights2[size + 1]; # endif #endif rti_treset(rti_tid_mpi_partition_radix2_while); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_round1); /* sl_tid */ 
rti_treset(rti_tid_mpi_partition_radix2_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_partition_radix2_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_sync); rti_tstart(rti_tid_mpi_partition_radix2); if (rhigh < 0) rhigh = radix_high; if (rlow < 0) rlow = radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); locals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); globals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); areas = areas0; areas_new = areas1; /* init the first area (all elements) */ nareas = 1; elem_assign(s, &areas[0]); /* init all parts */ parts_low = 0; parts_high = max_nparts - 1; for (i = parts_low; i <= parts_high; ++i) { parts[i] = i; part_areas[i] = 0; } /* init sdispls */ for (i = 0; i < size; ++i) sdispls[i] = 0; rti_tstart(rti_tid_mpi_partition_radix2_while); while (parts_low <= parts_high) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", rhigh: %" sl_int_type_fmt ", current_width: %" sl_int_type_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { /* init counters */ local_counts = locals; global_counts = globals; local_weights = 
locals + (nareas * nclasses) + nareas; global_weights = globals + (nareas * nclasses) + nareas; /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = local_weights[i * nclasses + k] = 0.0; rti_tstart(rti_tid_mpi_partition_radix2_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { elem_assign_at(&areas[i], areas[i].size, &end); if (nclasses > 1) { /* counts and weights in every class */ for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); local_counts[i * nclasses + k] += 1; local_weights[i * nclasses + k] += elem_weight_one(&xi, 0); } } else { /* total counts and weights */ local_counts[i * nclasses + 0] = areas[i].size; for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) local_weights[i * nclasses + 0] += elem_weight_one(&xi, 0); } /* total counts and weights in this area */ local_counts[nareas * nclasses + i] = areas[i].size; local_weights[nareas * nclasses + i] = 0.0; for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += local_weights[i * nclasses + k]; } rti_tstop(rti_tid_mpi_partition_radix2_while_count); --rhigh; rti_tstart(rti_tid_mpi_partition_radix2_while_allreduce); /* create global counts and weights */ #ifdef MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_partition_radix2_while_allreduce); } #ifdef TIMING SL_TRACE_IF(DEBUG_OR_NOT, "allreduce: %f, nareas: %" sl_int_type_fmt ", nclasses: %" sl_key_type_fmt ", doubles: %" sl_int_type_fmt, 
rti_tlast(rti_tid_mpi_partition_radix2_while_allreduce), nareas, nclasses, (1 + 1) * (nareas * nclasses + nareas)); #endif /* if (DEBUG_OR_NOT) { printf("%d: locals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_counts[i * nclasses + k]); printf(" = %f\n", local_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_weights[i * nclasses + k]); printf(" = %f\n", local_weights[nareas * nclasses + i]); } printf("%d: globals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_counts[i * nclasses + k]); printf(" = %f\n", global_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_weights[i * nclasses + k]); printf(" = %f\n", global_weights[nareas * nclasses + i]); } }*/ /* do some initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_partition_radix2_while_round1); /* distribute min/max counts and weights */ minmax[rank * 2 * 4 + 0 + 0] = (pc->min_count >= 0)?pc->min_count:(-pc->min_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 1] = (pc->max_count >= 0)?pc->max_count:(-pc->max_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 2] = (pc->min_cpart >= 0)?pc->min_cpart:(-pc->min_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 0 + 3] = (pc->max_cpart >= 0)?pc->max_cpart:(-pc->max_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 0] = (pc->min_weight >= 0)?pc->min_weight:(-pc->min_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 1] = (pc->max_weight >= 0)?pc->max_weight:(-pc->max_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 2] = (pc->min_wpart >= 
0)?pc->min_wpart:(-pc->min_wpart * global_weights[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 3] = (pc->max_wpart >= 0)?pc->max_wpart:(-pc->max_wpart * global_weights[nareas * nclasses + 0]); rti_tstart(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef HAVENT_MPI_IN_PLACE local_minmax[0 + 0] = minmax[rank * 2 * 4 + 0 + 0]; local_minmax[0 + 1] = minmax[rank * 2 * 4 + 0 + 1]; local_minmax[0 + 2] = minmax[rank * 2 * 4 + 0 + 2]; local_minmax[0 + 3] = minmax[rank * 2 * 4 + 0 + 3]; local_minmax[4 + 0] = minmax[rank * 2 * 4 + 4 + 0]; local_minmax[4 + 1] = minmax[rank * 2 * 4 + 4 + 1]; local_minmax[4 + 2] = minmax[rank * 2 * 4 + 4 + 2]; local_minmax[4 + 3] = minmax[rank * 2 * 4 + 4 + 3]; MPI_Allgather(local_minmax, 2 * 4, MPI_DOUBLE, minmax, 2 * 4, MPI_DOUBLE, comm); /* MPI_Gather(local_minmax_weights, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, 0, comm); MPI_Bcast(minmax_weights, 2 * 4 * size, MPI_DOUBLE, 0, comm);*/ #else MPI_Allgather(MPI_IN_PLACE, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef WEIGHT_STATS total_count = global_counts[nareas * nclasses + 0]; total_weight = global_weights[nareas * nclasses + 0]; #endif parts_minmax[2 * 4 * (parts_low - 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 3] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 3] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 3] = global_counts[nareas * nclasses + 0]; parts_minmax[2 * 4 * (parts_high + 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 2] = 0; parts_minmax[2 
* 4 * (parts_high + 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 3] = global_weights[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 0 + 0] = parts_range[2 * 2 * (parts_high + 1) + 0 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 0 + 1] = parts_range[2 * 2 * (parts_high + 1) + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 2 + 0] = parts_range[2 * 2 * (parts_high + 1) + 2 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 2 + 1] = parts_range[2 * 2 * (parts_high + 1) + 2 + 1] = global_weights[nareas * nclasses + 0]; for (i = parts_high; i >= parts_low; --i) { parts_minmax[2 * 4 * parts[i] + 0 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 1] - minmax[2 * 4 * (parts[i] + 1) + 0 + 0]; parts_minmax[2 * 4 * parts[i] + 0 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 3] - minmax[2 * 4 * (parts[i] + 1) + 0 + 1]; parts_minmax[2 * 4 * parts[i] + 4 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 1] - minmax[2 * 4 * (parts[i] + 1) + 4 + 0]; parts_minmax[2 * 4 * parts[i] + 4 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 3] - minmax[2 * 4 * (parts[i] + 1) + 4 + 1]; parts_minmax[2 * 4 * parts[i] + 0 + 0] = parts_minmax[2 * 4 * parts[i] + 0 + 2] = parts_minmax[2 * 4 * parts[i] + 4 + 0] = parts_minmax[2 * 4 * parts[i] + 4 + 2] = -1; parts_range[2 * 2 * parts[i] + 0 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * parts[i] + 2 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 2 + 1] = global_weights[nareas * nclasses + 0]; /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 0 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 0 + 3]);*/ /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 4 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);*/ parts_update[parts[i]] = 1; if (finalize) { final_locals[2 * i + 0] = local_counts[nareas * nclasses + 0]; final_locals[2 * i + 1] = local_weights[nareas * nclasses + 0]; } } rti_tstop(rti_tid_mpi_partition_radix2_while_round1); } if 
(finalize) { j = parts_high - parts_low + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" sl_int_type_fmt " parts", j); rti_tstart(rti_tid_mpi_partition_radix2_while_exscan); MPI_Exscan(&final_locals[2 * parts_low], &locals[2 * parts_low], 2 * j, MPI_DOUBLE, MPI_SUM, comm); if (rank == 0) for (i = parts_low; i <= parts_high; ++i) locals[2 * i + 0] = locals[2 * i + 1] = 0; rti_tstop(rti_tid_mpi_partition_radix2_while_exscan); } nareas_new = 0; last_new_area = last_new_class = -1; /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", %s", round, (direction > 0)?"forward":"backward"); nparts_removed = 0; rti_tstart(rti_tid_mpi_partition_radix2_while_check); i = (direction > 0)?parts_low:parts_high; while ((direction > 0)?(i <= parts_high):(i >= parts_low)) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": PART: %" sl_int_type_fmt ",%" sl_int_type_fmt, round, i, parts[i]); j = 2 * 4 * parts[i]; jp1 = 2 * 4 * (parts[i] + 1); jm1 = 2 * 4 * (parts[i] - 1); l = 2 * 2 * parts[i]; lp1 = 2 * 2 * (parts[i] + 1); lm1 = 2 * 2 * (parts[i] - 1); if (parts_update[parts[i]]) { if (direction > 0) { parts_minmax_new[0 + 0] = parts_minmax[jm1 + 0 + 0] + minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[jm1 + 0 + 2] + minmax[j + 0 + 1]; parts_minmax_new[4 + 0] = parts_minmax[jm1 + 4 + 0] + minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[jm1 + 4 + 2] + minmax[j + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f + %f, %f + %f / %f + %f, %f + %f", i, parts[i], parts_minmax[jm1 + 0 + 0], minmax[j + 0 + 0], parts_minmax[jm1 + 0 + 2], minmax[j + 0 + 1], parts_minmax[jm1 + 4 + 0], minmax[j + 4 + 0], parts_minmax[jm1 + 4 + 2], minmax[j + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 0] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 2] > minmax[j + 0 + 3]) parts_minmax_new[0 + 2] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 0] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 0] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 2] > minmax[j + 4 + 3]) parts_minmax_new[4 + 2] = minmax[j + 4 + 3]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } else { parts_minmax_new[0 + 1] = parts_minmax[jp1 + 0 + 1] - minmax[jp1 + 0 + 0]; parts_minmax_new[0 + 3] = parts_minmax[jp1 + 0 + 3] - minmax[jp1 + 0 + 1]; parts_minmax_new[4 + 1] = parts_minmax[jp1 + 4 + 1] - minmax[jp1 + 4 + 0]; parts_minmax_new[4 + 3] = parts_minmax[jp1 + 4 + 3] - minmax[jp1 + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f - %f, %f - %f / %f - %f, %f - %f", i, parts[i], parts_minmax[jp1 + 0 + 1], minmax[jp1 + 0 + 0], parts_minmax[jp1 + 0 + 3], minmax[jp1 + 0 + 1], parts_minmax[jp1 + 4 + 1], minmax[jp1 + 4 + 0], parts_minmax[jp1 + 4 + 3], minmax[jp1 + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 3] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 3] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 1] > minmax[j + 0 + 3]) parts_minmax_new[0 + 1] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 3] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 3] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 1] > minmax[j + 4 + 3]) parts_minmax_new[4 + 1] = minmax[j + 4 + 3]; parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 1. parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": minmax: %f %f / %f %f", parts[i], minmax[2 * 4 * (parts[i] + 1) + 0 + 2], minmax[2 * 4 * (parts[i] + 0) + 0 + 3], minmax[2 * 4 * (parts[i] + 1) + 4 + 2], minmax[2 * 4 * (parts[i] + 0) + 4 + 3]); if (parts_minmax_new[0 + 0] > parts_minmax_new[0 + 1]) parts_minmax_new[0 + 0] = parts_minmax_new[0 + 1] = (parts_minmax_new[0 + 0] + parts_minmax_new[0 + 1]) / 2; if (parts_minmax_new[0 + 2] < parts_minmax_new[0 + 3]) parts_minmax_new[0 + 2] = parts_minmax_new[0 + 3] = (parts_minmax_new[0 + 2] + parts_minmax_new[0 + 3]) / 2; if (parts_minmax_new[4 + 0] > parts_minmax_new[4 + 1]) parts_minmax_new[4 + 0] = parts_minmax_new[4 + 1] = (parts_minmax_new[4 + 0] + parts_minmax_new[4 + 1]) / 2; if (parts_minmax_new[4 + 2] < parts_minmax_new[4 + 3]) parts_minmax_new[4 + 2] = parts_minmax_new[4 + 3] = 
(parts_minmax_new[4 + 2] + parts_minmax_new[4 + 3]) / 2; } else { parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 2. parts_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); current_minmax[0 + 0] = xmax(parts_minmax_new[0 + 0], parts_minmax_new[0 + 3]) - parts_range[l + 0 + 0]; current_minmax[0 + 1] = xmin(parts_minmax_new[0 + 2], parts_minmax_new[0 + 1]) - parts_range[l + 0 + 0]; current_minmax[2 + 0] = xmax(parts_minmax_new[4 + 0], parts_minmax_new[4 + 3]) - parts_range[l + 2 + 0]; current_minmax[2 + 1] = xmin(parts_minmax_new[4 + 2], parts_minmax_new[4 + 1]) - parts_range[l + 2 + 0]; SL_ASSERT(current_minmax[0 + 0] <= current_minmax[0 + 1]); SL_ASSERT(current_minmax[2 + 0] <= current_minmax[2 + 1]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": current_minmax: %f %f / %f %f", parts[i], current_minmax[0 + 0], current_minmax[0 + 1], current_minmax[2 + 0], current_minmax[2 + 1]); lcs = gcs = 0; lws = gws = 0; /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_classes); for (k = 0; k < nclasses; ++k) { lc = local_counts[part_areas[i] * nclasses + k]; gc = global_counts[part_areas[i] * nclasses + k]; lw = local_weights[part_areas[i] * nclasses + k]; gw = global_weights[part_areas[i] * nclasses + k]; current_minmax[0 + 0] -= gc; 
current_minmax[0 + 1] -= gc; current_minmax[2 + 0] -= gw; current_minmax[2 + 1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "k = %" sl_key_pure_type_fmt ", current_minmax: %f %f / %f %f", k, current_minmax[0], current_minmax[1], current_minmax[2], current_minmax[3]); /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_minmax[0 + 1] < 0) || (current_minmax[0 + 0] < 0 && current_minmax[2 + 1] < 0)) { refine = 1; break; } lcs += lc; gcs += gc; lws += lw; gws += gw; gc = gw = 0.0; /* if between min/max counts */ if (current_minmax[0 + 0] <= 0 && current_minmax[0 + 1] >= 0) { /* go to next if max count not reached AND min weight not reached */ if (current_minmax[0 + 1] > 0 && current_minmax[2 + 0] > 0) continue; /* look ahead for a better stop */ if (k + 1 < nclasses && current_minmax[0 + 1] - global_counts[part_areas[i] * nclasses + k + 1] >= 0) { /* continue if weights will improve */ if (myabs(current_minmax[2 + 0] + current_minmax[2 + 1]) > myabs(current_minmax[2 + 0] + current_minmax[2 + 1] - 2 * global_weights[part_areas[i] * nclasses + k + 1])) continue; } /* stop */ break; } } SL_ASSERT(k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%s k = %" sl_key_pure_type_fmt, (refine)?"REFINE":"HIT", k); rti_tstop(rti_tid_mpi_partition_radix2_while_check_classes); } else { rti_tstart(rti_tid_mpi_partition_radix2_while_check_final); /* middle of min/max weight */ m = (current_minmax[2 + 0] + current_minmax[2 + 1]) / 2; /* min. part of weight to contribute */ d = xmax(0, m - locals[i * 2 + 1]); /* contribute all? 
*/ if (d >= final_locals[i * 2 + 1]) { lc = final_locals[i * 2 + 0]; lw = final_locals[i * 2 + 1]; } else { /* contribute only a part */ lc = 0; lw = 0; /* not required */ do { d -= elem_weight_one(s, sdispls[1 + parts[i]] + lc); ++lc; } while (d >= 0 && lc < final_locals[i * 2 + 0]); --lc; /* if unweighted, then m = middle of min/max count, d = ..., lc = d */ } /* check mc against min/max count borders */ lc = xminmax(current_minmax[0 + 0] - locals[i * 2 + 0], lc, current_minmax[0 + 1] - locals[i * 2 + 0]); /* check agains 0 (don't step back!) and the local contribution */ lc = xminmax(0, lc, final_locals[i * 2 + 0]); /* the exact global counts/weights are unknown (set gc/gw so that parts_range is not changed) */ gc = 0; gw = 0; lcs += lc; gcs += gc; lws += lw; gws += gw; gc = (parts_range[2 * 2 * parts[i] + 0 + 1] - parts_range[2 * 2 * parts[i] + 0 + 0]); gw = (parts_range[2 * 2 * parts[i] + 2 + 1] - parts_range[2 * 2 * parts[i] + 2 + 0]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_final); } rti_tstart(rti_tid_mpi_partition_radix2_while_check_post); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": sdispls[%" sl_int_type_fmt " + 1] = %d, lcs = %" sl_int_type_fmt, i, parts[i], parts[i], sdispls[parts[i] + 1], lcs); sdispls[parts[i] + 1] += lcs; if (gcs > 0 || gws > 0) { parts_range[l + 0 + 0] += gcs; parts_range[l + 0 + 1] = parts_range[l + 0 + 0] + gc; parts_range[l + 2 + 0] += gws; parts_range[l + 2 + 1] = parts_range[l + 2 + 0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 3. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": parts_range: %f %f / %f %f", i, parts[i], parts_range[2 * 2 * parts[i] + 0 + 0], parts_range[2 * 2 * parts[i] + 0 + 1], parts_range[2 * 2 * parts[i] + 2 + 0], parts_range[2 * 2 * parts[i] + 2 + 1]); parts_minmax_new[0 + 0] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 0], parts_range[l + 0 + 1]); parts_minmax_new[0 + 2] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 2], parts_range[l + 0 + 1]); parts_minmax_new[0 + 1] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 1], parts_range[l + 0 + 1]); parts_minmax_new[0 + 3] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 3], parts_range[l + 0 + 1]); parts_minmax_new[4 + 0] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 0], parts_range[l + 2 + 1]); parts_minmax_new[4 + 2] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 2], parts_range[l + 2 + 1]); parts_minmax_new[4 + 1] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 1], parts_range[l + 2 + 1]); parts_minmax_new[4 + 3] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 3], parts_range[l + 2 + 1]); } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 4. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] != parts_minmax[j + 0 + 0] || parts_minmax_new[0 + 2] != parts_minmax[j + 0 + 2] || parts_minmax_new[4 + 0] != parts_minmax[j + 4 + 0] || parts_minmax_new[4 + 2] != parts_minmax[j + 4 + 2]) { parts_minmax[j + 0 + 0] = parts_minmax_new[0 + 0]; parts_minmax[j + 0 + 2] = parts_minmax_new[0 + 2]; parts_minmax[j + 4 + 0] = parts_minmax_new[4 + 0]; parts_minmax[j + 4 + 2] = parts_minmax_new[4 + 2]; parts_update[parts[i] + 1] = 1; } if (parts_minmax_new[0 + 1] != parts_minmax[j + 0 + 1] || parts_minmax_new[0 + 3] != parts_minmax[j + 0 + 3] || parts_minmax_new[4 + 1] != parts_minmax[j + 4 + 1] || parts_minmax_new[4 + 3] != parts_minmax[j + 4 + 3]) { parts_minmax[j + 0 + 1] = parts_minmax_new[0 + 1]; parts_minmax[j + 0 + 3] = parts_minmax_new[0 + 3]; parts_minmax[j + 4 + 1] = parts_minmax_new[4 + 1]; parts_minmax[j + 4 + 3] = parts_minmax_new[4 + 3]; parts_update[parts[i] - 1] = 1; } parts_update[parts[i]] = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == part_areas[i] && last_new_class == k) part_areas[i] = nareas_new - 1; else { /* update last_new_... 
*/ last_new_area = part_areas[i]; last_new_class = k; /* create new area */ elem_assign_at(&areas[part_areas[i]], lcs, &areas_new[nareas_new]); areas_new[nareas_new].size = local_counts[part_areas[i] * nclasses + k]; part_areas[i] = nareas_new; ++nareas_new; } } else { /* save local count/weight for the later prefix calculations */ final_locals[2 * (i - nparts_removed * direction) + 0] = lc; final_locals[2 * (i - nparts_removed * direction) + 1] = lw; } parts[i - nparts_removed * direction] = parts[i]; part_areas[i - nparts_removed * direction] = part_areas[i]; } else ++nparts_removed; rti_tstop(rti_tid_mpi_partition_radix2_while_check_post); i += direction; } if (direction > 0) parts_high -= nparts_removed; else parts_low += nparts_removed; direction *= -1; /* SL_NOTICE_IF(DEBUG_OR_NOT, "nparts = %" sl_int_type_fmt " vs. nareas_new = %" sl_int_type_fmt, nparts, nareas_new);*/ rti_tstop(rti_tid_mpi_partition_radix2_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } rti_tstop(rti_tid_mpi_partition_radix2_while); /* create scounts */ for (i = 0; i < size - 1; ++i) scounts[i] = sdispls[i + 1] - sdispls[i]; scounts[size - 1] = s->size - sdispls[size - 1]; #ifdef SCOUNTS_SDISPLS printf("%d: scounts", rank); for (i = 0, j = 0; i < size; ++i) { printf(" %d", scounts[i]); j += scounts[i]; } printf(" = %" sl_int_type_fmt "\n", j); printf("%d: sdispls", rank); for (i = 0; i < size; ++i) printf(" %d", sdispls[i]); printf("\n"); #endif #ifdef RCOUNTS_RDISPLS rcounts = sl_alloc(size, sizeof(int)); rdispls = sl_alloc(size, sizeof(int)); MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm); rdispls[0] = 0; for (i = 1; i < size; ++i) rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; printf("%d: rcounts", rank); for (i = 0; i < size; ++i) printf(" %d", rcounts[i]); printf("\n"); printf("%d: rdispls", rank); for (i = 0; i < size; ++i) printf(" %d", rdispls[i]); 
printf("\n"); sl_free(rcounts); sl_free(rdispls); #endif sl_free(locals); sl_free(globals); #ifdef WEIGHT_STATS partial_counts[size] = 0; partial_weights[size] = 0.0; for (i = 0; i < size; ++i) { partial_counts[i] = scounts[i]; partial_weights[i] = 0.0; for (j = sdispls[i]; j < sdispls[i] + scounts[i]; ++j) partial_weights[i] += elem_weight_one(s, j); partial_counts[size] += partial_counts[i]; partial_weights[size] += partial_weights[i]; } #ifdef HAVENT_MPI_IN_PLACE MPI_Reduce(partial_counts, partial_counts2, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce(partial_weights, partial_weights2, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); # define partial_counts partial_counts2 # define partial_weights partial_weights2 #else /* recvbuf requires workaround for an in-place/aliased-buffer-check-bug in mpich2 (fixed with rev 5518) */ MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_counts, (rank == 0)?partial_counts:NULL, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_weights, (rank == 0)?partial_weights:NULL, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); #endif if (rank == 0) { printf("%d: total_count: %" sl_int_type_fmt " vs. 
%" sl_int_type_fmt "\n", rank, total_count, partial_counts[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %" sl_int_type_fmt " / %f - %" sl_int_type_fmt " / %f\n", rank, i, partial_counts[i], (double) partial_counts[i] / partial_counts[size], (partial_counts[size] / size) - partial_counts[i], fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])));*/ d += fabs((partial_counts[size] / size) - partial_counts[i]); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) < vmin) vmin = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) > vmax) vmax = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_count: %" sl_int_type_fmt " - %f / %f\n", rank, partial_counts[size] / size, d / size, d / partial_counts[size]); printf("%d: total_weight: %f vs. 
%f\n", rank, total_weight, partial_weights[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %f / %f - %f / %f\n", rank, i, partial_weights[i], partial_weights[i] / partial_weights[size], (partial_weights[size] / size) - partial_weights[i], fabs(1.0 - (partial_weights[i] * size / partial_weights[size])));*/ d += fabs((partial_weights[size] / size) - partial_weights[i]); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) < vmin) vmin = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) > vmax) vmax = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_weight: %f - %f / %f\n", rank, partial_weights[size] / size, d / size, d / partial_weights[size]); } #endif rti_tstop(rti_tid_mpi_partition_radix2); #if defined(TIMING_STATS) && defined(SL_USE_RTI_TIM) if (rank == 0) { printf("%d: mpi_partition_radix: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2)); printf("%d: mpi_partition_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_sync)); printf("%d: mpi_partition_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while)); printf("%d: mpi_partition_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_count)); printf("%d: mpi_partition_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_allreduce)); printf("%d: mpi_partition_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1)); printf("%d: mpi_partition_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1_allgather)); printf("%d: mpi_partition_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_exscan)); printf("%d: mpi_partition_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_check)); printf("%d: 
mpi_partition_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_pre)); printf("%d: mpi_partition_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_classes)); printf("%d: mpi_partition_radix: final: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_final)); printf("%d: mpi_partition_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_post)); } #endif return 0; }
// Turns on the pixel at (x, y) in the nhd_mem buffer.
// Coordinates that fail the x <= xmax() / y <= ymax() check are ignored.
void Nhdc12832::setpixel(unsigned int x, unsigned int y)
{
    // Only coordinates accepted by both limit checks may touch the buffer.
    const bool insideLimits = (x <= xmax()) && (y <= ymax());
    if (!insideLimits) {
        return;
    }

    // Eight consecutive y values share one byte of nhd_mem[x].
    const unsigned int byteIndex = y >> 3;

    // Select the mask entry for this row within the byte; the index runs
    // from 7 down to 0 as (y % 8) goes from 0 to 7.
    // NOTE(review): presumably mask[] holds single-bit patterns - confirm
    // against its definition.
    const unsigned int maskIndex = 7u - (y & 7u);

    nhd_mem[x][byteIndex] |= mask[maskIndex];
}
slint_t mpi_select_exact_radix_fixed(elements_t *s, slint_t nelements, slint_t nparts, partcond_t *pconds, slint_t rhigh, slint_t rlow, slint_t rwidth, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_select_exact_radix_fixed */ { slkey_pure_t max_nclasses, nclasses, bit_mask; slkey_pure_t k, l; typedef struct { slint_t count_min, count_max; slint_t count_low, count_hig; #ifdef elem_weight double weight_min, weight_max; double weight_low, weight_hig; #endif } mmlh_t; mmlh_t mmlh[nparts]; const slint_t max_nborders = nparts - 1; slint_t border_lo, border_hi, nborders_removed; slint_t borders[max_nborders], border_areas[max_nborders]; #define MIN_LE 0 #define MIN_RI 1 #define MAX_LE 2 #define MAX_RI 3 struct { slint_t update; slint_t crange[2], cmmlr[4]; #ifdef elem_weight double wrange[2], wmmlr[4]; #endif } border_infos_[1 + max_nborders + 1], *border_infos = border_infos_ + 1, border_info_old; const slint_t max_nareas = max_nborders; slint_t nareas, nareas_new; elements_t areas0[max_nareas * nelements], areas1[max_nareas * nelements], *areas, *areas_new; slint_t *area_counts, *current_counts; double *local_counts, *global_counts; #ifdef elem_weight double *local_weights, *global_weights, *current_weights; #endif slint_t current_cmm[2]; #ifdef elem_weight double current_wmm[2]; #endif slint_t final_areas[max_nborders * nelements]; double final_locals[NCONDS * max_nborders], *final_globals; slint_t current_width; slint_t round, direction, refine, finalize; slint_t last_new_area, last_new_class; slint_t lc, lcs, gc, gcs, lcv[nelements], lcsv[nelements]; #ifdef elem_weight double lw, gw, lws, gws; double mw, dw; double mcw[4]; #else slint_t mc, dc; #endif slint_t i, j; elements_t xi, end; #ifdef VERIFY slint_t v; #endif SL_TRACE_IF(DEBUG_OR_NOT, "starting mpi_select_exact_radix"); /* sl_tid rti_tid_mpi_select_exact_radix rti_tid_mpi_select_exact_radix_sync */ rti_treset(rti_tid_mpi_select_exact_radix_while); /* sl_tid */ 
rti_treset(rti_tid_mpi_select_exact_radix_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_select_exact_radix_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_select_exact_radix_sync); #ifdef VERIFY v = elements_validate_order(s, 1); SL_TRACE_IF(DEBUG_OR_NOT, "elements order: %s (%" slint_fmt ")", (v > 0)?"FAILED":"SUCCESS", v); #endif rti_tstart(rti_tid_mpi_select_exact_radix); if (rhigh < 0) rhigh = key_radix_high; if (rlow < 0) rlow = key_radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); /* SL_TRACE_IF(DEBUG_OR_NOT, "alloc area_counts: %" slint_fmt " * %d", max_nareas * nelements * max_nclasses, sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc local_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc global_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));*/ area_counts = sl_alloc(max_nareas * nelements * max_nclasses, sizeof(slint_t)); local_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); global_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); /* init areas (first area = all elements) */ areas = areas0; areas_new = areas1; 
nareas = 1; for (j = 0; j < nelements; ++j) elem_assign(&s[j], &areas[0 * nelements + j]); /* init parts */ border_lo = 0; border_hi = max_nborders - 1; for (i = border_lo; i <= border_hi; ++i) { borders[i] = i; border_areas[i] = 0; } /* init sdispls */ for (i = 0; i < nparts; ++i) for (j = 0; j < nelements; ++j) sdispls[i * nelements + j] = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while); round = 0; while (border_lo <= border_hi) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", rhigh: %" slint_fmt ", current_width: %" slint_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { #ifdef elem_weight /* init weight counters */ local_weights = local_counts + (nareas * nclasses) + nareas; global_weights = global_counts + (nareas * nclasses) + nareas; #endif /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = #ifdef elem_weight local_weights[i * nclasses + k] = #endif 0.0; rti_tstart(rti_tid_mpi_select_exact_radix_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { local_counts[nareas * nclasses + i] = 0; #ifdef elem_weight local_weights[nareas * nclasses + i] = 0.0; #endif /* for every list of elements */ for (j = 0; j < nelements; ++j) { SL_TRACE_IF(DEBUG_OR_NOT, "area %" slint_fmt ",%" slint_fmt ": size = %" slint_fmt, i, j, areas[i * nelements + j].size); elem_assign_at(&areas[i * nelements + j], areas[i * nelements + j].size, &end); current_counts = area_counts + ((i * nelements + j) * nclasses); #ifdef elem_weight current_weights = local_weights + (i * nclasses); #endif for (k = 0; k < nclasses; ++k) current_counts[k] = 0; if (nclasses > 1) { /* counts and weights 
in every class */ for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); current_counts[k] += 1; /* SL_TRACE_IF(DEBUG_OR_NOT, "key %" sl_key_pure_type_fmt " goes to bin %" sl_key_pure_type_fmt, key_purify(*xi.keys), k);*/ #ifdef elem_weight current_weights[k] += elem_weight(&xi, 0); #endif } } else { /* total counts and weights */ current_counts[0] = areas[i * nelements + j].size; #ifdef elem_weight for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) current_weights[0] += elem_weight(&xi, 0); #endif } for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] += current_counts[k]; /* total counts and weights in this area */ local_counts[nareas * nclasses + i] += areas[i * nelements + j].size; #ifdef elem_weight for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += current_weights[k]; #endif } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ": counts =", " %f", k, nclasses, (&local_counts[i * nclasses]), i); } rti_tstop(rti_tid_mpi_select_exact_radix_while_count); --rhigh; SL_TRACE_IF(DEBUG_OR_NOT, "all-reducing %" slint_fmt " doubles", (slint_t) (NCONDS * (nareas * nclasses + nareas))); rti_tstart(rti_tid_mpi_select_exact_radix_while_allreduce); /* create global counts and weights */ #ifdef MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_select_exact_radix_while_allreduce); } /* do initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_select_exact_radix_while_round1); for (i = 0; i < nparts; 
++i) { /* truncate counts, set default values and determine local (count/weight) limits */ init_partconds(1, &pconds[i], nparts, global_counts[nareas * nclasses + 0], #ifdef elem_weight global_weights[nareas * nclasses + 0] #else 0 #endif ); mmlh[i].count_min = pconds[i].count_min; mmlh[i].count_max = pconds[i].count_max; mmlh[i].count_low = pconds[i].count_low; mmlh[i].count_hig = pconds[i].count_high; #ifdef elem_weight mmlh[i].weight_min = pconds[i].weight_min; mmlh[i].weight_max = pconds[i].weight_max; mmlh[i].weight_low = pconds[i].weight_low; mmlh[i].weight_hig = pconds[i].weight_high; #endif } /* init lowest and highest part (sentinels) */ border_infos[border_lo - 1].update = 0; border_infos[border_lo - 1].crange[0] = 0; border_infos[border_lo - 1].crange[1] = 0; border_infos[border_lo - 1].cmmlr[MIN_LE] = border_infos[border_lo - 1].cmmlr[MAX_LE] = 0; border_infos[border_lo - 1].cmmlr[MIN_RI] = border_infos[border_lo - 1].cmmlr[MAX_RI] = 0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_lo - 1, border_infos[border_lo - 1].cmmlr[MIN_LE], border_infos[border_lo - 1].cmmlr[MAX_LE], border_infos[border_lo - 1].cmmlr[MIN_RI], border_infos[border_lo - 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_lo - 1].wrange[0] = 0.0; border_infos[border_lo - 1].wrange[1] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_LE] = border_infos[border_lo - 1].wmmlr[MAX_LE] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_RI] = border_infos[border_lo - 1].wmmlr[MAX_RI] = 0.0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_lo - 1, border_infos[border_lo - 1].wmmlr[MIN_LE], border_infos[border_lo - 1].wmmlr[MAX_LE], border_infos[border_lo - 1].wmmlr[MIN_RI], border_infos[border_lo - 1].wmmlr[MAX_RI]); #endif /* init highest part (sentinel) */ border_infos[border_hi + 1].update = 0; border_infos[border_hi + 
1].crange[0] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].cmmlr[MIN_LE] = border_infos[border_hi + 1].cmmlr[MAX_LE] = 0; border_infos[border_hi + 1].cmmlr[MIN_RI] = border_infos[border_hi + 1].cmmlr[MAX_RI] = global_counts[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_hi + 1, border_infos[border_hi + 1].cmmlr[MIN_LE], border_infos[border_hi + 1].cmmlr[MAX_LE], border_infos[border_hi + 1].cmmlr[MIN_RI], border_infos[border_hi + 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_hi + 1].wrange[0] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wmmlr[MIN_LE] = border_infos[border_hi + 1].wmmlr[MAX_LE] = 0.0; border_infos[border_hi + 1].wmmlr[MIN_RI] = border_infos[border_hi + 1].wmmlr[MAX_RI] = global_weights[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_hi + 1, border_infos[border_hi + 1].wmmlr[MIN_LE], border_infos[border_hi + 1].wmmlr[MAX_LE], border_infos[border_hi + 1].wmmlr[MIN_RI], border_infos[border_hi + 1].wmmlr[MAX_RI]); #endif /* init regular parts (backwards) */ for (i = border_hi; i >= border_lo; --i) { border_infos[borders[i]].update = 1; border_infos[borders[i]].crange[0] = 0; border_infos[borders[i]].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[borders[i]].cmmlr[MIN_LE] = -1; border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = -1; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt 
": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] = 0.0; border_infos[borders[i]].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[borders[i]].wmmlr[MIN_LE] = -1.0; border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = -1.0; border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif /* prepare for finalization in the 1st round */ if (finalize) { for (j = 0; j < nelements; ++j) final_areas[i * nelements + j] = area_counts[(0 * nelements + j) * nclasses + 0]; final_locals[NCONDS * i + 0] = local_counts[nareas * nclasses + 0]; #ifdef elem_weight final_locals[NCONDS * i + 1] = local_weights[nareas * nclasses + 0]; #endif } } /* first direction: forward */ direction = 1; rti_tstop(rti_tid_mpi_select_exact_radix_while_round1); } /* compute prefixes for finalization */ if (finalize) { /* determine number of parts to finalize */ j = border_hi - border_lo + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" slint_fmt " parts", j); rti_tstart(rti_tid_mpi_select_exact_radix_while_exscan); /* use local_counts to store the global prefix sums */ final_globals = local_counts; /* create global prefix sums (set rank 0 to zero) */ MPI_Exscan(&final_locals[NCONDS * border_lo], &final_globals[NCONDS * border_lo], NCONDS * j, MPI_DOUBLE, 
MPI_SUM, comm); if (rank == 0) for (i = border_lo; i <= border_hi; ++i) final_globals[NCONDS * i + 0] = #ifdef elem_weight final_globals[NCONDS * i + 1] = #endif 0.0; rti_tstop(rti_tid_mpi_select_exact_radix_while_exscan); } /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", %s", round, (direction > 0)?"forward":"backward"); nareas_new = 0; last_new_area = last_new_class = -1; nborders_removed = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while_check); i = (direction > 0)?border_lo:border_hi; while ((direction > 0)?(i <= border_hi):(i >= border_lo)) { /* check partition borders[i] */ SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ": PART: %" slint_fmt ",%" slint_fmt, round, i, borders[i]); rti_tstart(rti_tid_mpi_select_exact_radix_while_check_pre); /* save to old limits */ border_info_old = border_infos[borders[i]]; /* is an update required? */ if (border_infos[borders[i]].update) { /* forward */ if (direction > 0) { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i] - 1].cmmlr[MIN_LE] + mmlh[borders[i]].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i] - 1].cmmlr[MAX_LE] + mmlh[borders[i]].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left]: %" slint_fmt " + %" slint_fmt ", %" slint_fmt " + %" slint_fmt "", i, borders[i], border_infos[borders[i] - 1].cmmlr[MIN_LE], mmlh[borders[i]].count_min, border_infos[borders[i] - 1].cmmlr[MAX_LE], mmlh[borders[i]].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MIN_LE] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MIN_LE] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MAX_LE] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MAX_LE] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_LE] = 
border_infos[borders[i] - 1].wmmlr[MIN_LE] + mmlh[borders[i]].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i] - 1].wmmlr[MAX_LE] + mmlh[borders[i]].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left]: %f + %f, %f + %f", i, borders[i], border_infos[borders[i] - 1].wmmlr[MIN_LE], mmlh[borders[i]].weight_min, border_infos[borders[i] - 1].wmmlr[MAX_LE], mmlh[borders[i]].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MIN_LE] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MIN_LE] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MAX_LE] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MAX_LE] = mmlh[borders[i] ].weight_hig; } #endif } else /* backward */ { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-right]: %" slint_fmt " - %" slint_fmt ", %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i] + 1].cmmlr[MIN_RI], mmlh[borders[i] + 1].count_min, border_infos[borders[i] + 1].cmmlr[MAX_RI], mmlh[borders[i] + 1].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MAX_RI] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MAX_RI] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MIN_RI] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MIN_RI] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; 
border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-right]: %f - %f, %f - %f", i, borders[i], border_infos[borders[i] + 1].wmmlr[MIN_RI], mmlh[borders[i] + 1].weight_min, border_infos[borders[i] + 1].wmmlr[MAX_RI], mmlh[borders[i] + 1].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MAX_RI] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MAX_RI] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MIN_RI] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MIN_RI] = mmlh[borders[i] ].weight_hig; } #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); /* check against inconsistence */ if (border_infos[borders[i]].cmmlr[MIN_LE] > border_infos[borders[i]].cmmlr[MIN_RI]) border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i]].cmmlr[MIN_RI] = (border_infos[borders[i]].cmmlr[MIN_LE] + border_infos[borders[i]].cmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].cmmlr[MAX_LE] < border_infos[borders[i]].cmmlr[MAX_RI]) border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i]].cmmlr[MAX_RI] = (border_infos[borders[i]].cmmlr[MAX_LE] + border_infos[borders[i]].cmmlr[MAX_RI]) / 2; #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); /* check against inconsistence */ if 
(border_infos[borders[i]].wmmlr[MIN_LE] > border_infos[borders[i]].wmmlr[MIN_RI]) border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i]].wmmlr[MIN_RI] = (border_infos[borders[i]].wmmlr[MIN_LE] + border_infos[borders[i]].wmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].wmmlr[MAX_LE] < border_infos[borders[i]].wmmlr[MAX_RI]) border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i]].wmmlr[MAX_RI] = (border_infos[borders[i]].wmmlr[MAX_LE] + border_infos[borders[i]].wmmlr[MAX_RI]) / 2; #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": crange: %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); /* select highest min and lowest max */ current_cmm[0] = xmax(border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_RI]) - border_infos[borders[i]].crange[0]; current_cmm[1] = xmin(border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI]) - border_infos[borders[i]].crange[0]; if (rank == 0) SL_ASSERT(current_cmm[0] <= current_cmm[1]); if (rank == 0) SL_ASSERT(0 <= current_cmm[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_count: %" slint_fmt " - %" slint_fmt "", i, borders[i], current_cmm[0], current_cmm[1]); #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": wrange: %f - %f", i, 
borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); /* select highest min and lowest max */ current_wmm[0] = xmax(border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_RI]) - border_infos[borders[i]].wrange[0]; current_wmm[1] = xmin(border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI]) - border_infos[borders[i]].wrange[0]; if (rank == 0) SL_ASSERT(current_wmm[0] <= current_wmm[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_weight: %f - %f", i, borders[i], current_wmm[0], current_wmm[1]); #endif rti_tstop(rti_tid_mpi_select_exact_radix_while_check_pre); /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_classes); lcs = gcs = 0; #ifdef elem_weight lws = gws = 0.0; #endif for (k = 0; k < nclasses; ++k) { lc = local_counts[border_areas[i] * nclasses + k]; gc = global_counts[border_areas[i] * nclasses + k]; current_cmm[0] -= gc; current_cmm[1] -= gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_count: %" slint_fmt " - %" slint_fmt ", lc = %" slint_fmt ", lcs = %" slint_fmt ", gc = %" slint_fmt ", gcs = %" slint_fmt, i, borders[i], k, current_cmm[0], current_cmm[1], lc, lcs, gc, gcs); #ifdef elem_weight lw = local_weights[border_areas[i] * nclasses + k]; gw = global_weights[border_areas[i] * nclasses + k]; current_wmm[0] -= gw; current_wmm[1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_weight: %e - %e", i, borders[i], k, current_wmm[0], current_wmm[1]); #endif /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_cmm[1] < 0) #ifdef elem_weight || (current_cmm[0] < 0 && current_wmm[1] < 0.0) #endif ) { refine = 1; break; } lcs += lc; gcs += gc; gc = 0; #ifdef elem_weight lws += lw; gws += gw; gw = 0.0; #endif /* if between min/max counts */ if (current_cmm[0] 
<= 0 && current_cmm[1] >= 0) { #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "got to next: %d && %d", (current_cmm[1] > 0), (current_wmm[0] > 0)); /* go to next if max count not reached AND min weight not reached */ if (current_cmm[1] > 0 && current_wmm[0] > 0) continue; #endif /* look ahead for a better stop */ if (k + 1 < nclasses && current_cmm[1] - global_counts[border_areas[i] * nclasses + k + 1] >= 0) { #ifdef elem_weight /* continue if weights will improve */ if (myabs(current_wmm[0] + current_wmm[1]) > myabs(current_wmm[0] + current_wmm[1] - 2 * global_weights[border_areas[i] * nclasses + k + 1])) continue; #else /* continue if counts will improve */ if (myabs(current_cmm[0] + current_cmm[1]) > myabs(current_cmm[0] + current_cmm[1] - 2 * global_counts[border_areas[i] * nclasses + k + 1])) continue; #endif } /* stop */ break; } } SL_ASSERT_IF((rank == 0), k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": %s k = %" sl_key_pure_type_fmt ", lcs = %" slint_fmt, i, borders[i], (refine)?"REFINE":"HIT", k, lcs); /* make sure k is safe (it is used as index later) */ if (k >= nclasses) k = nclasses - 1; /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcsv[j] = 0; for (l = 0; l < k; ++l) lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + l]; if (refine) lcv[j] = area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; else { lcv[j] = 0; lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; } lcs -= lcsv[j]; } rti_tstop(rti_tid_mpi_select_exact_radix_while_check_classes); } else { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_final); k = 0; #ifdef elem_weight /* middle of min/max weight */ mw = (current_wmm[0] + current_wmm[1]) / 2.0; /* min. 
part of weight to contribute */ dw = xmax(0, mw - final_globals[NCONDS * i + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mw = %e, dw = %e", i, borders[i], mw, dw); #else /* middle of min/max count */ mc = (current_cmm[0] + current_cmm[1]) / 2; /* min. part of count to contribute */ dc = xmax(0, mc - final_globals[NCONDS * i + 0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mc = %" slint_fmt ", dc = %" slint_fmt, i, borders[i], mc, dc); #endif /* contribute all? */ if ( #ifdef elem_weight dw >= final_locals[NCONDS * i + 1] #else dc >= final_locals[NCONDS * i + 0] #endif ) { lc = final_locals[NCONDS * i + 0]; #ifdef elem_weight lw = final_locals[NCONDS * i + 1]; #endif } else { /* contribute only a part */ #ifdef elem_weight lc = 0; for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], areas[border_areas[i] * nelements + j].size, &end); for (elem_assign(&areas[border_areas[i] * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { dw -= elem_weight(&xi, 0); ++lc; if (dw < 0.0 || lc >= final_locals[NCONDS * i + 0]) { dw += elem_weight(&xi, 0); --lc; break; } } } lw = dw; #else lc = dc; #endif } /* check mc against min/max count borders */ lc = xminmax(current_cmm[0] - final_globals[NCONDS * i + 0], lc, current_cmm[1] - final_globals[NCONDS * i + 0]); /* check agains 0 (don't step back!) 
and the local contribution */ lc = xminmax(0, lc, final_locals[NCONDS * i + 0]); lcs = lc; #ifdef elem_weight lws = lw; #endif #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " <= %" slint_fmt " + %" slint_fmt " <= %" slint_fmt, i, borders[i], border_lo, i, direction, border_hi); if (border_lo <= i + direction && i + direction <= border_hi) SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " == %" slint_fmt " + %" slint_fmt, i, borders[i], borders[i + direction], borders[i], direction); /* FIXME: finalize geht auch rückwärts!!! */ /* if the next open border is really the _next_ border */ if (border_lo <= i + direction && i + direction <= border_hi && borders[i + direction] == borders[i] + direction) { /* determine the exact global counts/weights (damn, this is expensive) */ mcw[0] = lcs; mcw[1] = lws; MPI_Allreduce(&mcw[0], &mcw[2], 2, MPI_DOUBLE, MPI_SUM, comm); } else { /* the exact global counts/weights are not required */ mcw[2] = 0.0; mcw[3] = 0.0; } gc = 0; gcs = mcw[2]; gw = 0.0; gws = mcw[3]; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt ", gws = %f", i, borders[i], gcs, gws); #else /* the global count is simply mc */ gc = 0; gcs = mc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt, i, borders[i], gcs); #endif SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcs = %" slint_fmt, i, borders[i], lcs); /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcv[j] = 0; lcsv[j] = xmin(lcs, final_areas[i * nelements + j]); lcs -= lcsv[j]; } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcsv = ", "%" slint_fmt, j, nelements, lcsv, i, borders[i]); rti_tstop(rti_tid_mpi_select_exact_radix_while_check_final); } SL_ASSERT(lcs == 0); /* accept local contributions */ for (j = 0; j < nelements; ++j) sdispls[(borders[i] + 1) * nelements + j] 
+= lcsv[j]; rti_tstart(rti_tid_mpi_select_exact_radix_while_check_post); /* this is wrong, e.g., even if gc == 0 and gcs == 0 then crange[1] is set to crange[0]! */ /* if (gc > 0 || gcs > 0 #ifdef elem_weight || gw != 0.0 || gws != 0.0 #endif )*/ { border_infos[borders[i]].crange[0] += gcs; border_infos[borders[i]].crange[1] = border_infos[borders[i]].crange[0] + gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": counts_range: %" slint_fmt " %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_RI], border_infos[borders[i]].crange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] += gws; border_infos[borders[i]].wrange[1] = border_infos[borders[i]].wrange[0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weights_range: %f %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_LE], 
border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_RI], border_infos[borders[i]].wrange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 0: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[0], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 1: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[1], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[1]); if (border_infos[borders[i]].cmmlr[MIN_LE] != border_info_old.cmmlr[MIN_LE] || border_infos[borders[i]].cmmlr[MAX_LE] != border_info_old.cmmlr[MAX_LE] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_LE] != border_info_old.wmmlr[MIN_LE] || 
border_infos[borders[i]].wmmlr[MAX_LE] != border_info_old.wmmlr[MAX_LE] #endif ) border_infos[borders[i] + 1].update = 1; if (border_infos[borders[i]].cmmlr[MIN_RI] != border_info_old.cmmlr[MIN_RI] || border_infos[borders[i]].cmmlr[MAX_RI] != border_info_old.cmmlr[MAX_RI] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_RI] != border_info_old.wmmlr[MIN_RI] || border_infos[borders[i]].wmmlr[MAX_RI] != border_info_old.wmmlr[MAX_RI] #endif ) border_infos[borders[i] - 1].update = 1; border_infos[borders[i]].update = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == border_areas[i] && last_new_class == k) border_areas[i] = nareas_new - 1; else { /* update last_new_... */ last_new_area = border_areas[i]; last_new_class = k; /* create new area */ for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], lcsv[j], &areas_new[nareas_new * nelements + j]); areas_new[nareas_new * nelements + j].size = lcv[j]; } border_areas[i] = nareas_new; ++nareas_new; } } else { for (j = 0; j < nelements; ++j) final_areas[(i - nborders_removed * direction) * nelements + j] = lcv[j]; /* save local count/weight for the later prefix calculations */ final_locals[NCONDS * (i - nborders_removed * direction) + 0] = lc; #ifdef elem_weight final_locals[NCONDS * (i - nborders_removed * direction) + 1] = lw; #endif } borders[i - nborders_removed * direction] = borders[i]; border_areas[i - nborders_removed * direction] = border_areas[i]; } else ++nborders_removed; rti_tstop(rti_tid_mpi_select_exact_radix_while_check_post); i += direction; } /* restrict the parts */ if (direction > 0) border_hi -= nborders_removed; else border_lo += nborders_removed; /* change direction */ direction *= -1; rti_tstop(rti_tid_mpi_select_exact_radix_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } 
rti_tstop(rti_tid_mpi_select_exact_radix_while); sl_free(area_counts); sl_free(local_counts); sl_free(global_counts); rti_tstop(rti_tid_mpi_select_exact_radix); #ifdef VERIFY v = mpi_post_check_partconds(s, nelements, nparts, pconds, sdispls, size, rank, comm); SL_ASSERT_IF(rank == 0, v < 0); SL_NOTICE_IF(rank == 0, "post_check_partconds: %s (%" slint_fmt ")", (v >= 0)?"FAILED":"SUCCESS", v); #endif #ifdef PRINT_SDISPLS printf("%d: sdispls:", rank); for (i = 0; i < nparts; ++i) printf(" %d ", sdispls[i]); printf("\n"); #endif #ifdef PRINT_STATS mpi_select_stats(s, nparts, sdispls, size, rank, comm); #endif #if defined(PRINT_TIMINGS) && defined(SL_USE_RTI_TIM) if (rank == PRINT_TIMINGS) { printf("%d: mpi_select_exact_radix: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix)); printf("%d: mpi_select_exact_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_sync)); printf("%d: mpi_select_exact_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while)); printf("%d: mpi_select_exact_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_count)); printf("%d: mpi_select_exact_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_allreduce)); printf("%d: mpi_select_exact_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1)); printf("%d: mpi_select_exact_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1_allgather)); printf("%d: mpi_select_exact_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_exscan)); printf("%d: mpi_select_exact_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_check)); printf("%d: mpi_select_exact_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_pre)); printf("%d: mpi_select_exact_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_classes)); printf("%d: mpi_select_exact_radix: final: %f\n", rank, 
rti_tlast(rti_tid_mpi_select_exact_radix_while_check_final)); printf("%d: mpi_select_exact_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_post)); printf("%d: mpi_select_exact_radix: rounds: %" slint_fmt "\n", rank, round); } #endif return 0; }
/// Builds a box whose x extents are this object's xmin()/xmax() multiplied by
/// `width` and whose y extents are ymin()/ymax() multiplied by `height`.
/// The two maximum edges are additionally reduced by one.
/// NOTE(review): presumably the stored coordinates are fractions of the image
/// size and the result is in pixels -- confirm against the enclosing class.
nervana::boundingbox::box unnormalize(float width, float height)
{
    const auto left   = xmin() * width;
    const auto top    = ymin() * height;
    const auto right  = xmax() * width - 1;
    const auto bottom = ymax() * height - 1;

    return nervana::boundingbox::box(left, top, right, bottom);
}
// Fills vertices.storage / vertices.sorted from `points`, converting each
// coordinate with FloatToQ27Dot5 and merging runs of consecutive points that
// land on the same fixed-point position. Each kept vertex gets flags that
// describe how the incoming and outgoing edges move in y. *maxActiveEdges is
// reset to 0 and then raised by 2 for every vertex that starts an edge without
// ending one. vertices.nPoints is updated to the number of kept vertices and
// vertices.sorted is ordered with compareVertex.
//
// Returns the bounding rectangle of all points that were inspected.
// Note that the loop reads points[i + 1], so `points` must hold at least one
// element more than vertices.nPoints.
QRectF QTessellatorPrivate::collectAndSortVertices(const QPointF *points, int *maxActiveEdges)
{
    *maxActiveEdges = 0;

    Vertex *vert = vertices.storage;
    Vertex **slot = vertices.sorted;

    // Running bounding box, seeded with the first point.
    qreal left(points[0].x());
    qreal right(points[0].x());
    qreal top(points[0].y());
    qreal bottom(points[0].y());

    // collect vertex data
    Q27Dot5 y_prev = FloatToQ27Dot5(points[vertices.nPoints-1].y());
    Q27Dot5 x_next = FloatToQ27Dot5(points[0].x());
    Q27Dot5 y_next = FloatToQ27Dot5(points[0].y());

    int kept = 0;
    int idx = 0;
    while (idx < vertices.nPoints) {
        const Q27Dot5 y_curr = y_next;

        *slot = vert;
        vert->x = x_next;
        vert->y = y_next;
        vert->flags = 0;

        // Advance over successors until one differs from the current vertex
        // or the input is exhausted.
        bool exhausted = false;
        for (;;) {
            left = qMin(left, points[idx+1].x());
            right = qMax(right, points[idx+1].x());
            top = qMin(top, points[idx+1].y());
            bottom = qMax(bottom, points[idx+1].y());

            y_next = FloatToQ27Dot5(points[idx+1].y());
            x_next = FloatToQ27Dot5(points[idx+1].x());

            // skip vertices on top of each other
            if (!(vert->x == x_next && vert->y == y_next))
                break;

            ++idx;
            if (!(idx < vertices.nPoints)) {
                exhausted = true;
                break;
            }
        }

        if (exhausted) {
            // The trailing points all coincide with the current vertex, which
            // is therefore dropped; recompute the "line before" flags of the
            // very first vertex and stop.
            Vertex *first = vertices.storage;
            first->flags &= ~(LineBeforeStarts|LineBeforeEnds|LineBeforeHorizontal);
            if (y_prev < y_curr)
                first->flags |= LineBeforeEnds;
            else if (y_prev > y_curr)
                first->flags |= LineBeforeStarts;
            else
                first->flags |= LineBeforeHorizontal;

            if ((first->flags & (LineBeforeStarts|LineAfterStarts))
                && !(first->flags & (LineAfterEnds|LineBeforeEnds)))
                *maxActiveEdges += 2;
            break;
        }

        // Edge arriving at this vertex.
        if (y_prev < y_curr)
            vert->flags |= LineBeforeEnds;
        else if (y_prev > y_curr)
            vert->flags |= LineBeforeStarts;
        else
            vert->flags |= LineBeforeHorizontal;

        // Edge leaving this vertex.
        if (y_curr < y_next)
            vert->flags |= LineAfterStarts;
        else if (y_curr > y_next)
            vert->flags |= LineAfterEnds;
        else
            vert->flags |= LineAfterHorizontal;

        // ### could probably get better limit by looping over sorted list and counting down on ending edges
        if ((vert->flags & (LineBeforeStarts|LineAfterStarts))
            && !(vert->flags & (LineAfterEnds|LineBeforeEnds)))
            *maxActiveEdges += 2;

        y_prev = y_curr;
        ++vert;
        ++slot;
        ++kept;
        ++idx;
    }
    vertices.nPoints = kept;

    QDEBUG() << "maxActiveEdges=" << *maxActiveEdges;

    slot = vertices.sorted;
    qSort(slot, slot + vertices.nPoints, compareVertex);

    return QRectF(left, top, right-left, bottom-top);
}
/*
   Searches the doubles around xi = πi/(4N) for one where both cos_fn(xi)
   and sin_fn(xi), evaluated in 100-bit mpfr precision, differ from their
   nearest double by at least (52 + min_leeway) binary orders of magnitude
   (measured via EXP_OF_FLT64 on the value and on the rounding residual).
   Candidates are visited in bit-pattern order, alternating below and above
   the starting point. The search stops early once a candidate reaches
   perfect_leeway, and gives up after about 1 << 29 steps or when the
   candidate passes π(i + 1)/(4N).

   On success one table line "(xi, cos, sin), ..." is printed to stdout.

   All mpfr temporaries are released on every exit path.

   Returns >= zero iff successful
 */
static int find_triple_64(int i, int min_leeway, int perfect_leeway, mpfr_fn sin_fn, mpfr_fn cos_fn)
{
        /*
           Using mpfr is not entirely overkill for this; [Lut95]
           includes PASCAL fragments that use almost entirely integer
           arithmetic... but the error term in that only handles
           up to 13 extra bits of zeroes or so. We proudly boast
           at least 16 bits of extra zeroes in all cases.
         */
        mpfr_t xi;
        mpfr_t xip1;
        mpfr_t cos;
        mpfr_t sin;
        double xip1_d;
        double t;
        uint64_t sin_u;
        uint64_t cos_u;
        int e1;
        int e2;
        uint64_t xip1_u;
        double xi_initial;
        uint64_t xi_initial_u;
        double xi_current;
        uint64_t xi_current_u;
        long int r = 0;
        long int best_r = 0;
        int sgn = 1;
        int ml = min_leeway;
        int best_l = 0;
        /* Zero-initialised so they are never read while indeterminate. */
        uint64_t best_xi_u = 0;
        uint64_t best_sin_u = 0;
        uint64_t best_cos_u = 0;
        time_t start;
        time_t end;
        int ret = -1;

        start = time(0);
        mpfr_init2(xi, 100);
        mpfr_init2(xip1, 100);
        mpfr_init2(cos, 100);
        mpfr_init2(sin, 100);

        /* start out at xi = πi/(4N) */
        mpfr_const_pi(xi, MPFR_RNDN);
        mpfr_mul_si(xip1, xi, (long int) (i + 1), MPFR_RNDN);
        mpfr_mul_si(xi, xi, (long int) i, MPFR_RNDN);
        mpfr_div_si(xi, xi, (long int) 4 * N, MPFR_RNDN);
        mpfr_div_si(xip1, xip1, (long int) 4 * N, MPFR_RNDN);
        xip1_d = mpfr_get_d(xip1, MPFR_RNDN);
        xip1_u = FLT64_TO_UINT64(xip1_d);
        xi_initial = mpfr_get_d(xi, MPFR_RNDN);
        xi_initial_u = FLT64_TO_UINT64(xi_initial);

        while (1) {
                /* Candidate: the double whose bit pattern is +/- r away. */
                xi_current_u = xi_initial_u + (sgn * r);
                xi_current = UINT64_TO_FLT64(xi_current_u);
                mpfr_set_d(xi, xi_current, MPFR_RNDN);

                /* Test if cos(xi) has enough zeroes */
                cos_fn(cos, xi, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                cos_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                /* Residual left after rounding to double. */
                mpfr_sub_d(cos, cos, t, MPFR_RNDN);
                t = mpfr_get_d(cos, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {
                        /* Damn; this is too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                ml = xmax(min_leeway, e1 - e2 - 52);

                /* Test if sin(xi) has enough zeroes */
                sin_fn(sin, xi, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                sin_u = FLT64_TO_UINT64(t);
                e1 = EXP_OF_FLT64(t);
                mpfr_sub_d(sin, sin, t, MPFR_RNDN);
                t = mpfr_get_d(sin, MPFR_RNDN);
                e2 = EXP_OF_FLT64(t);

                if (e2 == -1024) {
                        /* Damn; this is too close to a subnormal. i = 0 or N? */
                        goto done;
                }

                if (e1 - e2 < (52 + min_leeway)) {
                        goto inc;
                }

                /* The usable leeway is the smaller of the cos and sin ones. */
                ml = xmin(ml, e1 - e2 - 52);

                /* Hurrah, this is valid */
                if (ml > best_l) {
                        best_l = ml;
                        best_xi_u = xi_current_u;
                        best_cos_u = cos_u;
                        best_sin_u = sin_u;
                        best_r = sgn * r;

                        /* If this is super-good, don't bother finding more */
                        if (best_l >= perfect_leeway) {
                                break;
                        }
                }

inc:
                /* Increment */
                sgn *= -1;
                if (sgn < 0) {
                        r++;
                } else if (r > (1 << 29) || xi_current_u > xip1_u) {
                        /*
                           This is taking too long, give up looking
                           for perfection and take the best we've
                           got. A sweep of 1 << 28 finishes in ~60
                           hrs on my personal machine as I write
                           this.
                         */
                        break;
                }
        }

        end = time(0);

        if (best_l > min_leeway) {
                printf(
                        "(%#018lx, %#018lx, %#018lx), /* i = %03d, l = %02d, r = %010ld, t = %ld */ \n",
                        best_xi_u, best_cos_u, best_sin_u, i, best_l, best_r,
                        end - start);
                ret = 0;
        }

done:
        /* Single exit: release every mpfr temporary initialised above. */
        mpfr_clear(xi);
        mpfr_clear(xip1);
        mpfr_clear(cos);
        mpfr_clear(sin);

        return ret;
}
/* Formats FORMAT with the arguments in ARGS into a string of CHAR_T.

   If RESULTBUF is non-NULL it is used as the initial output buffer and
   *LENGTHP gives its capacity in CHAR_T units; when it is too small (or
   RESULTBUF is NULL) the output is placed in malloc/realloc'ed memory.

   On success returns the output buffer (either RESULTBUF or heap memory),
   NUL-terminated, and stores the number of units produced, not counting
   the terminator, in *LENGTHP.

   On failure returns NULL with errno set to EINVAL (format could not be
   parsed, arguments could not be fetched, or the underlying formatter
   reported an error) or ENOMEM (size overflow or allocation failure).
   Heap memory owned by this function is released before returning NULL;
   RESULTBUF itself is never freed.

   Two strategies are compiled in depending on USE_SNPRINTF: either each
   directive is printed straight into the result with SNPRINTF, or it is
   printed with sprintf into a scratch buffer sized from an upper bound and
   then copied.  */
CHAR_T *
VASNPRINTF (CHAR_T *resultbuf, size_t *lengthp, const CHAR_T *format, va_list args)
{
  DIRECTIVES d;
  arguments a;

  if (PRINTF_PARSE (format, &d, &a) < 0)
    {
      errno = EINVAL;
      return NULL;
    }

  /* Releases what PRINTF_PARSE allocated.  */
#define CLEANUP() \
  free (d.dir); \
  if (a.arg) \
    free (a.arg);

  if (printf_fetchargs (args, &a) < 0)
    {
      CLEANUP ();
      errno = EINVAL;
      return NULL;
    }

  {
    size_t buf_neededlength;
    CHAR_T *buf;
    CHAR_T *buf_malloced;
    const CHAR_T *cp;
    size_t i;
    DIRECTIVE *dp;
    /* Output string accumulator.  */
    CHAR_T *result;
    size_t allocated;
    size_t length;
#if !USE_SNPRINTF
    /* Heap scratch buffer of the directive currently being formatted, or
       NULL when there is none.  Tracked at this scope so that every error
       exit can release it.  */
    CHAR_T *tmp_malloced = NULL;
#endif

    /* Allocate a small buffer that holds the single-directive format
       string handed to the underlying formatter.  */
    buf_neededlength = xsum4 (7, d.max_width_length, d.max_precision_length, 6);
#if HAVE_ALLOCA
    if (buf_neededlength < 4000 / sizeof (CHAR_T))
      {
        buf = (CHAR_T *) alloca (buf_neededlength * sizeof (CHAR_T));
        buf_malloced = NULL;
      }
    else
#endif
      {
        size_t buf_memsize = xtimes (buf_neededlength, sizeof (CHAR_T));
        if (size_overflow_p (buf_memsize))
          goto out_of_memory_1;
        buf = (CHAR_T *) malloc (buf_memsize);
        if (buf == NULL)
          goto out_of_memory_1;
        buf_malloced = buf;
      }

    if (resultbuf != NULL)
      {
        result = resultbuf;
        allocated = *lengthp;
      }
    else
      {
        result = NULL;
        allocated = 0;
      }
    length = 0;
    /* From here on result is either resultbuf, NULL, or heap memory owned
       by this function.  */

    /* Ensures that allocated >= needed, growing at least geometrically.
       The first growth away from resultbuf copies the LENGTH units already
       written.  Jumps to out_of_memory on size overflow or allocation
       failure.  */
#define ENSURE_ALLOCATION(needed) \
    if ((needed) > allocated) \
      { \
        size_t memory_size; \
        CHAR_T *memory; \
        \
        allocated = (allocated > 0 ? xtimes (allocated, 2) : 12); \
        if ((needed) > allocated) \
          allocated = (needed); \
        memory_size = xtimes (allocated, sizeof (CHAR_T)); \
        if (size_overflow_p (memory_size)) \
          goto out_of_memory; \
        if (result == resultbuf || result == NULL) \
          memory = (CHAR_T *) malloc (memory_size); \
        else \
          memory = (CHAR_T *) realloc (result, memory_size); \
        if (memory == NULL) \
          goto out_of_memory; \
        if (result == resultbuf && length > 0) \
          memcpy (memory, result, length * sizeof (CHAR_T)); \
        result = memory; \
      }

    for (cp = format, i = 0, dp = &d.dir[0]; ; cp = dp->dir_end, i++, dp++)
      {
        /* Copy the literal text that precedes this directive (or, after
           the last directive, the tail of the format).  */
        if (cp != dp->dir_start)
          {
            size_t n = dp->dir_start - cp;
            size_t augmented_length = xsum (length, n);

            ENSURE_ALLOCATION (augmented_length);
            memcpy (result + length, cp, n * sizeof (CHAR_T));
            length = augmented_length;
          }
        if (i == d.count)
          break;

        /* Execute a single directive.  */
        if (dp->conversion == '%')
          {
            size_t augmented_length;

            if (!(dp->arg_index == ARG_NONE))
              abort ();
            augmented_length = xsum (length, 1);
            ENSURE_ALLOCATION (augmented_length);
            result[length] = '%';
            length = augmented_length;
          }
        else
          {
            if (!(dp->arg_index != ARG_NONE))
              abort ();

            if (dp->conversion == 'n')
              {
                /* Store the number of units written so far through the
                   pointer argument.  */
                switch (a.arg[dp->arg_index].type)
                  {
                  case TYPE_COUNT_SCHAR_POINTER:
                    *a.arg[dp->arg_index].a.a_count_schar_pointer = length;
                    break;
                  case TYPE_COUNT_SHORT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_short_pointer = length;
                    break;
                  case TYPE_COUNT_INT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_int_pointer = length;
                    break;
                  case TYPE_COUNT_LONGINT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_longint_pointer = length;
                    break;
#ifdef HAVE_LONG_LONG
                  case TYPE_COUNT_LONGLONGINT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_longlongint_pointer = length;
                    break;
#endif
                  default:
                    abort ();
                  }
              }
            else
              {
                arg_type type = a.arg[dp->arg_index].type;
                CHAR_T *p;
                unsigned int prefix_count;
                int prefixes[2];
#if !USE_SNPRINTF
                size_t tmp_length;
                CHAR_T tmpbuf[700];
                CHAR_T *tmp;

                /* Compute tmp_length, an upper bound on the number of
                   units sprintf can produce for this directive, including
                   the terminating NUL.  */
                {
                  size_t width;
                  size_t precision;

                  width = 0;
                  if (dp->width_start != dp->width_end)
                    {
                      if (dp->width_arg_index != ARG_NONE)
                        {
                          int arg;

                          if (!(a.arg[dp->width_arg_index].type == TYPE_INT))
                            abort ();
                          arg = a.arg[dp->width_arg_index].a.a_int;
                          width = (arg < 0 ? (unsigned int) (-arg) : arg);
                        }
                      else
                        {
                          const CHAR_T *digitp = dp->width_start;

                          do
                            width = xsum (xtimes (width, 10), *digitp++ - '0');
                          while (digitp != dp->width_end);
                        }
                    }

                  precision = 6;
                  if (dp->precision_start != dp->precision_end)
                    {
                      if (dp->precision_arg_index != ARG_NONE)
                        {
                          int arg;

                          if (!(a.arg[dp->precision_arg_index].type == TYPE_INT))
                            abort ();
                          arg = a.arg[dp->precision_arg_index].a.a_int;
                          precision = (arg < 0 ? 0 : arg);
                        }
                      else
                        {
                          /* The "+ 1" skips the first character of the
                             precision span before the digits.  */
                          const CHAR_T *digitp = dp->precision_start + 1;

                          precision = 0;
                          do
                            precision = xsum (xtimes (precision, 10), *digitp++ - '0');
                          while (digitp != dp->precision_end);
                        }
                    }

                  /* Per-conversion bound.  The integer cases scale the bit
                     width of the type by digits-per-bit (0.30103 ~ log10 2
                     for decimal, 1/3 for octal, 1/4 for hex).  */
                  switch (dp->conversion)
                    {
                    case 'd': case 'i': case 'u':
# ifdef HAVE_LONG_LONG
                      if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long long) * CHAR_BIT
                                          * 0.30103
                                          * 2
                                         )
                          + 1
                          + 1;
                      else
# endif
                      if (type == TYPE_LONGINT || type == TYPE_ULONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long) * CHAR_BIT
                                          * 0.30103
                                          * 2
                                         )
                          + 1
                          + 1;
                      else
                        tmp_length =
                          (unsigned int) (sizeof (unsigned int) * CHAR_BIT
                                          * 0.30103
                                          * 2
                                         )
                          + 1
                          + 1;
                      break;

                    case 'o':
# ifdef HAVE_LONG_LONG
                      if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long long) * CHAR_BIT
                                          * 0.333334
                                         )
                          + 1
                          + 1;
                      else
# endif
                      if (type == TYPE_LONGINT || type == TYPE_ULONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long) * CHAR_BIT
                                          * 0.333334
                                         )
                          + 1
                          + 1;
                      else
                        tmp_length =
                          (unsigned int) (sizeof (unsigned int) * CHAR_BIT
                                          * 0.333334
                                         )
                          + 1
                          + 1;
                      break;

                    case 'x': case 'X':
# ifdef HAVE_LONG_LONG
                      if (type == TYPE_LONGLONGINT || type == TYPE_ULONGLONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long long) * CHAR_BIT
                                          * 0.25
                                         )
                          + 1
                          + 2;
                      else
# endif
                      if (type == TYPE_LONGINT || type == TYPE_ULONGINT)
                        tmp_length =
                          (unsigned int) (sizeof (unsigned long) * CHAR_BIT
                                          * 0.25
                                         )
                          + 1
                          + 2;
                      else
                        tmp_length =
                          (unsigned int) (sizeof (unsigned int) * CHAR_BIT
                                          * 0.25
                                         )
                          + 1
                          + 2;
                      break;

                    case 'f': case 'F':
# ifdef HAVE_LONG_DOUBLE
                      if (type == TYPE_LONGDOUBLE)
                        tmp_length =
                          (unsigned int) (LDBL_MAX_EXP
                                          * 0.30103
                                          * 2
                                         )
                          + 1
                          + 10;
                      else
# endif
                        tmp_length =
                          (unsigned int) (DBL_MAX_EXP
                                          * 0.30103
                                          * 2
                                         )
                          + 1
                          + 10;
                      tmp_length = xsum (tmp_length, precision);
                      break;

                    case 'e': case 'E': case 'g': case 'G':
                    case 'a': case 'A':
                      tmp_length =
                        12;
                      tmp_length = xsum (tmp_length, precision);
                      break;

                    case 'c':
# if defined HAVE_WINT_T && !WIDE_CHAR_VERSION
                      if (type == TYPE_WIDE_CHAR)
                        tmp_length = MB_CUR_MAX;
                      else
# endif
                        tmp_length = 1;
                      break;

                    case 's':
# ifdef HAVE_WCHAR_T
                      if (type == TYPE_WIDE_STRING)
                        {
                          tmp_length =
                            local_wcslen (a.arg[dp->arg_index].a.a_wide_string);

#  if !WIDE_CHAR_VERSION
                          tmp_length = xtimes (tmp_length, MB_CUR_MAX);
#  endif
                        }
                      else
# endif
                        tmp_length = strlen (a.arg[dp->arg_index].a.a_string);
                      break;

                    case 'p':
                      tmp_length =
                        (unsigned int) (sizeof (void *) * CHAR_BIT
                                        * 0.25
                                       )
                          + 1
                          + 2;
                      break;

                    default:
                      abort ();
                    }

                  /* Padding can widen the output up to the field width.  */
                  if (tmp_length < width)
                    tmp_length = width;

                  /* Room for the terminating NUL.  */
                  tmp_length = xsum (tmp_length, 1);
                }

                /* Use the stack buffer when it is large enough, otherwise
                   allocate the scratch buffer on the heap.  */
                if (tmp_length <= sizeof (tmpbuf) / sizeof (CHAR_T))
                  tmp = tmpbuf;
                else
                  {
                    size_t tmp_memsize = xtimes (tmp_length, sizeof (CHAR_T));

                    if (size_overflow_p (tmp_memsize))
                      goto out_of_memory;
                    tmp = (CHAR_T *) malloc (tmp_memsize);
                    if (tmp == NULL)
                      goto out_of_memory;
                    tmp_malloced = tmp;
                  }
#endif

                /* Construct the format string for the underlying call:
                   '%', flags, width, precision, length modifier,
                   conversion.  */
                p = buf;
                *p++ = '%';
                if (dp->flags & FLAG_GROUP)
                  *p++ = '\'';
                if (dp->flags & FLAG_LEFT)
                  *p++ = '-';
                if (dp->flags & FLAG_SHOWSIGN)
                  *p++ = '+';
                if (dp->flags & FLAG_SPACE)
                  *p++ = ' ';
                if (dp->flags & FLAG_ALT)
                  *p++ = '#';
                if (dp->flags & FLAG_ZERO)
                  *p++ = '0';
                if (dp->width_start != dp->width_end)
                  {
                    size_t n = dp->width_end - dp->width_start;
                    memcpy (p, dp->width_start, n * sizeof (CHAR_T));
                    p += n;
                  }
                if (dp->precision_start != dp->precision_end)
                  {
                    size_t n = dp->precision_end - dp->precision_start;
                    memcpy (p, dp->precision_start, n * sizeof (CHAR_T));
                    p += n;
                  }

                switch (type)
                  {
#ifdef HAVE_LONG_LONG
                  case TYPE_LONGLONGINT:
                  case TYPE_ULONGLONGINT:
                    *p++ = 'l';
                    /* Deliberate fall through: "ll" is built from two 'l'.  */
#endif
                  case TYPE_LONGINT:
                  case TYPE_ULONGINT:
#ifdef HAVE_WINT_T
                  case TYPE_WIDE_CHAR:
#endif
#ifdef HAVE_WCHAR_T
                  case TYPE_WIDE_STRING:
#endif
                    *p++ = 'l';
                    break;
#ifdef HAVE_LONG_DOUBLE
                  case TYPE_LONGDOUBLE:
                    *p++ = 'L';
                    break;
#endif
                  default:
                    break;
                  }
                *p = dp->conversion;
#if USE_SNPRINTF
                /* Append "%n" so the produced count can be read back.  */
                p[1] = '%';
                p[2] = 'n';
                p[3] = '\0';
#else
                p[1] = '\0';
#endif

                /* Collect the '*' width / precision arguments, in order.  */
                prefix_count = 0;
                if (dp->width_arg_index != ARG_NONE)
                  {
                    if (!(a.arg[dp->width_arg_index].type == TYPE_INT))
                      abort ();
                    prefixes[prefix_count++] = a.arg[dp->width_arg_index].a.a_int;
                  }
                if (dp->precision_arg_index != ARG_NONE)
                  {
                    if (!(a.arg[dp->precision_arg_index].type == TYPE_INT))
                      abort ();
                    prefixes[prefix_count++] = a.arg[dp->precision_arg_index].a.a_int;
                  }

#if USE_SNPRINTF
                /* Make sure there is at least one unit of room and start
                   from an empty string.  */
                ENSURE_ALLOCATION (xsum (length, 1));
                result[length] = '\0';
#endif

                /* Retry loop: repeats (USE_SNPRINTF only) after growing the
                   result or after dropping the "%n" suffix.  */
                for (;;)
                  {
                    size_t maxlen;
                    int count;
                    int retcount;

                    maxlen = allocated - length;
                    count = -1;
                    retcount = 0;

#if USE_SNPRINTF
# define SNPRINTF_BUF(arg) \
                    switch (prefix_count) \
                      { \
                      case 0: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             arg, &count); \
                        break; \
                      case 1: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             prefixes[0], arg, &count); \
                        break; \
                      case 2: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             prefixes[0], prefixes[1], arg, \
                                             &count); \
                        break; \
                      default: \
                        abort (); \
                      }
#else
# define SNPRINTF_BUF(arg) \
                    switch (prefix_count) \
                      { \
                      case 0: \
                        count = sprintf (tmp, buf, arg); \
                        break; \
                      case 1: \
                        count = sprintf (tmp, buf, prefixes[0], arg); \
                        break; \
                      case 2: \
                        count = sprintf (tmp, buf, prefixes[0], prefixes[1],\
                                         arg); \
                        break; \
                      default: \
                        abort (); \
                      }
#endif

                    /* Pass the argument with the type it has after default
                       argument promotion.  */
                    switch (type)
                      {
                      case TYPE_SCHAR:
                        {
                          int arg = a.arg[dp->arg_index].a.a_schar;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_UCHAR:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_uchar;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_SHORT:
                        {
                          int arg = a.arg[dp->arg_index].a.a_short;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_USHORT:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_ushort;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_INT:
                        {
                          int arg = a.arg[dp->arg_index].a.a_int;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_UINT:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_uint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_LONGINT:
                        {
                          long int arg = a.arg[dp->arg_index].a.a_longint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_ULONGINT:
                        {
                          unsigned long int arg = a.arg[dp->arg_index].a.a_ulongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_LONG_LONG
                      case TYPE_LONGLONGINT:
                        {
                          long long int arg = a.arg[dp->arg_index].a.a_longlongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_ULONGLONGINT:
                        {
                          unsigned long long int arg = a.arg[dp->arg_index].a.a_ulonglongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_DOUBLE:
                        {
                          double arg = a.arg[dp->arg_index].a.a_double;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_LONG_DOUBLE
                      case TYPE_LONGDOUBLE:
                        {
                          long double arg = a.arg[dp->arg_index].a.a_longdouble;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_CHAR:
                        {
                          int arg = a.arg[dp->arg_index].a.a_char;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_WINT_T
                      case TYPE_WIDE_CHAR:
                        {
                          wint_t arg = a.arg[dp->arg_index].a.a_wide_char;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_STRING:
                        {
                          const char *arg = a.arg[dp->arg_index].a.a_string;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_WCHAR_T
                      case TYPE_WIDE_STRING:
                        {
                          const wchar_t *arg = a.arg[dp->arg_index].a.a_wide_string;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_POINTER:
                        {
                          void *arg = a.arg[dp->arg_index].a.a_pointer;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      default:
                        abort ();
                      }

#if USE_SNPRINTF
                    /* Work out how many units were, or would have been,
                       produced.  count stays -1 when "%n" was not
                       honoured.  */
                    if (count >= 0)
                      {
                        /* The output must be NUL-terminated where "%n"
                           says it ends.  */
                        if (count < maxlen && result[length + count] != '\0')
                          abort ();
                        if (retcount > count)
                          count = retcount;
                      }
                    else
                      {
                        if (p[1] != '\0')
                          {
                            /* Retry once without the "%n" suffix and rely
                               on the return value instead.  */
                            p[1] = '\0';
                            continue;
                          }
                        else
                          {
                            if (retcount < 0)
                              {
                                /* Neither "%n" nor the return value gave a
                                   size: grow the buffer and try again.  */
                                size_t bigger_need =
                                  xsum (xtimes (allocated, 2), 12);
                                ENSURE_ALLOCATION (bigger_need);
                                continue;
                              }
                            else
                              count = retcount;
                          }
                      }
#endif

                    /* The formatter reported an error.  */
                    if (count < 0)
                      {
#if !USE_SNPRINTF
                        /* Release a heap scratch buffer, if any.  */
                        free (tmp_malloced);
#endif
                        if (!(result == resultbuf || result == NULL))
                          free (result);
                        free (buf_malloced);
                        CLEANUP ();
                        errno = EINVAL;
                        return NULL;
                      }

#if !USE_SNPRINTF
                    /* sprintf wrote more than the bound computed above;
                       the scratch buffer has been overrun.  */
                    if (count >= tmp_length)
                      abort ();
#endif

                    /* Make room for the result.  */
                    if (count >= maxlen)
                      {
                        /* Grow to at least length + count, but at least
                           double, so that retries make progress.  */
                        size_t n =
                          xmax (xsum (length, count), xtimes (allocated, 2));

                        ENSURE_ALLOCATION (n);
#if USE_SNPRINTF
                        continue;
#endif
                      }

#if USE_SNPRINTF
                    /* The result has already been stored in place.  */
#else
                    /* Append the scratch output to the result.  */
                    memcpy (result + length, tmp, count * sizeof (CHAR_T));
                    if (tmp != tmpbuf)
                      free (tmp);
                    tmp_malloced = NULL;
#endif

                    length += count;
                    break;
                  }
              }
          }
      }

    /* Add the final NUL.  */
    ENSURE_ALLOCATION (xsum (length, 1));
    result[length] = '\0';

    if (result != resultbuf && length + 1 < allocated)
      {
        /* Shrink the allocated memory if possible.  */
        CHAR_T *memory;

        memory = (CHAR_T *) realloc (result, (length + 1) * sizeof (CHAR_T));
        if (memory != NULL)
          result = memory;
      }

    free (buf_malloced);
    CLEANUP ();
    *lengthp = length;
    return result;

  out_of_memory:
#if !USE_SNPRINTF
    /* NULL unless a directive was interrupted with a heap scratch buffer.  */
    free (tmp_malloced);
#endif
    if (!(result == resultbuf || result == NULL))
      free (result);
    free (buf_malloced);

  out_of_memory_1:
    CLEANUP ();
    errno = ENOMEM;
    return NULL;
  }
}
/* Formats FORMAT with the arguments in ARGS into a string of CHAR_T.

   If RESULTBUF is non-NULL it is used as the initial output buffer and
   *LENGTHP gives its capacity in CHAR_T units; when it is too small (or
   RESULTBUF is NULL) the output is placed in malloc/realloc'ed memory.

   On success returns the output buffer (either RESULTBUF or heap memory),
   NUL-terminated, and stores the number of units produced, not counting
   the terminator, in *LENGTHP.

   On failure returns NULL with errno set to EINVAL (format could not be
   parsed, arguments could not be fetched, or SNPRINTF reported an error),
   ENOMEM (size overflow or allocation failure) or EOVERFLOW (the result
   is longer than INT_MAX; note that *LENGTHP has already been written in
   that case).  RESULTBUF itself is never freed.

   NOTE(review): SNPRINTF_BUF is only defined under USE_SNPRINTF, so this
   variant appears to support that configuration only; the remaining
   "#if !USE_SNPRINTF" check refers to a tmp_length that is not declared
   in this function -- confirm and consider removing it.  */
CHAR_T *
VASNPRINTF (CHAR_T *resultbuf, size_t *lengthp, const CHAR_T *format, va_list args)
{
  DIRECTIVES d;
  arguments a;

  if (PRINTF_PARSE (format, &d, &a) < 0)
    {
      errno = EINVAL;
      return NULL;
    }

  /* Releases what PRINTF_PARSE allocated.  */
#define CLEANUP() \
  free (d.dir); \
  if (a.arg) \
    free (a.arg);

  if (printf_fetchargs (args, &a) < 0)
    {
      CLEANUP ();
      errno = EINVAL;
      return NULL;
    }

  {
    size_t buf_neededlength;
    CHAR_T *buf;
    CHAR_T *buf_malloced;
    const CHAR_T *cp;
    size_t i;
    DIRECTIVE *dp;
    /* Output string accumulator.  */
    CHAR_T *result;
    size_t allocated;
    size_t length;

    /* Allocate a small buffer that will hold a directive passed to
       snprintf.  */
    buf_neededlength = xsum4 (7, d.max_width_length, d.max_precision_length, 6);
#if HAVE_ALLOCA
    if (buf_neededlength < 4000 / sizeof (CHAR_T))
      {
        buf = (CHAR_T *) alloca (buf_neededlength * sizeof (CHAR_T));
        buf_malloced = NULL;
      }
    else
#endif
      {
        size_t buf_memsize = xtimes (buf_neededlength, sizeof (CHAR_T));
        if (size_overflow_p (buf_memsize))
          goto out_of_memory_1;
        buf = (CHAR_T *) malloc (buf_memsize);
        if (buf == NULL)
          goto out_of_memory_1;
        buf_malloced = buf;
      }

    if (resultbuf != NULL)
      {
        result = resultbuf;
        allocated = *lengthp;
      }
    else
      {
        result = NULL;
        allocated = 0;
      }
    length = 0;
    /* Invariants:
       result is either == resultbuf or == NULL or malloc-allocated.
       If length > 0, then result != NULL.  */

    /* Ensures that allocated >= needed.  Aborts through a jump to
       out_of_memory if needed is SIZE_MAX or otherwise too big.
       Growth is at least geometric; the first growth away from resultbuf
       copies the LENGTH units already written.  */
#define ENSURE_ALLOCATION(needed) \
    if ((needed) > allocated) \
      { \
        size_t memory_size; \
        CHAR_T *memory; \
        \
        allocated = (allocated > 0 ? xtimes (allocated, 2) : 12); \
        if ((needed) > allocated) \
          allocated = (needed); \
        memory_size = xtimes (allocated, sizeof (CHAR_T)); \
        if (size_overflow_p (memory_size)) \
          goto out_of_memory; \
        if (result == resultbuf || result == NULL) \
          memory = (CHAR_T *) malloc (memory_size); \
        else \
          memory = (CHAR_T *) realloc (result, memory_size); \
        if (memory == NULL) \
          goto out_of_memory; \
        if (result == resultbuf && length > 0) \
          memcpy (memory, result, length * sizeof (CHAR_T)); \
        result = memory; \
      }

    for (cp = format, i = 0, dp = &d.dir[0]; ; cp = dp->dir_end, i++, dp++)
      {
        /* Copy the literal text that precedes this directive (or, after
           the last directive, the tail of the format).  */
        if (cp != dp->dir_start)
          {
            size_t n = dp->dir_start - cp;
            size_t augmented_length = xsum (length, n);

            ENSURE_ALLOCATION (augmented_length);
            memcpy (result + length, cp, n * sizeof (CHAR_T));
            length = augmented_length;
          }
        if (i == d.count)
          break;

        /* Execute a single directive.  */
        if (dp->conversion == '%')
          {
            size_t augmented_length;

            if (!(dp->arg_index == ARG_NONE))
              abort ();
            augmented_length = xsum (length, 1);
            ENSURE_ALLOCATION (augmented_length);
            result[length] = '%';
            length = augmented_length;
          }
        else
          {
            if (!(dp->arg_index != ARG_NONE))
              abort ();

            if (dp->conversion == 'n')
              {
                /* Store the number of units written so far through the
                   pointer argument.  */
                switch (a.arg[dp->arg_index].type)
                  {
                  case TYPE_COUNT_SCHAR_POINTER:
                    *a.arg[dp->arg_index].a.a_count_schar_pointer = length;
                    break;
                  case TYPE_COUNT_SHORT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_short_pointer = length;
                    break;
                  case TYPE_COUNT_INT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_int_pointer = length;
                    break;
                  case TYPE_COUNT_LONGINT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_longint_pointer = length;
                    break;
#ifdef HAVE_LONG_LONG
                  case TYPE_COUNT_LONGLONGINT_POINTER:
                    *a.arg[dp->arg_index].a.a_count_longlongint_pointer = length;
                    break;
#endif
                  default:
                    abort ();
                  }
              }
            else
              {
                arg_type type = a.arg[dp->arg_index].type;
                CHAR_T *p;
                unsigned int prefix_count;
                int prefixes[2];

                /* Construct the format string for calling snprintf.  */
                p = buf;
                *p++ = '%';
                if (dp->flags & FLAG_GROUP)
                  *p++ = '\'';
                if (dp->flags & FLAG_LEFT)
                  *p++ = '-';
                if (dp->flags & FLAG_SHOWSIGN)
                  *p++ = '+';
                if (dp->flags & FLAG_SPACE)
                  *p++ = ' ';
                if (dp->flags & FLAG_ALT)
                  *p++ = '#';
                if (dp->flags & FLAG_ZERO)
                  *p++ = '0';
                if (dp->width_start != dp->width_end)
                  {
                    size_t n = dp->width_end - dp->width_start;
                    memcpy (p, dp->width_start, n * sizeof (CHAR_T));
                    p += n;
                  }
                if (dp->precision_start != dp->precision_end)
                  {
                    size_t n = dp->precision_end - dp->precision_start;
                    memcpy (p, dp->precision_start, n * sizeof (CHAR_T));
                    p += n;
                  }

                /* Length modifier: "ll" is built from two 'l' via the
                   fall through below.  */
                switch (type)
                  {
#ifdef HAVE_LONG_LONG
                  case TYPE_LONGLONGINT:
                  case TYPE_ULONGLONGINT:
                    *p++ = 'l';
                    /*FALLTHROUGH*/
#endif
                  case TYPE_LONGINT:
                  case TYPE_ULONGINT:
#ifdef HAVE_WINT_T
                  case TYPE_WIDE_CHAR:
#endif
#ifdef HAVE_WCHAR_T
                  case TYPE_WIDE_STRING:
#endif
                    *p++ = 'l';
                    break;
#ifdef HAVE_LONG_DOUBLE
                  case TYPE_LONGDOUBLE:
                    *p++ = 'L';
                    break;
#endif
                  default:
                    break;
                  }
                *p = dp->conversion;
#if USE_SNPRINTF
                /* Append "%n" so the produced count can be read back.  */
                p[1] = '%';
                p[2] = 'n';
                p[3] = '\0';
#else
                p[1] = '\0';
#endif

                /* Construct the arguments for calling snprintf.  */
                prefix_count = 0;
                if (dp->width_arg_index != ARG_NONE)
                  {
                    if (!(a.arg[dp->width_arg_index].type == TYPE_INT))
                      abort ();
                    prefixes[prefix_count++] = a.arg[dp->width_arg_index].a.a_int;
                  }
                if (dp->precision_arg_index != ARG_NONE)
                  {
                    if (!(a.arg[dp->precision_arg_index].type == TYPE_INT))
                      abort ();
                    prefixes[prefix_count++] = a.arg[dp->precision_arg_index].a.a_int;
                  }

#if USE_SNPRINTF
                /* Prepare checking whether snprintf returns the count
                   via %n.  */
                ENSURE_ALLOCATION (xsum (length, 1));
                result[length] = '\0';
#endif

                /* Retry loop: repeats after growing the result or after
                   dropping the "%n" suffix.  */
                for (;;)
                  {
                    size_t maxlen;
                    int count;
                    int retcount;

                    maxlen = allocated - length;
                    count = -1;
                    retcount = 0;

#if USE_SNPRINTF
# define SNPRINTF_BUF(arg) \
                    switch (prefix_count) \
                      { \
                      case 0: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             arg, &count); \
                        break; \
                      case 1: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             prefixes[0], arg, &count); \
                        break; \
                      case 2: \
                        retcount = SNPRINTF (result + length, maxlen, buf, \
                                             prefixes[0], prefixes[1], arg, \
                                             &count); \
                        break; \
                      default: \
                        abort (); \
                      }
#endif

                    /* Pass the argument with the type it has after default
                       argument promotion.  */
                    switch (type)
                      {
                      case TYPE_SCHAR:
                        {
                          int arg = a.arg[dp->arg_index].a.a_schar;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_UCHAR:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_uchar;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_SHORT:
                        {
                          int arg = a.arg[dp->arg_index].a.a_short;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_USHORT:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_ushort;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_INT:
                        {
                          int arg = a.arg[dp->arg_index].a.a_int;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_UINT:
                        {
                          unsigned int arg = a.arg[dp->arg_index].a.a_uint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_LONGINT:
                        {
                          long int arg = a.arg[dp->arg_index].a.a_longint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_ULONGINT:
                        {
                          unsigned long int arg = a.arg[dp->arg_index].a.a_ulongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_LONG_LONG
                      case TYPE_LONGLONGINT:
                        {
                          long long int arg = a.arg[dp->arg_index].a.a_longlongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      case TYPE_ULONGLONGINT:
                        {
                          unsigned long long int arg = a.arg[dp->arg_index].a.a_ulonglongint;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_DOUBLE:
                        {
                          double arg = a.arg[dp->arg_index].a.a_double;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_LONG_DOUBLE
                      case TYPE_LONGDOUBLE:
                        {
                          long double arg = a.arg[dp->arg_index].a.a_longdouble;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_CHAR:
                        {
                          int arg = a.arg[dp->arg_index].a.a_char;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_WINT_T
                      case TYPE_WIDE_CHAR:
                        {
                          wint_t arg = a.arg[dp->arg_index].a.a_wide_char;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_STRING:
                        {
                          const char *arg = a.arg[dp->arg_index].a.a_string;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#ifdef HAVE_WCHAR_T
                      case TYPE_WIDE_STRING:
                        {
                          const wchar_t *arg = a.arg[dp->arg_index].a.a_wide_string;
                          SNPRINTF_BUF (arg);
                        }
                        break;
#endif
                      case TYPE_POINTER:
                        {
                          void *arg = a.arg[dp->arg_index].a.a_pointer;
                          SNPRINTF_BUF (arg);
                        }
                        break;
                      default:
                        abort ();
                      }

#if USE_SNPRINTF
                    /* Portability: Not all implementations of snprintf()
                       are ISO C 99 compliant.  Determine the number of
                       bytes that snprintf() has produced or would have
                       produced.  */
                    if (count >= 0)
                      {
                        /* Verify that snprintf() has NUL-terminated its
                           result.  */
                        if (count < maxlen && result[length + count] != '\0')
                          abort ();
                        /* Portability hack.  */
                        if (retcount > count)
                          count = retcount;
                      }
                    else
                      {
                        /* snprintf() doesn't understand the '%n'
                           directive.  */
                        if (p[1] != '\0')
                          {
                            /* Don't use the '%n' directive; instead, look
                               at the snprintf() return value.  */
                            p[1] = '\0';
                            continue;
                          }
                        else
                          {
                            /* Look at the snprintf() return value.  */
                            if (retcount < 0)
                              {
                                /* HP-UX 10.20 snprintf() is doubly deficient:
                                   It doesn't understand the '%n' directive,
                                   *and* it returns -1 (rather than the length
                                   that would have been required) when the
                                   buffer is too small.  */
                                size_t bigger_need =
                                  xsum (xtimes (allocated, 2), 12);
                                ENSURE_ALLOCATION (bigger_need);
                                continue;
                              }
                            else
                              count = retcount;
                          }
                      }
#endif

                    /* Attempt to handle failure.  */
                    if (count < 0)
                      {
                        if (!(result == resultbuf || result == NULL))
                          free (result);
                        if (buf_malloced != NULL)
                          free (buf_malloced);
                        CLEANUP ();
                        errno = EINVAL;
                        return NULL;
                      }

#if !USE_SNPRINTF
                    /* NOTE(review): tmp_length is not declared anywhere in
                       this function; this check looks like a leftover from
                       a sprintf-based fallback -- confirm.  */
                    if (count >= tmp_length)
                      /* tmp_length was incorrectly calculated - fix the
                         code above!  */
                      abort ();
#endif

                    /* Make room for the result.  */
                    if (count >= maxlen)
                      {
                        /* Need at least count bytes.  But allocate
                           proportionally, to avoid looping eternally if
                           snprintf() reports a too small count.  */
                        size_t n =
                          xmax (xsum (length, count), xtimes (allocated, 2));

                        ENSURE_ALLOCATION (n);
#if USE_SNPRINTF
                        continue;
#endif
                      }

                    /* The directive's output is already in place in
                       result; just account for it.  */
                    length += count;
                    break;
                  }
              }
          }
      }

    /* Add the final NUL.  */
    ENSURE_ALLOCATION (xsum (length, 1));
    result[length] = '\0';

    if (result != resultbuf && length + 1 < allocated)
      {
        /* Shrink the allocated memory if possible.  */
        CHAR_T *memory;

        memory = (CHAR_T *) realloc (result, (length + 1) * sizeof (CHAR_T));
        if (memory != NULL)
          result = memory;
      }

    if (buf_malloced != NULL)
      free (buf_malloced);
    CLEANUP ();
    *lengthp = length;
    /* The length is stored before the range check below, so callers see
       it even when NULL is returned with EOVERFLOW.  */
    if (length > INT_MAX)
      goto length_overflow;
    return result;

  length_overflow:
    /* We could produce such a big string, but its length doesn't fit into
       an 'int'.  POSIX says that snprintf() fails with errno = EOVERFLOW in
       this case.  */
    /* buf_malloced and the parse data were already released above; only
       a heap result remains to be freed.  */
    if (result != resultbuf)
      free (result);
    errno = EOVERFLOW;
    return NULL;

  out_of_memory:
    if (!(result == resultbuf || result == NULL))
      free (result);
    if (buf_malloced != NULL)
      free (buf_malloced);

  out_of_memory_1:
    CLEANUP ();
    errno = ENOMEM;
    return NULL;
  }
}