static int Integrate(This *t, real *integral, real *error, real *prob) { TYPEDEFREGION; count dim, comp, df; int fail; Result totals[NCOMP]; Region *anchor = NULL, *region = NULL; if( VERBOSE > 1 ) { char s[256]; sprintf(s, "Suave input parameters:\n" " ndim " COUNT "\n ncomp " COUNT "\n" " epsrel " REAL "\n epsabs " REAL "\n" " flags %d\n seed %d\n" " mineval " NUMBER "\n maxeval " NUMBER "\n" " nnew " NUMBER "\n flatness " REAL, t->ndim, t->ncomp, t->epsrel, t->epsabs, t->flags, t->seed, t->mineval, t->maxeval, t->nnew, t->flatness); Print(s); } if( BadComponent(t) ) return -2; if( BadDimension(t) ) return -1; if( (fail = setjmp(t->abort)) ) goto abort; t->epsabs = Max(t->epsabs, NOTZERO); IniRandom(t); RegionAlloc(t, anchor, t->nnew, t->nnew); anchor->next = NULL; anchor->div = 0; for( dim = 0; dim < t->ndim; ++dim ) { Bounds *b = &anchor->bounds[dim]; b->lower = 0; b->upper = 1; b->mid = .5; if( dim == 0 ) { count bin; /* define the initial distribution of bins */ for( bin = 0; bin < NBINS; ++bin ) b->grid[bin] = (bin + 1)/(real)NBINS; } else Copy(b->grid, anchor->bounds[0].grid, NBINS); } Sample(t, t->nnew, anchor, anchor->w, anchor->w + t->nnew, anchor->w + t->nnew + t->ndim*t->nnew); df = anchor->df; ResCopy(totals, anchor->result); for( t->nregions = 1; ; ++t->nregions ) { Var var[NDIM][2], *vLR; real maxratio, maxerr, minfluct, bias, mid; Region *regionL, *regionR, *reg, **parent, **par; Bounds *bounds, *boundsL, *boundsR; count maxcomp, bisectdim; number n, nL, nR, nnewL, nnewR; real *w, *wL, *wR, *x, *xL, *xR, *f, *fL, *fR, *wlast, *flast; if( VERBOSE ) { char s[128 + 128*NCOMP], *p = s; p += sprintf(p, "\n" "Iteration " COUNT ": " NUMBER " integrand evaluations so far", t->nregions, t->neval); for( comp = 0; comp < t->ncomp; ++comp ) { cResult *tot = &totals[comp]; p += sprintf(p, "\n[" COUNT "] " REAL " +- " REAL " \tchisq " REAL " (" COUNT " df)", comp + 1, tot->avg, tot->err, tot->chisq, df); } Print(s); } maxratio = -INFTY; maxcomp = 0; for( comp = 
0; comp < t->ncomp; ++comp ) { creal ratio = totals[comp].err/MaxErr(totals[comp].avg); if( ratio > maxratio ) { maxratio = ratio; maxcomp = comp; } } if( maxratio <= 1 && t->neval >= t->mineval ) { fail = 0; break; } if( t->neval >= t->maxeval ) break; maxerr = -INFTY; parent = &anchor; region = anchor; for( par = &anchor; (reg = *par); par = ®->next ) { creal err = reg->result[maxcomp].err; if( err > maxerr ) { maxerr = err; parent = par; region = reg; } } Fluct(t, var[0], region->bounds, region->w, region->n, maxcomp, region->result[maxcomp].avg, Max(maxerr, t->epsabs)); bias = (t->epsrel < 1e-50) ? 2 : Max(pow(2., -(real)region->div/t->ndim)/t->epsrel, 2.); minfluct = INFTY; bisectdim = 0; for( dim = 0; dim < t->ndim; ++dim ) { cBounds *b = ®ion->bounds[dim]; creal fluct = (var[dim][0].fluct + var[dim][1].fluct)* (bias - b->upper + b->lower); if( fluct < minfluct ) { minfluct = fluct; bisectdim = dim; } } vLR = var[bisectdim]; minfluct = vLR[0].fluct + vLR[1].fluct; nnewL = IMax( (minfluct == 0) ? 
t->nnew/2 : (count)(vLR[0].fluct/minfluct*t->nnew), MINSAMPLES ); nL = vLR[0].n + nnewL; nnewR = IMax(t->nnew - nnewL, MINSAMPLES); nR = vLR[1].n + nnewR; RegionAlloc(t, regionL, nL, nnewL); RegionAlloc(t, regionR, nR, nnewR); *parent = regionL; regionL->next = regionR; regionR->next = region->next; regionL->div = regionR->div = region->div + 1; bounds = ®ion->bounds[bisectdim]; mid = bounds->mid; n = region->n; w = wlast = region->w; x = w + n; f = flast = x + n*t->ndim; wL = regionL->w; xL = wL + nL; fL = xL + nL*t->ndim; wR = regionR->w; xR = wR + nR; fR = xR + nR*t->ndim; while( n-- ) { cbool final = (*w < 0); if( x[bisectdim] < mid ) { if( final && wR > regionR->w ) *(wR - 1) = -fabs(*(wR - 1)); *wL++ = *w++; VecCopy(xL, x); xL += t->ndim; ResCopy(fL, f); fL += t->ncomp; } else { if( final && wL > regionL->w ) *(wL - 1) = -fabs(*(wL - 1)); *wR++ = *w++; VecCopy(xR, x); xR += t->ndim; ResCopy(fR, f); fR += t->ncomp; } x += t->ndim; f += t->ncomp; if( n && final ) wlast = w, flast = f; } Reweight(t, region->bounds, wlast, flast, f, totals); VecCopy(regionL->bounds, region->bounds); VecCopy(regionR->bounds, region->bounds); boundsL = ®ionL->bounds[bisectdim]; boundsR = ®ionR->bounds[bisectdim]; boundsL->mid = .5*(boundsL->lower + (boundsL->upper = mid)); boundsR->mid = .5*((boundsR->lower = mid) + boundsR->upper); StretchGrid(bounds->grid, boundsL->grid, boundsR->grid); Sample(t, nnewL, regionL, wL, xL, fL); Sample(t, nnewR, regionR, wR, xR, fR); df += regionL->df + regionR->df - region->df; for( comp = 0; comp < t->ncomp; ++comp ) { cResult *r = ®ion->result[comp]; Result *rL = ®ionL->result[comp]; Result *rR = ®ionR->result[comp]; Result *tot = &totals[comp]; real diff, sigsq; tot->avg += diff = rL->avg + rR->avg - r->avg; diff = Sq(.25*diff); sigsq = rL->sigsq + rR->sigsq; if( sigsq > 0 ) { creal c = Sq(1 + sqrt(diff/sigsq)); rL->sigsq *= c; rR->sigsq *= c; } rL->err = sqrt(rL->sigsq += diff); rR->err = sqrt(rR->sigsq += diff); tot->sigsq += rL->sigsq + 
rR->sigsq - r->sigsq; tot->err = sqrt(tot->sigsq); tot->chisq += rL->chisq + rR->chisq - r->chisq; } free(region); region = NULL; }
/// Finalizes the accumulated raw feature columns into the DataProvider:
/// establishes the sample order (timestamp sort or shuffle), fills query
/// pairs, binarizes float features and perfect-hashes categorical ones
/// (in parallel), then publishes columns, names and target borders.
/// May only be called once per builder instance.
void TDataProviderBuilder::Finish() {
    CB_ENSURE(!IsDone, "Error: can't finish more than once");
    DataProvider.Features.reserve(FeatureValues.size());

    // Start with the identity permutation over all samples.
    DataProvider.Order.resize(DataProvider.Targets.size());
    std::iota(DataProvider.Order.begin(),
              DataProvider.Order.end(),
              0);

    // If any timestamps are present (not all zero), order by timestamp
    // instead of shuffling — the two are mutually exclusive.
    if (!AreEqualTo<ui64>(DataProvider.Timestamp, 0)) {
        ShuffleFlag = false;
        DataProvider.Order = CreateOrderByKey(DataProvider.Timestamp);
    }

    bool hasQueryIds = HasQueryIds(DataProvider.QueryIds);
    if (!hasQueryIds) {
        DataProvider.QueryIds.resize(0);
    }

    //TODO(noxoomo): it's not safe here, if we change order with shuffle everything'll go wrong
    if (Pairs.size()) {
        //they are local, so we don't need shuffle
        CB_ENSURE(hasQueryIds, "Error: for GPU pairwise learning you should provide query id column. Query ids will be used to split data between devices and for dynamic boosting learning scheme.");
        DataProvider.FillQueryPairs(Pairs);
    }

    if (ShuffleFlag) {
        if (hasQueryIds) {
            //should not change order inside query for pairs consistency
            QueryConsistentShuffle(Seed, 1, DataProvider.QueryIds, &DataProvider.Order);
        } else {
            Shuffle(Seed, 1, DataProvider.Targets.size(), &DataProvider.Order);
        }
        DataProvider.SetShuffleSeed(Seed);
    }

    // Apply the chosen permutation to the non-feature (meta) columns; the
    // feature columns themselves are permuted below while being copied.
    if (ShuffleFlag || !DataProvider.Timestamp.empty()) {
        DataProvider.ApplyOrderToMetaColumns();
    }

    TVector<TString> featureNames;
    featureNames.resize(FeatureValues.size());

    // Lock guards FeaturesManager mutations from the parallel lambda below.
    TAdaptiveLock lock;

    NPar::TLocalExecutor executor;
    executor.RunAdditionalThreads(BuildThreads - 1);

    TVector<TFeatureColumnPtr> featureColumns(FeatureValues.size());

    if (!IsTest) {
        RegisterFeaturesInFeatureManager(featureColumns);
    }

    // Per-feature float borders computed in the parallel loop; published
    // to FeaturesManager afterwards on the single-threaded path.
    TVector<TVector<float>> grid;
    grid.resize(FeatureValues.size());

    // Each iteration touches only its own featureId slots of featureNames,
    // featureColumns, grid and FeatureValues, so no locking is needed for
    // those; FeaturesManager calls are either guarded or — presumably —
    // internally synchronized (NOTE(review): confirm for
    // HasFloatFeatureBorders/GetFloatFeatureBorders/BuildBorders).
    NPar::ParallelFor(executor, 0, FeatureValues.size(), [&](ui32 featureId) {
        auto featureName = GetFeatureName(featureId);
        featureNames[featureId] = featureName;

        if (FeatureValues[featureId].size() == 0) {
            return;  // feature entirely absent: leave its column null
        }

        // Materialize the feature in the final sample order.
        TVector<float> line(DataProvider.Order.size());
        for (ui32 i = 0; i < DataProvider.Order.size(); ++i) {
            line[i] = FeatureValues[featureId][DataProvider.Order[i]];
        }

        if (CatFeatureIds.has(featureId)) {
            // Cat values travel as float bit-patterns reinterpreted as ui32.
            static_assert(sizeof(float) == sizeof(ui32), "Error: float size should be equal to ui32 size");
            // For test data, skip categories never seen during learn
            // (no hash entries were created for this feature).
            const bool shouldSkip = IsTest && (CatFeaturesPerfectHashHelper.GetUniqueValues(featureId) == 0);
            if (!shouldSkip) {
                auto data = CatFeaturesPerfectHashHelper.UpdatePerfectHashAndBinarize(featureId,
                                                                                     ~line,
                                                                                     line.size());
                const ui32 uniqueValues = CatFeaturesPerfectHashHelper.GetUniqueValues(featureId);
                // A single unique value carries no information: drop it.
                if (uniqueValues > 1) {
                    auto compressedData = CompressVector<ui64>(~data, line.size(), IntLog2(uniqueValues));
                    featureColumns[featureId] = MakeHolder<TCatFeatureValuesHolder>(featureId,
                                                                                    line.size(),
                                                                                    std::move(compressedData),
                                                                                    uniqueValues,
                                                                                    featureName);
                }
            }
        } else {
            auto floatFeature = MakeHolder<TFloatValuesHolder>(featureId,
                                                               std::move(line),
                                                               featureName);
            TVector<float>& borders = grid[featureId];
            ENanMode nanMode = ENanMode::Forbidden;
            {
                // GetOrCreateNanMode mutates shared manager state.
                TGuard<TAdaptiveLock> guard(lock);
                nanMode = FeaturesManager.GetOrCreateNanMode(*floatFeature);
            }

            // Reuse existing borders (e.g. from the learn pass) if known.
            if (FeaturesManager.HasFloatFeatureBorders(*floatFeature)) {
                borders = FeaturesManager.GetFloatFeatureBorders(*floatFeature);
            }

            // Borders are only built on learn data, never on test data.
            if (borders.empty() && !IsTest) {
                const auto& floatValues = floatFeature->GetValues();
                NCatboostOptions::TBinarizationOptions config = FeaturesManager.GetFloatFeatureBinarization();
                config.NanMode = nanMode;
                borders = BuildBorders(floatValues, floatFeature->GetId(), config);
            }
            if (borders.ysize() == 0) {
                MATRIXNET_DEBUG_LOG << "Float Feature #" << featureId << " is empty" << Endl;
                return;  // constant/empty feature: leave its column null
            }

            auto binarizedData = BinarizeLine(floatFeature->GetValues().data(),
                                              floatFeature->GetValues().size(),
                                              nanMode,
                                              borders);
            // +1 bin beyond the borders; one more if NaNs get their own bin.
            const int binCount = static_cast<const int>(borders.size() + 1 + (ENanMode::Forbidden != nanMode));
            auto compressedLine = CompressVector<ui64>(binarizedData, IntLog2(binCount));

            featureColumns[featureId] = MakeHolder<TBinarizedFloatValuesHolder>(featureId,
                                                                                floatFeature->GetValues().size(),
                                                                                nanMode,
                                                                                borders,
                                                                                std::move(compressedLine),
                                                                                featureName);
        }

        //Free memory
        {
            // swap with a fresh vector actually releases the capacity
            // (clear() alone would keep it allocated).
            auto emptyVec = TVector<float>();
            FeatureValues[featureId].swap(emptyVec);
        }
    });

    // Single-threaded publication pass: report empties, hand borders to the
    // manager, and move the surviving columns into the DataProvider.
    for (ui32 featureId = 0; featureId < featureColumns.size(); ++featureId) {
        if (CatFeatureIds.has(featureId)) {
            if (featureColumns[featureId] == nullptr && (!IsTest)) {
                MATRIXNET_DEBUG_LOG << "Cat Feature #" << featureId << " is empty" << Endl;
            }
        } else if (featureColumns[featureId] != nullptr) {
            if (!FeaturesManager.HasFloatFeatureBordersForDataProviderFeature(featureId)) {
                FeaturesManager.SetFloatFeatureBordersForDataProviderId(featureId,
                                                                        std::move(grid[featureId]));
            }
        }
        if (featureColumns[featureId] != nullptr) {
            DataProvider.Features.push_back(std::move(featureColumns[featureId]));
        }
    }

    DataProvider.BuildIndicesRemap();

    // Target borders are also learned only from the learn (non-test) pass.
    if (!IsTest) {
        TOnCpuGridBuilderFactory gridBuilderFactory;
        FeaturesManager.SetTargetBorders(TBordersBuilder(gridBuilderFactory,
                                                         DataProvider.GetTargets())(FeaturesManager.GetTargetBinarizationDescription()));
    }

    DataProvider.FeatureNames = featureNames;
    DataProvider.CatFeatureIds = CatFeatureIds;

    if (ClassesWeights.size()) {
        Reweight(DataProvider.Targets, ClassesWeights, &DataProvider.Weights);
    }

    IsDone = true;
}