// Builds a copy of oldFeaturePath with the element at eraseElementIdx removed
// and the path weights recomputed as if that element had never been appended.
//
// oldFeaturePath   path to unwind; must contain at least one element
// eraseElementIdx  position of the element to drop; must be < oldFeaturePath.size()
//
// Returns a path with oldFeaturePath.size() - 1 elements.
// Throws (via CB_ENSURE) when the path is empty or the index is out of range.
static TVector<TFeaturePathElement> UnwindFeaturePath(const TVector<TFeaturePathElement>& oldFeaturePath, size_t eraseElementIdx) {
    const size_t pathLength = oldFeaturePath.size();
    CB_ENSURE(pathLength > 0, "Path to unwind must have at least one element");
    // Without this check oldFeaturePath[eraseElementIdx] below would read out of bounds.
    CB_ENSURE(eraseElementIdx < pathLength, "Index of the path element to unwind is out of range");

    // Start from the first pathLength - 1 elements; this also carries over their weights.
    TVector<TFeaturePathElement> newFeaturePath(oldFeaturePath.begin(), oldFeaturePath.begin() + pathLength - 1);

    // Shift the feature data of every later element one slot to the left, over the
    // erased one. Weights are deliberately NOT shifted: they are recomputed below.
    for (size_t elementIdx = eraseElementIdx; elementIdx < pathLength - 1; ++elementIdx) {
        newFeaturePath[elementIdx].Feature = oldFeaturePath[elementIdx + 1].Feature;
        newFeaturePath[elementIdx].ZeroPathsFraction = oldFeaturePath[elementIdx + 1].ZeroPathsFraction;
        newFeaturePath[elementIdx].OnePathsFraction = oldFeaturePath[elementIdx + 1].OnePathsFraction;
    }

    const double onePathsFraction = oldFeaturePath[eraseElementIdx].OnePathsFraction;
    const double zeroPathsFraction = oldFeaturePath[eraseElementIdx].ZeroPathsFraction;
    double weightDiff = oldFeaturePath[pathLength - 1].Weight;

    // Signed start index: for pathLength == 1 it is -1 and both loops are skipped.
    const int lastIdx = static_cast<int>(pathLength) - 2;

    if (!FuzzyEquals(onePathsFraction, 0.0)) {
        // Walk from the tail, peeling off the contribution of the erased element.
        for (int elementIdx = lastIdx; elementIdx >= 0; --elementIdx) {
            double oldWeight = newFeaturePath[elementIdx].Weight;
            newFeaturePath[elementIdx].Weight = weightDiff * pathLength / (onePathsFraction * (elementIdx + 1));
            weightDiff = oldWeight - newFeaturePath[elementIdx].Weight * zeroPathsFraction * (pathLength - elementIdx - 1) / pathLength;
        }
    } else {
        // onePathsFraction is (fuzzily) zero, so only the zero-fraction scaling has to be undone.
        // NOTE(review): this divides by zeroPathsFraction; presumably callers never pass an
        // element whose both fractions are zero - confirm.
        for (int elementIdx = lastIdx; elementIdx >= 0; --elementIdx) {
            newFeaturePath[elementIdx].Weight *= pathLength / (zeroPathsFraction * (pathLength - elementIdx - 1));
        }
    }

    return newFeaturePath;
}
// Maps a sub-range of this object's Keys onto the corresponding span of word data.
//
// keys  must point inside Keys (verified with Y_ASSERT); may be empty.
//
// Returns an empty view for empty input. Otherwise returns a view running from the
// beginning of the Words entry matching the first key to the end of the Words entry
// matching the last key.
// NOTE(review): presumably the Words entries are laid out back to back in memory,
// otherwise the returned range would not be meaningful - confirm.
TConstArrayRef<T> Remap(const TConstArrayRef<ui64>& keys) {
    if (keys.empty()) {
        return TConstArrayRef<T>();
    }

    Y_ASSERT(keys.begin() >= Keys.begin() && keys.begin() <= Keys.end());
    Y_ASSERT(keys.end() >= Keys.begin() && keys.end() <= Keys.end());

    // Positions of the first and the last requested key inside Keys.
    const auto firstPos = keys.begin() - Keys.begin();
    const auto lastPos = keys.end() - Keys.begin() - 1;

    return TConstArrayRef<T>(Words[firstPos].begin(), Words[lastPos].end());
}
// Converts raw approximations into the requested prediction type.
//
// predictionType  what to produce: Probability, Class or RawFormulaVal
// approx          raw approximations; the non-multiclass branches read only approx[0]
// localExecutor   forwarded to CalcSoftmax / SelectBestClass in the multiclass branches
//
// Returns:
//   Probability    - CalcSoftmax(approx) for multiclass, otherwise {CalcSigmoid(approx[0])}
//   Class          - a single row: SelectBestClass(...) for multiclass, otherwise
//                    (approx[0][i] > 0) for every value of approx[0]
//   RawFormulaVal  - a copy of approx
// Any other prediction type trips Y_ASSERT and yields an empty result.
TVector<TVector<double>> PrepareEval(const EPredictionType predictionType, const TVector<TVector<double>>& approx, NPar::TLocalExecutor* localExecutor) {
    TVector<TVector<double>> result;
    switch (predictionType) {
        case EPredictionType::Probability:
            if (IsMulticlass(approx)) {
                result = CalcSoftmax(approx, localExecutor);
            } else {
                result = {CalcSigmoid(approx[0])};
            }
            break;
        case EPredictionType::Class:
            result.resize(1);
            if (IsMulticlass(approx)) {
                // assign() sizes the row itself, no reserve needed.
                TVector<int> predictions = {SelectBestClass(approx, localExecutor)};
                result[0].assign(predictions.begin(), predictions.end());
            } else {
                // One output per value of approx[0]; reserve exactly that many
                // (the outer approx.size() is unrelated to the number pushed here).
                result[0].reserve(approx[0].size());
                for (const double prediction : approx[0]) {
                    result[0].push_back(prediction > 0);
                }
            }
            break;
        case EPredictionType::RawFormulaVal:
            result = approx;
            break;
        default:
            Y_ASSERT(false);
    }
    return result;
}
// Computes the effect of every feature of the model on the given pool.
//
// model        trained model; a model without trees yields an empty result
// pool         data used to collect leaf statistics; must contain documents and features
// threadCount  written to system_options.thread_count of the training params
//
// Returns (effect, feature) pairs ordered by decreasing effect (ties are ordered by
// decreasing feature position, as std::greater on the pair dictates).
// Throws (via CB_ENSURE) when the pool has no documents or no features.
TVector<std::pair<double, TFeature>> CalcFeatureEffect(const TFullModel& model, const TPool& pool, int threadCount/*= 1*/) {
    CB_ENSURE(pool.Docs.GetDocCount() != 0, "Pool should not be empty");
    if (model.GetTreeCount() == 0) {
        return TVector<std::pair<double, TFeature>>();
    }
    // Validate before doing any work; the tree count is already known to be non-zero here.
    CB_ENSURE(pool.Docs.GetFactorsCount() > 0, "no features in pool");

    int featureCount = pool.Docs.GetFactorsCount();
    NJson::TJsonValue jsonParams = ReadTJsonValue(model.ModelInfo.at("params"));
    jsonParams["system_options"].InsertValue("thread_count", threadCount);
    // NOTE(review): ctx is not referenced below; it is kept because its construction
    // may have effects that are not visible from this function - confirm.
    TCommonContext ctx(jsonParams, Nothing(), Nothing(), featureCount, pool.CatFeatures, pool.FeatureId);

    TVector<TFeature> features;
    TVector<TMxTree> trees = BuildMatrixnetTrees(model, &features);
    TVector<TVector<ui64>> leavesStatistics = CollectLeavesStatistics(pool, model);
    TVector<double> effect = CalcEffect(trees, leavesStatistics);

    // Pair each effect with the index of its feature so the order can be restored after sorting.
    TVector<std::pair<double, int>> effectWithFeature;
    effectWithFeature.reserve(effect.size());
    for (int i = 0; i < effect.ysize(); ++i) {
        effectWithFeature.emplace_back(effect[i], i);
    }
    Sort(effectWithFeature.begin(), effectWithFeature.end(), std::greater<std::pair<double, int>>());

    TVector<std::pair<double, TFeature>> result;
    result.reserve(effectWithFeature.size());
    for (const auto& effectAndIdx : effectWithFeature) {
        result.emplace_back(effectAndIdx.first, features[effectAndIdx.second]);
    }
    return result;
}
void set_points(const TVector &x, const TVector &y) { assert(x.size() == y.size()); assert(x.size()>2); int n =static_cast<int>(x.size()); m_x.assign(x.begin(), x.end()); m_c0.assign(y.begin(), y.end()); m_c1.resize(n); m_c2.resize(n); m_c3.resize(n); Vector<T, e_host> m_h(n); Vector<T, e_host> m_l(n); Vector<T, e_host> m_mu(n); Vector<T, e_host> m_z(n); n--; for(auto i = 0; i < n; i++) { m_h[i] = m_x[i+1]-m_x[i]; } m_l[0] = 1; m_mu[0] = 0; m_z[0] = 0; for(auto i = 1; i < n; i++) { m_l[i] = 2*(m_x[i+1]-m_x[i-1])-m_h[i-1]*m_mu[i-1]; m_mu[i] = m_h[i]/m_l[i]; auto alpha = 3*(m_c0[i+1]-m_c0[i])/m_h[i]-3*(m_c0[i]-m_c0[i-1])/m_h[i-1]; m_z[i] = (alpha-m_h[i-1]*m_z[i-1])/m_l[i]; } m_l[n] = 1; m_z[n] = 0; m_c2[n] = 0; for(auto i =n-1; i >= 0; i--) { m_c2[i] = m_z[i] - m_mu[i]*m_c2[i+1]; m_c1[i] = (m_c0[i+1]-m_c0[i])/m_h[i]-m_h[i]*(m_c2[i+1]+2*m_c2[i])/3; m_c3[i] = (m_c2[i+1]-m_c2[i])/(3*m_h[i]); } }
// Returns a copy of oldFeaturePath with one more element appended for `feature`,
// and with all path weights updated for the two given fractions.
//
// zeroPathsFraction  factor applied to the weight that stays at an element's position
// onePathsFraction   factor applied to the weight that moves one position up
// feature            feature stored in the appended element
//
// The appended element starts with weight 1 when the old path is empty and 0 otherwise.
static TVector<TFeaturePathElement> ExtendFeaturePath(const TVector<TFeaturePathElement>& oldFeaturePath, double zeroPathsFraction, double onePathsFraction, int feature) {
    const size_t pathLength = oldFeaturePath.size();

    TVector<TFeaturePathElement> newFeaturePath;
    newFeaturePath.reserve(pathLength + 1);
    newFeaturePath.assign(oldFeaturePath.begin(), oldFeaturePath.end());

    const double weight = (pathLength == 0) ? 1.0 : 0.0;
    newFeaturePath.push_back(TFeaturePathElement(feature, zeroPathsFraction, onePathsFraction, weight));

    // Walk from the tail so that every old weight is read before it is overwritten.
    for (int idx = static_cast<int>(pathLength) - 1; idx >= 0; --idx) {
        TFeaturePathElement& current = newFeaturePath[idx];
        TFeaturePathElement& next = newFeaturePath[idx + 1];
        next.Weight += onePathsFraction * current.Weight * (idx + 1) / (pathLength + 1);
        current.Weight = zeroPathsFraction * current.Weight * (pathLength - idx) / (pathLength + 1);
    }

    return newFeaturePath;
}
void CalcSoftmax(const TVector<double>& approx, TVector<double>* softmax) { double maxApprox = *MaxElement(approx.begin(), approx.end()); double sumExpApprox = 0; for (int dim = 0; dim < approx.ysize(); ++dim) { double expApprox = exp(approx[dim] - maxApprox); (*softmax)[dim] = expApprox; sumExpApprox += expApprox; } for (auto& curSoftmax : *softmax) { curSoftmax /= sumExpApprox; } }
void TestsOfTheRevolutionWillNotFallSilent(){ TVector<int> a; TVector<int> b; for (int i=0; i<20; ++i){ a.push_back(i); b.push_back(20-i); } cout << "a: "; print(a); cout << "b: "; print(b); cout << "Popback for both" << endl; a.pop_back(); cout << "a: "; print(a); cout << "b: "; print(b); cout << "Swhap" << endl; a.Swap(b); cout << "a: "; print(a); cout << "b: "; print(b); cout << "INSERTIONNN" << endl; a.insert(a.begin()+2, 9); b.insert(b.begin()+2, 0); cout << "a: "; print(a); cout << "b: "; print(b); cout << "More insertion!" << endl; a.insert(a.begin(), 3, 4); cout << "a: "; print(a); cout << "b: "; print(b); cout << "Destroy the lesser middle nodes!" << endl; a.erase(a.begin()+7); b.erase(b.begin()+6, b.begin()+8); cout << "a: "; print(a); cout << "b: "; print(b); system("pause"); };
// Signed-rank test on the paired samples baseline[i] / test[i].
//
// baseline, test  paired observations; pairs with a zero difference are dropped.
//                 NOTE(review): test is indexed up to baseline.size() without a size
//                 check - presumably callers pass vectors of equal length; confirm.
//
// Returns WPlus / WMinus (rank sums of the positive / negative differences) and PValue.
// With fewer than two non-zero differences the result is PValue = 0.5 and zero rank sums.
// Otherwise PValue holds one minus the two-sided significance computed below.
TWxTestResult WxTest(const TVector<double>& baseline, const TVector<double>& test) {
    TVector<double> diffs;
    for (ui32 i = 0; i < baseline.size(); i++) {
        const double diff = baseline[i] - test[i];
        if (diff != 0) {
            diffs.push_back(diff);
        }
    }

    if (diffs.size() < 2) {
        TWxTestResult result;
        result.PValue = 0.5;
        result.WMinus = result.WPlus = 0;
        return result;
    }

    // Rank by magnitude.
    Sort(diffs.begin(), diffs.end(), [](double x, double y) {
        return Abs(x) < Abs(y);
    });

    double w_plus = 0;
    double w_minus = 0;
    const int count = static_cast<int>(diffs.size());
    const double n = diffs.size();

    for (int i = 0; i < count;) {
        // A tie group is a run of equal MAGNITUDES. Comparing the signed values instead
        // would split +d and -d into separate groups whose ranks depend on the
        // (unspecified) order the sort leaves them in.
        const double tiedMagnitude = Abs(diffs[i]);
        double rankSum = 0;
        double groupSize = 0;
        double signPlus = 0;
        double signMinus = 0;

        int j = i;
        for (; j < count && Abs(diffs[j]) == tiedMagnitude; ++j) {
            rankSum += (j + 1);
            ++groupSize;
            signPlus += diffs[j] >= 0;
            signMinus += diffs[j] < 0;
        }

        // Every member of the group receives the mean of the ranks the group occupies.
        const double meanRank = rankSum / groupSize;
        w_plus += signPlus * meanRank;
        w_minus += signMinus * meanRank;
        i = j;
    }

    TWxTestResult result;
    result.WPlus = w_plus;
    result.WMinus = w_minus;

    const double w = result.WPlus - result.WMinus;
    if (n > 16) {
        // Normal approximation for larger samples.
        double z = w / sqrt(n * (n + 1) * (2 * n + 1) * 1.0 / 6);
        result.PValue = 2 * (1.0 - NormalCDF(Abs(z)));
    } else {
        result.PValue = 2 * CalcLevelOfSignificanceWXMPSR(Abs(w), count);
    }
    result.PValue = 1.0 - result.PValue;
    return result;
}
// Returns the begin iterator of the wrapped m_vector.
iterator begin()
{
    return m_vector.begin();
}
// Writes every element of v to cout, each preceded by a single space, followed by
// a line break. The vector is taken by value, as before.
void print(TVector<T> v){
    // The iterator type follows the element type T; a fixed TVector<int>::Iterator
    // only matches when T happens to be int.
    for (typename TVector<T>::Iterator it = v.begin(); it != v.end(); ++it)
        cout << ' ' << *it;
    cout << endl;
};
// Replaces the stored prediction types with a copy of predictionTypes_.
void TEvalResult::SetPredictionTypes(const TVector<EPredictionType>& predictionTypes_) {
    // assign() already discards the previous contents, so no clear() is needed first.
    PredictionTypes.assign(predictionTypes_.begin(), predictionTypes_.end());
}
// Select the best matching function for 'call' from 'candidateList'. // // Assumptions // // There is no exact match, so a selection algorithm needs to run. That is, the // language-specific handler should check for exact match first, to // decide what to do, before calling this selector. // // Input // // * list of candidate signatures to select from // * the call // * a predicate function convertible(from, to) that says whether or not type // 'from' can implicitly convert to type 'to' (it includes the case of what // the calling language would consider a matching type with no conversion // needed) // * a predicate function better(from1, from2, to1, to2) that says whether or // not a conversion from <-> to2 is considered better than a conversion // from <-> to1 (both in and out directions need testing, as declared by the // formal parameter) // // Output // // * best matching candidate (or none, if no viable candidates found) // * whether there was a tie for the best match (ambiguous overload selection, // caller's choice for how to report) // const TFunction* TParseContextBase::selectFunction( const TVector<const TFunction*> candidateList, const TFunction& call, std::function<bool(const TType& from, const TType& to, TOperator op, int arg)> convertible, std::function<bool(const TType& from, const TType& to1, const TType& to2)> better, /* output */ bool& tie) { // // Operation // // 1. Prune the input list of candidates down to a list of viable candidates, // where each viable candidate has // // * at least as many parameters as there are calling arguments, with any // remaining parameters being optional or having default values // * each parameter is true under convertible(A, B), where A is the calling // type for in and B is the formal type, and in addition, for out B is the // calling type and A is the formal type // // 2. If there are no viable candidates, return with no match. // // 3. If there is only one viable candidate, it is the best match. // // 4. 
If there are multiple viable candidates, select the first viable candidate // as the incumbent. Compare the incumbent to the next viable candidate, and if // that candidate is better (bullets below), make it the incumbent. Repeat, with // a linear walk through the viable candidate list. The final incumbent will be // returned as the best match. A viable candidate is better than the incumbent if // // * it has a function argument with a better(...) conversion than the incumbent, // for all directions needed by in and out // * the incumbent has no argument with a better(...) conversion then the // candidate, for either in or out (as needed) // // 5. Check for ambiguity by comparing the best match against all other viable // candidates. If any other viable candidate has a function argument with a // better(...) conversion than the best candidate (for either in or out // directions), return that there was a tie for best. // tie = false; // 1. prune to viable... TVector<const TFunction*> viableCandidates; for (auto it = candidateList.begin(); it != candidateList.end(); ++it) { const TFunction& candidate = *(*it); // to even be a potential match, number of arguments must be >= the number of // fixed (non-default) parameters, and <= the total (including parameter with defaults). if (call.getParamCount() < candidate.getFixedParamCount() || call.getParamCount() > candidate.getParamCount()) continue; // see if arguments are convertible bool viable = true; // The call can have fewer parameters than the candidate, if some have defaults. const int paramCount = std::min(call.getParamCount(), candidate.getParamCount()); for (int param = 0; param < paramCount; ++param) { if (candidate[param].type->getQualifier().isParamInput()) { if (! convertible(*call[param].type, *candidate[param].type, candidate.getBuiltInOp(), param)) { viable = false; break; } } if (candidate[param].type->getQualifier().isParamOutput()) { if (! 
convertible(*candidate[param].type, *call[param].type, candidate.getBuiltInOp(), param)) { viable = false; break; } } } if (viable) viableCandidates.push_back(&candidate); } // 2. none viable... if (viableCandidates.size() == 0) return nullptr; // 3. only one viable... if (viableCandidates.size() == 1) return viableCandidates.front(); // 4. find best... const auto betterParam = [&call, &better](const TFunction& can1, const TFunction& can2) -> bool { // is call -> can2 better than call -> can1 for any parameter bool hasBetterParam = false; for (int param = 0; param < call.getParamCount(); ++param) { if (better(*call[param].type, *can1[param].type, *can2[param].type)) { hasBetterParam = true; break; } } return hasBetterParam; }; const auto equivalentParams = [&call, &better](const TFunction& can1, const TFunction& can2) -> bool { // is call -> can2 equivalent to call -> can1 for all the call parameters? for (int param = 0; param < call.getParamCount(); ++param) { if (better(*call[param].type, *can1[param].type, *can2[param].type) || better(*call[param].type, *can2[param].type, *can1[param].type)) return false; } return true; }; const TFunction* incumbent = viableCandidates.front(); for (auto it = viableCandidates.begin() + 1; it != viableCandidates.end(); ++it) { const TFunction& candidate = *(*it); if (betterParam(*incumbent, candidate) && ! betterParam(candidate, *incumbent)) incumbent = &candidate; } // 5. ambiguity... for (auto it = viableCandidates.begin(); it != viableCandidates.end(); ++it) { if (incumbent == *it) continue; const TFunction& candidate = *(*it); // In the case of default parameters, it may have an identical initial set, which is // also ambiguous if (betterParam(*incumbent, candidate) || equivalentParams(*incumbent, candidate)) tie = true; } return incumbent; }
static void CalcShapValuesRecursive(const TObliviousTrees& forest, const TVector<int>& binFeaturesMapping, const TVector<ui8>& binFeaturesValues, size_t treeIdx, int depth, const TVector<TVector<size_t>>& subtreeSizes, int dimension, size_t nodeIdx, const TVector<TFeaturePathElement>& oldFeaturePath, double zeroPathsFraction, double onePathsFraction, int feature, TVector<double>* shapValuesPtr) { TVector<double>& shapValues = *shapValuesPtr; TVector<TFeaturePathElement> featurePath = ExtendFeaturePath(oldFeaturePath, zeroPathsFraction, onePathsFraction, feature); if (depth == forest.TreeSizes[treeIdx]) { for (size_t elementIdx = 1; elementIdx < featurePath.size(); ++elementIdx) { TVector<TFeaturePathElement> unwoundPath = UnwindFeaturePath(featurePath, elementIdx); double weightSum = 0.0; for (const TFeaturePathElement& unwoundPathElement : unwoundPath) { weightSum += unwoundPathElement.Weight; } const TFeaturePathElement& element = featurePath[elementIdx]; const int approxDimension = forest.ApproxDimension; shapValues[element.Feature] += weightSum * (element.OnePathsFraction - element.ZeroPathsFraction) * forest.LeafValues[treeIdx][nodeIdx * approxDimension + dimension]; } } else { const TRepackedBin& split = forest.GetRepackedBins()[forest.TreeStartOffsets[treeIdx] + depth]; const int splitBinFeature = split.FeatureIndex; const int splitFlatFeature = binFeaturesMapping[splitBinFeature]; const ui8 threshold = split.SplitIdx; const ui8 xorMask = split.XorMask; double newZeroPathsFraction = 1.0; double newOnePathsFraction = 1.0; const auto sameFeatureElement = FindIf(featurePath.begin(), featurePath.end(), [splitFlatFeature](const TFeaturePathElement& element) {return element.Feature == splitFlatFeature;}); if (sameFeatureElement != featurePath.end()) { const size_t sameFeatureIndex = sameFeatureElement - featurePath.begin(); newZeroPathsFraction = featurePath[sameFeatureIndex].ZeroPathsFraction; newOnePathsFraction = featurePath[sameFeatureIndex].OnePathsFraction; 
featurePath = UnwindFeaturePath(featurePath, sameFeatureIndex); } const size_t goNodeIdx = nodeIdx | (((binFeaturesValues[splitBinFeature] ^ xorMask) >= threshold) << depth); const size_t skipNodeIdx = goNodeIdx ^ (1 << depth); if (subtreeSizes[depth + 1][goNodeIdx] > 0) { CalcShapValuesRecursive(forest, binFeaturesMapping, binFeaturesValues, treeIdx, depth + 1, subtreeSizes, dimension, goNodeIdx, featurePath, newZeroPathsFraction * subtreeSizes[depth + 1][goNodeIdx] / subtreeSizes[depth][nodeIdx], newOnePathsFraction, splitFlatFeature, &shapValues); } if (subtreeSizes[depth + 1][skipNodeIdx] > 0) { CalcShapValuesRecursive(forest, binFeaturesMapping, binFeaturesValues, treeIdx, depth + 1, subtreeSizes, dimension, skipNodeIdx, featurePath, newZeroPathsFraction * subtreeSizes[depth + 1][skipNodeIdx] / subtreeSizes[depth][nodeIdx], /*onePathFraction*/ 0, splitFlatFeature, &shapValues); } } }
// Returns a non-owning, read-only view over all of Keys.
TConstArrayRef<ui64> GetKeys() const {
    return TConstArrayRef<ui64>(
        Keys.begin(),
        Keys.end());
}