// Convenience overload of PrepareEval for callers that only have a thread count.
//
// Creates a local executor, starts `threadCount - 1` additional threads on it and
// forwards to the executor-based PrepareEval overload. The executor lives only for
// the duration of this call.
TVector<TVector<double>> PrepareEval(const EPredictionType predictionType,
                                     const TVector<TVector<double>>& approx,
                                     int threadCount) {
    NPar::TLocalExecutor localExecutor;
    const int additionalThreads = threadCount - 1;
    localExecutor.RunAdditionalThreads(additionalThreads);
    return PrepareEval(predictionType, approx, &localExecutor);
}
// R entry point: converts raw approximations into the requested prediction type.
//
// approxParam      - numeric R object carrying a dim attribute; its values are read
//                    sequentially, row by row, with the `columnCount` values of one
//                    row adjacent to each other.
// typeParam        - R string that must parse into an EPredictionType.
// columnCountParam - number of approximation dimensions per row; must be positive.
// threadCountParam - thread count; `threadCount - 1` additional threads are started.
//
// Returns a REALSXP vector holding `prediction.size() * dataRows` values written in
// the same row-by-row order. Failures are raised through CB_ENSURE inside the
// R_API_BEGIN/R_API_END region.
SEXP CatBoostPrepareEval_R(SEXP approxParam, SEXP typeParam, SEXP columnCountParam, SEXP threadCountParam) {
    SEXP result = NULL;
    R_API_BEGIN();
    // The column count is used both as a divisor and as a container size, so a zero,
    // negative or NA value must be rejected before it is used.
    const int columnCount = asInteger(columnCountParam);
    CB_ENSURE(columnCount > 0, "column count should be positive");

    SEXP dataDim = getAttrib(approxParam, R_DimSymbol);
    size_t dataRows = static_cast<size_t>(INTEGER(dataDim)[0]) / static_cast<size_t>(columnCount);

    // Re-pack the flat R buffer into one vector per dimension.
    TVector<TVector<double>> prediction(columnCount, TVector<double>(dataRows));
    const double* approxData = REAL(approxParam);
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            prediction[j][i] = approxData[k++];
        }
    }

    NPar::TLocalExecutor executor;
    executor.RunAdditionalThreads(asInteger(threadCountParam) - 1);

    EPredictionType predictionType;
    CB_ENSURE(TryFromString<EPredictionType>(CHAR(asChar(typeParam)), predictionType),
              "unsupported prediction type: 'Probability', 'Class' or 'RawFormulaVal' was expected");
    prediction = PrepareEval(predictionType, prediction, &executor);

    // The number of output dimensions is taken from the converted prediction, not
    // from the input column count.
    size_t predictionSize = prediction.size() * dataRows;
    result = PROTECT(allocVector(REALSXP, predictionSize));
    double* resultData = REAL(result);
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            resultData[k++] = prediction[j][i];
        }
    }
    R_API_END();
    UNPROTECT(1);
    return result;
}
// Computes SHAP values for every document of `pool` in a single block.
//
// Starts `threadCount - 1` additional threads on a local executor and delegates to
// CalcShapValuesForDocumentBlock over the range [0, document count).
TVector<TVector<double>> CalcShapValues(const TFullModel& model,
                                        const TPool& pool,
                                        int threadCount,
                                        int dimension) {
    NPar::TLocalExecutor executor;
    executor.RunAdditionalThreads(threadCount - 1);
    return CalcShapValuesForDocumentBlock(
        model,
        pool,
        /*start*/ 0,
        /*end*/ pool.Docs.GetDocCount(),
        executor,
        dimension);
}
// Computes SHAP values for the documents [start, end) of `pool`.
//
// Returns one row per document of the block. Each row has `flatFeatureCount + 1`
// entries: the per-feature slots are filled by CalcShapValuesRecursive, and the last
// slot accumulates CalcMeanValueForTree over all trees.
// Fails via CB_ENSURE when the model uses complex Ctr features.
static TVector<TVector<double>> CalcShapValuesForDocumentBlock(const TFullModel& model,
                                                               const TPool& pool,
                                                               size_t start,
                                                               size_t end,
                                                               NPar::TLocalExecutor& localExecutor,
                                                               int dimension) {
    CB_ENSURE(!HasComplexCtrs(model.ObliviousTrees), "Model uses complex Ctr features. This is not allowed for SHAP values calculation");

    const TObliviousTrees& forest = model.ObliviousTrees;
    const size_t documentCount = end - start;

    // Binarize the block, regroup the values per document, then drop the flat copy's contents.
    TVector<ui8> allBinarizedFeatures = BinarizeFeatures(model, pool, start, end);
    TVector<TVector<ui8>> binarizedFeaturesByDocument = TransposeBinarizedFeatures(allBinarizedFeatures, documentCount);
    allBinarizedFeatures.clear();

    const int flatFeatureCount = pool.Docs.GetFactorsCount();
    TVector<int> binFeaturesMapping = MapFeatures(forest);
    TVector<TVector<double>> shapValues(documentCount, TVector<double>(flatFeatureCount + 1, 0.0));

    const auto processDocument = [&] (int documentIdx) {
        auto& documentFeatures = binarizedFeaturesByDocument[documentIdx];
        TVector<double>& documentShapValues = shapValues[documentIdx];

        const size_t treeCount = forest.GetTreeCount();
        for (size_t treeIdx = 0; treeIdx != treeCount; ++treeIdx) {
            // NOTE(review): subtree sizes depend only on (forest, treeIdx) but are rebuilt
            // for every document; caching them would trade memory for time.
            TVector<TVector<size_t>> subtreeSizes = CalcSubtreeSizesForTree(forest, treeIdx);
            TVector<TFeaturePathElement> initialFeaturePath;
            CalcShapValuesRecursive(
                forest,
                binFeaturesMapping,
                documentFeatures,
                treeIdx,
                /*depth*/ 0,
                subtreeSizes,
                dimension,
                /*nodeIdx*/ 0,
                initialFeaturePath,
                /*zeroPathFraction*/ 1,
                /*onePathFraction*/ 1,
                /*feature*/ -1,
                &documentShapValues);

            // The extra trailing slot accumulates the mean value of each tree.
            documentShapValues[flatFeatureCount] += CalcMeanValueForTree(forest, subtreeSizes, treeIdx, dimension);
        }
    };

    NPar::TLocalExecutor::TExecRangeParams blockParams(0, documentCount);
    localExecutor.ExecRange(processDocument, blockParams, NPar::TLocalExecutor::WAIT_COMPLETE);

    return shapValues;
}
void CalcAndOutputShapValues(const TFullModel& model, const TPool& pool, const TString& outputPath, int threadCount, int dimension) { const size_t documentCount = pool.Docs.GetDocCount(); NPar::TLocalExecutor localExecutor; localExecutor.RunAdditionalThreads(threadCount - 1); TFileOutput out(outputPath); const size_t documentBlockSize = CB_THREAD_LIMIT; // least necessary for threading for (size_t start = 0; start < documentCount; start += documentBlockSize) { size_t end = Min(start + documentBlockSize, pool.Docs.GetDocCount()); TVector<TVector<double>> shapValues = CalcShapValuesForDocumentBlock(model, pool, start, end, localExecutor, dimension); OutputShapValues(shapValues, out); } }
static inline void BinarizeFloatFeature(int featureIdx, const TDocumentStorage& docStorage, const TDocSelector& docSelector, const TVector<float>& borders, ENanMode nanMode, NPar::TLocalExecutor& localExecutor, int floatFeatureIdx, TAllFeatures* features, bool* seenNans) { size_t docCount = docSelector.GetDocCount(); const TVector<float>& src = docStorage.Factors[featureIdx]; TVector<ui8>& hist = features->FloatHistograms[floatFeatureIdx]; hist.resize(docCount); ui8* histData = hist.data(); const float* featureBorderData = borders.data(); const int featureBorderSize = borders.ysize(); localExecutor.ExecRange([&] (int i) { const auto& featureVal = src[docSelector(i)]; if (IsNan(featureVal)) { *seenNans = true; histData[i] = nanMode == ENanMode::Min ? 0 : featureBorderSize; } else { int j = 0; while (j < featureBorderSize && featureVal > featureBorderData[j]) { ++histData[i]; ++j; } // histData[i] = LowerBound(featureBorderData, featureBorderData + featureBorderSize, featureVal) - featureBorderData; } } , NPar::TLocalExecutor::TExecRangeParams(0, docCount).SetBlockSize(1000) , NPar::TLocalExecutor::WAIT_COMPLETE); }
void TEvalResult::PostProcess(int threadCount) { NPar::TLocalExecutor executor; executor.RunAdditionalThreads(threadCount - 1); PostProcess(&executor); }
void TDataProviderBuilder::Finish() { CB_ENSURE(!IsDone, "Error: can't finish more than once"); DataProvider.Features.reserve(FeatureValues.size()); DataProvider.Order.resize(DataProvider.Targets.size()); std::iota(DataProvider.Order.begin(), DataProvider.Order.end(), 0); if (!AreEqualTo<ui64>(DataProvider.Timestamp, 0)) { ShuffleFlag = false; DataProvider.Order = CreateOrderByKey(DataProvider.Timestamp); } bool hasQueryIds = HasQueryIds(DataProvider.QueryIds); if (!hasQueryIds) { DataProvider.QueryIds.resize(0); } //TODO(noxoomo): it's not safe here, if we change order with shuffle everything'll go wrong if (Pairs.size()) { //they are local, so we don't need shuffle CB_ENSURE(hasQueryIds, "Error: for GPU pairwise learning you should provide query id column. Query ids will be used to split data between devices and for dynamic boosting learning scheme."); DataProvider.FillQueryPairs(Pairs); } if (ShuffleFlag) { if (hasQueryIds) { //should not change order inside query for pairs consistency QueryConsistentShuffle(Seed, 1, DataProvider.QueryIds, &DataProvider.Order); } else { Shuffle(Seed, 1, DataProvider.Targets.size(), &DataProvider.Order); } DataProvider.SetShuffleSeed(Seed); } if (ShuffleFlag || !DataProvider.Timestamp.empty()) { DataProvider.ApplyOrderToMetaColumns(); } TVector<TString> featureNames; featureNames.resize(FeatureValues.size()); TAdaptiveLock lock; NPar::TLocalExecutor executor; executor.RunAdditionalThreads(BuildThreads - 1); TVector<TFeatureColumnPtr> featureColumns(FeatureValues.size()); if (!IsTest) { RegisterFeaturesInFeatureManager(featureColumns); } TVector<TVector<float>> grid; grid.resize(FeatureValues.size()); NPar::ParallelFor(executor, 0, FeatureValues.size(), [&](ui32 featureId) { auto featureName = GetFeatureName(featureId); featureNames[featureId] = featureName; if (FeatureValues[featureId].size() == 0) { return; } TVector<float> line(DataProvider.Order.size()); for (ui32 i = 0; i < DataProvider.Order.size(); ++i) { line[i] = 
FeatureValues[featureId][DataProvider.Order[i]]; } if (CatFeatureIds.has(featureId)) { static_assert(sizeof(float) == sizeof(ui32), "Error: float size should be equal to ui32 size"); const bool shouldSkip = IsTest && (CatFeaturesPerfectHashHelper.GetUniqueValues(featureId) == 0); if (!shouldSkip) { auto data = CatFeaturesPerfectHashHelper.UpdatePerfectHashAndBinarize(featureId, ~line, line.size()); const ui32 uniqueValues = CatFeaturesPerfectHashHelper.GetUniqueValues(featureId); if (uniqueValues > 1) { auto compressedData = CompressVector<ui64>(~data, line.size(), IntLog2(uniqueValues)); featureColumns[featureId] = MakeHolder<TCatFeatureValuesHolder>(featureId, line.size(), std::move(compressedData), uniqueValues, featureName); } } } else { auto floatFeature = MakeHolder<TFloatValuesHolder>(featureId, std::move(line), featureName); TVector<float>& borders = grid[featureId]; ENanMode nanMode = ENanMode::Forbidden; { TGuard<TAdaptiveLock> guard(lock); nanMode = FeaturesManager.GetOrCreateNanMode(*floatFeature); } if (FeaturesManager.HasFloatFeatureBorders(*floatFeature)) { borders = FeaturesManager.GetFloatFeatureBorders(*floatFeature); } if (borders.empty() && !IsTest) { const auto& floatValues = floatFeature->GetValues(); NCatboostOptions::TBinarizationOptions config = FeaturesManager.GetFloatFeatureBinarization(); config.NanMode = nanMode; borders = BuildBorders(floatValues, floatFeature->GetId(), config); } if (borders.ysize() == 0) { MATRIXNET_DEBUG_LOG << "Float Feature #" << featureId << " is empty" << Endl; return; } auto binarizedData = BinarizeLine(floatFeature->GetValues().data(), floatFeature->GetValues().size(), nanMode, borders); const int binCount = static_cast<const int>(borders.size() + 1 + (ENanMode::Forbidden != nanMode)); auto compressedLine = CompressVector<ui64>(binarizedData, IntLog2(binCount)); featureColumns[featureId] = MakeHolder<TBinarizedFloatValuesHolder>(featureId, floatFeature->GetValues().size(), nanMode, borders, 
std::move(compressedLine), featureName); } //Free memory { auto emptyVec = TVector<float>(); FeatureValues[featureId].swap(emptyVec); } }); for (ui32 featureId = 0; featureId < featureColumns.size(); ++featureId) { if (CatFeatureIds.has(featureId)) { if (featureColumns[featureId] == nullptr && (!IsTest)) { MATRIXNET_DEBUG_LOG << "Cat Feature #" << featureId << " is empty" << Endl; } } else if (featureColumns[featureId] != nullptr) { if (!FeaturesManager.HasFloatFeatureBordersForDataProviderFeature(featureId)) { FeaturesManager.SetFloatFeatureBordersForDataProviderId(featureId, std::move(grid[featureId])); } } if (featureColumns[featureId] != nullptr) { DataProvider.Features.push_back(std::move(featureColumns[featureId])); } } DataProvider.BuildIndicesRemap(); if (!IsTest) { TOnCpuGridBuilderFactory gridBuilderFactory; FeaturesManager.SetTargetBorders(TBordersBuilder(gridBuilderFactory, DataProvider.GetTargets())(FeaturesManager.GetTargetBinarizationDescription())); } DataProvider.FeatureNames = featureNames; DataProvider.CatFeatureIds = CatFeatureIds; if (ClassesWeights.size()) { Reweight(DataProvider.Targets, ClassesWeights, &DataProvider.Weights); } IsDone = true; }