// Ranks the features used by `model` by their effect, measured on the documents of `pool`.
//
// Returns (effect, feature) pairs ordered from the largest effect to the smallest. A model
// without trees yields an empty vector. Fails (CB_ENSURE) when the pool has no documents or no
// features. `threadCount` is written to "system_options"/"thread_count" of the params read from
// the model info before the TCommonContext is constructed from them.
TVector<std::pair<double, TFeature>> CalcFeatureEffect(const TFullModel& model, const TPool& pool, int threadCount/*= 1*/) {
    CB_ENSURE(pool.Docs.GetDocCount() != 0, "Pool should not be empty");
    if (model.GetTreeCount() == 0) {
        return {};
    }

    int featureCount = pool.Docs.GetFactorsCount();
    NJson::TJsonValue jsonParams = ReadTJsonValue(model.ModelInfo.at("params"));
    jsonParams["system_options"].InsertValue("thread_count", threadCount);
    TCommonContext ctx(jsonParams, Nothing(), Nothing(), featureCount, pool.CatFeatures, pool.FeatureId);

    CB_ENSURE(model.GetTreeCount() != 0, "model should not be empty");
    CB_ENSURE(pool.Docs.GetFactorsCount() > 0, "no features in pool");

    TVector<TFeature> features;
    TVector<TMxTree> trees = BuildMatrixnetTrees(model, &features);
    TVector<TVector<ui64>> leavesStatistics = CollectLeavesStatistics(pool, model);
    const TVector<double> effect = CalcEffect(trees, leavesStatistics);

    // Pair every effect with its position so the matching feature can be found after sorting.
    TVector<std::pair<double, int>> rankedEffect;
    int effectIdx = 0;
    for (const double featureEffect : effect) {
        rankedEffect.emplace_back(featureEffect, effectIdx);
        ++effectIdx;
    }
    Sort(rankedEffect.begin(), rankedEffect.end(), std::greater<std::pair<double, int>>());

    TVector<std::pair<double, TFeature>> result;
    for (const auto& scoreAndIdx : rankedEffect) {
        result.emplace_back(scoreAndIdx.first, features[scoreAndIdx.second]);
    }
    return result;
}
int mode_fit(const int argc, const char* argv[]) { ConfigureMalloc(); NCatboostOptions::TPoolLoadParams poolLoadOptions; TString paramsFile; NJson::TJsonValue catBoostFlatJsonOptions; ParseCommandLine(argc, argv, &catBoostFlatJsonOptions, ¶msFile, &poolLoadOptions); NJson::TJsonValue catBoostJsonOptions; NJson::TJsonValue outputOptionsJson; if (!paramsFile.empty()) { CB_ENSURE(NFs::Exists(paramsFile), "Params file does not exists " << paramsFile); TIFStream in(paramsFile); NJson::TJsonValue fromFileParams; CB_ENSURE(NJson::ReadJsonTree(&in, &fromFileParams), "can't parse params file"); NCatboostOptions::PlainJsonToOptions(fromFileParams, &catBoostJsonOptions, &outputOptionsJson); } NCatboostOptions::PlainJsonToOptions(catBoostFlatJsonOptions, &catBoostJsonOptions, &outputOptionsJson); poolLoadOptions.Validate(); auto taskType = NCatboostOptions::GetTaskType(catBoostJsonOptions); THolder<IModelTrainer> modelTrainerHolder; NCatboostOptions::TOutputFilesOptions outputOptions(taskType); outputOptions.Load(outputOptionsJson); const bool isGpuDeviceType = taskType == ETaskType::GPU; if (isGpuDeviceType && TTrainerFactory::Has(ETaskType::GPU)) { modelTrainerHolder = TTrainerFactory::Construct(ETaskType::GPU); } else { CB_ENSURE(!isGpuDeviceType, "GPU Device not found."); modelTrainerHolder = TTrainerFactory::Construct(ETaskType::CPU); } modelTrainerHolder->TrainModel(poolLoadOptions, outputOptions, catBoostJsonOptions); return 0; }
// Translates `featureId` from the model features map into an id of `featuresManager`.
//
// The id is looked up among ctrs, then float features, then categorical features:
//  * ctr: the ctr is migrated; a ctr already known to the manager must have the same borders as
//    the stored ones, an unknown ctr is added to the manager with the stored borders;
//  * float feature: the manager's borders must equal the stored ones;
//  * categorical feature: the manager id is resolved from the data provider id.
// Throws yexception when the id is present in none of the three maps.
ui32 NCatboostCuda::UpdateFeatureId(TBinarizedFeaturesManager& featuresManager, const TModelFeaturesMap& map, const ui32 featureId) {
    if (map.Ctrs.has(featureId)) {
        const auto& info = map.Ctrs.at(featureId);
        TCtr remapedCtr = MigrateCtr(featuresManager, map, info.Ctr);
        if (!featuresManager.IsKnown(remapedCtr)) {
            return featuresManager.AddCtr(remapedCtr, TVector<float>(info.Borders));
        }
        ui32 remappedId = featuresManager.GetId(remapedCtr);
        CB_ENSURE(info.Borders == featuresManager.GetBorders(remappedId),
                  " tensor : " << remapedCtr.FeatureTensor << " (ctr type " << remapedCtr.Configuration.Type << "). Error: progress borders should be consistent: " << remappedId << " / " << featureId << " " << Print(info.Borders) << " vs " << Print(featuresManager.GetBorders(remappedId)));
        return remappedId;
    }

    if (map.FloatFeatures.has(featureId)) {
        auto& floatEntry = map.FloatFeatures.at(featureId);
        const ui32 managerId = featuresManager.GetFeatureManagerIdForFloatFeature(floatEntry.DataProviderId);
        CB_ENSURE(floatEntry.Borders == featuresManager.GetBorders(managerId), "Error: progress borders should be consistent");
        return managerId;
    }

    if (map.CatFeaturesMap.has(featureId)) {
        const ui32 providerId = map.CatFeaturesMap.at(featureId);
        return featuresManager.GetFeatureManagerIdForCatFeature(providerId);
    }

    ythrow yexception() << "Error: can't remap featureId #" << featureId;
}
// Estimates ctr priors for the categorical features of `dataProvider` and stores them in
// `options.PerFeatureCtrs`.
//
// `options` must be the very object returned by featureManager.GetCatFeatureOptions() (checked
// by address). Nothing is done when the feature manager has more than one target border.
// Otherwise the target is binarized and the categorical features present in the data provider
// are processed with NPar::ParallelFor:
//  * under `lock`, a feature without per-feature ctrs receives a copy of options.SimpleCtrs when
//    the simple ctrs need prior estimation;
//  * for a feature that has per-feature ctrs needing estimation, every description of type
//    Borders (with options.TargetBorders->BorderCount == 1) gets Priors set to
//    {Alpha, Alpha + Beta} of the estimated beta prior; any other description must have
//    PriorEstimation == No, otherwise CB_ENSURE fails;
//  * the updated descriptions are written back to options.PerFeatureCtrs under `lock`.
// NOTE(review): the inner loop index `i` shadows the lambda parameter `i`.
static inline void EstimatePriors(const TDataProvider& dataProvider, TBinarizedFeaturesManager& featureManager, NCatboostOptions::TCatFeatureParams& options) { CB_ENSURE(&(featureManager.GetCatFeatureOptions()) == &options, "Error: for consistent catFeature options should be equal to one in feature manager"); bool needSimpleCtrsPriorEstimation = NeedPriorEstimation(options.SimpleCtrs); const auto& borders = featureManager.GetTargetBorders(); if (borders.size() > 1) { return; } auto binarizedTarget = BinarizeLine<ui8>(dataProvider.GetTargets().data(), dataProvider.GetTargets().size(), ENanMode::Forbidden, borders); TVector<int> catFeatureIds(dataProvider.GetCatFeatureIds().begin(), dataProvider.GetCatFeatureIds().end()); TAdaptiveLock lock; //TODO(noxoomo): locks here are ugly and error prone NPar::ParallelFor(0, catFeatureIds.size(), [&](int i) { ui32 catFeature = catFeatureIds[i]; if (!dataProvider.HasFeatureId(catFeature)) { return; } const ICatFeatureValuesHolder& catFeatureValues = dynamic_cast<const ICatFeatureValuesHolder&>(dataProvider.GetFeatureById(catFeature)); bool hasPerFeatureCtr = false; with_lock (lock) { if (needSimpleCtrsPriorEstimation && !options.PerFeatureCtrs->has(catFeature)) { options.PerFeatureCtrs.Get()[catFeature] = options.SimpleCtrs; } hasPerFeatureCtr = options.PerFeatureCtrs->has(catFeature); } if (hasPerFeatureCtr) { TVector<NCatboostOptions::TCtrDescription> currentFeatureDescription; with_lock (lock) { currentFeatureDescription = options.PerFeatureCtrs->at(catFeature); } if (!NeedPriorEstimation(currentFeatureDescription)) { return; } auto values = catFeatureValues.ExtractValues(); for (ui32 i = 0; i < currentFeatureDescription.size(); ++i) { if (currentFeatureDescription[i].Type == ECtrType::Borders && options.TargetBorders->BorderCount == 1u) { TBetaPriorEstimator::TBetaPrior prior = TBetaPriorEstimator::EstimateBetaPrior(binarizedTarget.data(), values.data(), values.size(), catFeatureValues.GetUniqueValues()); MATRIXNET_INFO_LOG 
<< "Estimate borders-ctr prior for feature #" << catFeature << ": " << prior.Alpha << " / " << prior.Beta << Endl; currentFeatureDescription[i].Priors = {{(float)prior.Alpha, (float)(prior.Alpha + prior.Beta)}}; } else { CB_ENSURE(currentFeatureDescription[i].PriorEstimation == EPriorEstimation::No, "Error: auto prior estimation is not available for ctr type " << currentFeatureDescription[i].Type); } } with_lock (lock) { options.PerFeatureCtrs.Get()[catFeature] = currentFeatureDescription; } } });
// R entry point: converts raw approx values into the requested prediction type.
//
// approxParam      - numeric R object with a dim attribute holding the raw values; the values
//                    are read sequentially, `columnCount` values per row.
// typeParam        - prediction type name, parsed into EPredictionType.
// columnCountParam - number of values per row; must be a positive integer.
// threadCountParam - total number of threads (threadCount - 1 additional threads are started).
//
// Returns a REALSXP vector with the prepared predictions, laid out row by row. Errors raised
// inside the R_API_BEGIN/R_API_END section are handled by those macros.
SEXP CatBoostPrepareEval_R(SEXP approxParam, SEXP typeParam, SEXP columnCountParam, SEXP threadCountParam) {
    SEXP result = NULL;
    R_API_BEGIN();
    // Validate before dividing: a zero (or NA / negative) column count would otherwise cause an
    // integer division by zero or a nonsensical row count.
    const int columnCount = asInteger(columnCountParam);
    CB_ENSURE(columnCount > 0, "column count should be a positive integer");

    SEXP dataDim = getAttrib(approxParam, R_DimSymbol);
    size_t dataRows = static_cast<size_t>(INTEGER(dataDim)[0]) / columnCount;
    TVector<TVector<double>> prediction(columnCount, TVector<double>(dataRows));
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            prediction[j][i] = static_cast<double>(REAL(approxParam)[k++]);
        }
    }

    NPar::TLocalExecutor executor;
    executor.RunAdditionalThreads(asInteger(threadCountParam) - 1);
    EPredictionType predictionType;
    CB_ENSURE(TryFromString<EPredictionType>(CHAR(asChar(typeParam)), predictionType),
              "unsupported prediction type: 'Probability', 'Class' or 'RawFormulaVal' was expected");
    prediction = PrepareEval(predictionType, prediction, &executor);

    // The number of output dimensions is taken from the prepared prediction, not from the input.
    size_t predictionSize = prediction.size() * dataRows;
    result = PROTECT(allocVector(REALSXP, predictionSize));
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            REAL(result)[k++] = prediction[j][i];
        }
    }
    R_API_END();
    UNPROTECT(1);
    return result;
}
// R entry point: applies a model to a pool and returns the multi-dimensional prediction.
//
// modelParam / poolParam are R external pointers to the model and the pool. typeParam names the
// prediction type; treeCountStartParam / treeCountEndParam and threadCountParam are forwarded to
// ApplyModelMulti as integers, verboseParam as a logical.
//
// Returns a REALSXP vector in which the values of one document are stored consecutively (one
// value per prediction dimension), document after document.
SEXP CatBoostPredictMulti_R(SEXP modelParam, SEXP poolParam, SEXP verboseParam, SEXP typeParam, SEXP treeCountStartParam, SEXP treeCountEndParam, SEXP threadCountParam) {
    SEXP result = NULL;
    R_API_BEGIN();
    TFullModelHandle model = reinterpret_cast<TFullModelHandle>(R_ExternalPtrAddr(modelParam));
    TPoolHandle pool = reinterpret_cast<TPoolHandle>(R_ExternalPtrAddr(poolParam));

    EPredictionType predictionType;
    CB_ENSURE(TryFromString<EPredictionType>(CHAR(asChar(typeParam)), predictionType),
              "unsupported prediction type: 'Probability', 'Class' or 'RawFormulaVal' was expected");

    TVector<TVector<double>> prediction = ApplyModelMulti(*model,
                                                          *pool,
                                                          asLogical(verboseParam),
                                                          predictionType,
                                                          asInteger(treeCountStartParam),
                                                          asInteger(treeCountEndParam),
                                                          asInteger(threadCountParam));

    const size_t docCount = pool->Docs.GetDocCount();
    const size_t dimensionCount = prediction.size();
    result = PROTECT(allocVector(REALSXP, dimensionCount * docCount));

    size_t outIdx = 0;
    for (size_t docIdx = 0; docIdx < docCount; ++docIdx) {
        for (size_t dim = 0; dim < dimensionCount; ++dim) {
            REAL(result)[outIdx] = prediction[dim][docIdx];
            ++outIdx;
        }
    }
    R_API_END();
    UNPROTECT(1);
    return result;
}
// Builds a copy of `oldFeaturePath` that is one element shorter, with the entry at
// `eraseElementIdx` removed (presumably the "unwind" step of the TreeSHAP algorithm - confirm).
//
// Feature and the zero/one path fractions of the elements after the erased one are shifted one
// position towards the beginning; the weights of all remaining elements are recomputed from the
// fractions of the erased element. The path must be non-empty (CB_ENSURE).
// `eraseElementIdx` is not range-checked here.
static TVector<TFeaturePathElement> UnwindFeaturePath(const TVector<TFeaturePathElement>& oldFeaturePath, size_t eraseElementIdx) {
    const size_t pathLength = oldFeaturePath.size();
    CB_ENSURE(pathLength > 0, "Path to unwind must have at least one element");
    const size_t newLength = pathLength - 1;

    TVector<TFeaturePathElement> newFeaturePath(oldFeaturePath.begin(), oldFeaturePath.begin() + pathLength - 1);
    for (size_t dstIdx = eraseElementIdx; dstIdx < newLength; ++dstIdx) {
        const TFeaturePathElement& next = oldFeaturePath[dstIdx + 1];
        TFeaturePathElement& current = newFeaturePath[dstIdx];
        current.Feature = next.Feature;
        current.ZeroPathsFraction = next.ZeroPathsFraction;
        current.OnePathsFraction = next.OnePathsFraction;
    }

    const double onePathsFraction = oldFeaturePath[eraseElementIdx].OnePathsFraction;
    const double zeroPathsFraction = oldFeaturePath[eraseElementIdx].ZeroPathsFraction;

    if (FuzzyEquals(onePathsFraction, 0.0)) {
        // The one-fraction is (almost) zero: only rescale the weights by the zero-fraction.
        for (size_t remaining = newLength; remaining > 0; --remaining) {
            const size_t elementIdx = remaining - 1;
            newFeaturePath[elementIdx].Weight *= pathLength / (zeroPathsFraction * (pathLength - elementIdx - 1));
        }
        return newFeaturePath;
    }

    // Walk from the last remaining element to the first, carrying the weight difference along.
    double weightDiff = oldFeaturePath[pathLength - 1].Weight;
    for (size_t remaining = newLength; remaining > 0; --remaining) {
        const size_t elementIdx = remaining - 1;
        double oldWeight = newFeaturePath[elementIdx].Weight;
        newFeaturePath[elementIdx].Weight = weightDiff * pathLength / (onePathsFraction * (elementIdx + 1));
        weightDiff = oldWeight - newFeaturePath[elementIdx].Weight * zeroPathsFraction * (pathLength - elementIdx - 1) / pathLength;
    }
    return newFeaturePath;
}
// Calculates SHAP values for the documents [start, end) of `pool`.
//
// Returns one vector per document with pool.Docs.GetFactorsCount() + 1 entries: the entries of
// the flat features are filled by CalcShapValuesRecursive, the last entry accumulates the mean
// value of every tree for the requested `dimension`. Documents are processed in parallel on
// `localExecutor`. Fails (CB_ENSURE) for models that use complex ctr features.
static TVector<TVector<double>> CalcShapValuesForDocumentBlock(const TFullModel& model, const TPool& pool, size_t start, size_t end, NPar::TLocalExecutor& localExecutor, int dimension) {
    CB_ENSURE(!HasComplexCtrs(model.ObliviousTrees), "Model uses complex Ctr features. This is not allowed for SHAP values calculation");
    const TObliviousTrees& forest = model.ObliviousTrees;
    const size_t documentCount = end - start;

    TVector<ui8> allBinarizedFeatures = BinarizeFeatures(model, pool, start, end);
    TVector<TVector<ui8>> binarizedFeaturesByDocument = TransposeBinarizedFeatures(allBinarizedFeatures, documentCount);
    // Swap with an empty vector: unlike clear(), this actually releases the buffer.
    TVector<ui8>().swap(allBinarizedFeatures);

    const int flatFeatureCount = pool.Docs.GetFactorsCount();
    TVector<int> binFeaturesMapping = MapFeatures(forest);

    // Subtree sizes and mean values depend only on the tree, not on the document, so they are
    // computed once here instead of once per (document, tree) pair.
    const size_t treeCount = forest.GetTreeCount();
    TVector<TVector<TVector<size_t>>> subtreeSizesByTree(treeCount);
    TVector<double> meanValueByTree(treeCount);
    for (size_t treeIdx = 0; treeIdx < treeCount; ++treeIdx) {
        subtreeSizesByTree[treeIdx] = CalcSubtreeSizesForTree(forest, treeIdx);
        meanValueByTree[treeIdx] = CalcMeanValueForTree(forest, subtreeSizesByTree[treeIdx], treeIdx, dimension);
    }

    TVector<TVector<double>> shapValues(documentCount, TVector<double>(flatFeatureCount + 1, 0.0));
    NPar::TLocalExecutor::TExecRangeParams blockParams(0, documentCount);
    localExecutor.ExecRange([&] (int documentIdx) {
        // Every document writes only to its own row of shapValues; the per-tree data is read-only.
        for (size_t treeIdx = 0; treeIdx < treeCount; ++treeIdx) {
            TVector<TFeaturePathElement> initialFeaturePath;
            CalcShapValuesRecursive(forest, binFeaturesMapping, binarizedFeaturesByDocument[documentIdx], treeIdx, /*depth*/ 0, subtreeSizesByTree[treeIdx], dimension, /*nodeIdx*/ 0, initialFeaturePath, /*zeroPathFraction*/ 1, /*onePathFraction*/ 1, /*feature*/ -1, &shapValues[documentIdx]);
            shapValues[documentIdx][flatFeatureCount] += meanValueByTree[treeIdx];
        }
    }, blockParams, NPar::TLocalExecutor::WAIT_COMPLETE);
    return shapValues;
}
void TCatboostOptions::Validate() const { SystemOptions.Get().Validate(); BoostingOptions.Get().Validate(); ObliviousTreeOptions.Get().Validate(); ELossFunction lossFunction = LossFunctionDescription->GetLossFunction(); { const ui32 classesCount = DataProcessingOptions->ClassesCount; if (classesCount != 0 ) { CB_ENSURE(IsMultiClassError(lossFunction), "classes_count parameter takes effect only with MultiClass/MultiClassOneVsAll loss functions"); CB_ENSURE(classesCount > 1, "classes-count should be at least 2"); } const auto& classWeights = DataProcessingOptions->ClassWeights.Get(); if (!classWeights.empty()) { CB_ENSURE(lossFunction == ELossFunction::Logloss || IsMultiClassError(lossFunction), "class weights takes effect only with Logloss, MultiClass and MultiClassOneVsAll loss functions"); CB_ENSURE(IsMultiClassError(lossFunction) || (classWeights.size() == 2), "if loss-function is Logloss, then class weights should be given for 0 and 1 classes"); CB_ENSURE(classesCount == 0 || classesCount == classWeights.size(), "class weights should be specified for each class in range 0, ... 
, classes_count - 1"); } } ELeavesEstimation leavesEstimation = ObliviousTreeOptions->LeavesEstimationMethod; if (lossFunction == ELossFunction::Quantile || lossFunction == ELossFunction::MAE || lossFunction == ELossFunction::LogLinQuantile || lossFunction == ELossFunction::MAPE) { CB_ENSURE(leavesEstimation != ELeavesEstimation::Newton, "Newton leave estimation method is not supported for " << lossFunction << " loss function"); CB_ENSURE(ObliviousTreeOptions->LeavesEstimationIterations == 1U, "gradient_iterations should equals 1 for this mode"); } if (GetTaskType() == ETaskType::CPU) { CB_ENSURE(!(IsQuerywiseError(lossFunction) && leavesEstimation == ELeavesEstimation::Newton), "This leaf estimation method is not supported for querywise error for CPU learning"); CB_ENSURE(!(IsPairwiseError(lossFunction) && leavesEstimation == ELeavesEstimation::Newton), "This leaf estimation method is not supported for pairwise error"); } ValidateCtrs(CatFeatureParams->SimpleCtrs, lossFunction, false); for (const auto& perFeatureCtr : CatFeatureParams->PerFeatureCtrs.Get()) { ValidateCtrs(perFeatureCtr.second, lossFunction, false); } ValidateCtrs(CatFeatureParams->CombinationCtrs, lossFunction, true); }
// Fills in the output file paths from `params`.
//
// When writing files is not allowed nothing is changed (the paths are expected to be empty).
// Otherwise the train directory is created if it is set and does not exist yet, and every file
// name is turned into a path with AlignFilePath. The time-left, learn-error, meta, json-log and
// profile-log names must be non-empty (CB_ENSURE); the test-error and snapshot paths are set
// only when configured. The json and profile logs are aligned with an empty prefix instead of
// `namesPrefix`.
void TOutputFiles::InitializeFiles(const NCatboostOptions::TOutputFilesOptions& params, const TString& namesPrefix) {
    const bool mayWriteFiles = params.AllowWriteFiles();
    if (!mayWriteFiles) {
        Y_ASSERT(TimeLeftLogFile.empty());
        Y_ASSERT(LearnErrorLogFile.empty());
        Y_ASSERT(TestErrorLogFile.empty());
        Y_ASSERT(MetaFile.empty());
        Y_ASSERT(SnapshotFile.empty());
        return;
    }

    const auto& trainDir = params.GetTrainDir();
    TFsPath trainDirPath(trainDir);
    const bool mustCreateTrainDir = !trainDir.empty() && !trainDirPath.Exists();
    if (mustCreateTrainDir) {
        trainDirPath.MkDir();
    }
    NamesPrefix = namesPrefix;

    // Mandatory learn-time logs.
    CB_ENSURE(!params.GetTimeLeftLogFilename().empty(), "empty time_left filename");
    TimeLeftLogFile = TOutputFiles::AlignFilePath(trainDir, params.GetTimeLeftLogFilename(), NamesPrefix);

    CB_ENSURE(!params.GetLearnErrorFilename().empty(), "empty learn_error filename");
    LearnErrorLogFile = TOutputFiles::AlignFilePath(trainDir, params.GetLearnErrorFilename(), NamesPrefix);

    // Optional files.
    if (params.GetTestErrorFilename()) {
        TestErrorLogFile = TOutputFiles::AlignFilePath(trainDir, params.GetTestErrorFilename(), NamesPrefix);
    }
    if (params.SaveSnapshot()) {
        SnapshotFile = TOutputFiles::AlignFilePath(trainDir, params.GetSnapshotFilename(), NamesPrefix);
    }

    const TString& metaName = params.GetMetaFileFilename();
    CB_ENSURE(!metaName.empty(), "empty meta filename");
    MetaFile = TOutputFiles::AlignFilePath(trainDir, metaName, NamesPrefix);

    const TString& jsonLogName = params.GetJsonLogFilename();
    CB_ENSURE(!jsonLogName.empty(), "empty json_log filename");
    JsonLogFile = TOutputFiles::AlignFilePath(trainDir, jsonLogName, "");

    const TString& profileLogName = params.GetProfileLogFilename();
    CB_ENSURE(!profileLogName.empty(), "empty profile_log filename");
    ProfileLogFile = TOutputFiles::AlignFilePath(trainDir, profileLogName, "");
}
// Regroups a flat feature-major buffer into one feature vector per document.
//
// The value of feature f for document d is read from
// allBinarizedFeatures[f * documentCount + d]; the number of features is
// allBinarizedFeatures.size() / documentCount. `documentCount` must be positive (CB_ENSURE).
static TVector<TVector<ui8>> TransposeBinarizedFeatures(const TVector<ui8>& allBinarizedFeatures, size_t documentCount) {
    CB_ENSURE(documentCount > 0, "Document block must be non-empty.");
    const size_t featuresCount = allBinarizedFeatures.size() / documentCount;

    TVector<TVector<ui8>> byDocument(documentCount, TVector<ui8>(featuresCount));
    // Walk the source buffer sequentially; every cell of the result is written exactly once.
    size_t srcIdx = 0;
    for (size_t featureIdx = 0; featureIdx < featuresCount; ++featureIdx) {
        for (size_t documentIdx = 0; documentIdx < documentCount; ++documentIdx) {
            byDocument[documentIdx][featureIdx] = allBinarizedFeatures[srcIdx];
            ++srcIdx;
        }
    }
    return byDocument;
}
// Dispatches a feature importance request by its type name.
//
// `type` is parsed into EFstrType: FeatureImportance -> CalcFstr, Interaction -> CalcInteraction,
// Doc -> CalcFeatureImportancesForDocuments. `threadCount` is forwarded to the calculations that
// accept it. The pool must contain at least one document (CB_ENSURE).
TVector<TVector<double>> GetFeatureImportances(const TFullModel& model, const TPool& pool, const TString& type, int threadCount){
    CB_ENSURE(pool.Docs.GetDocCount() != 0, "Pool should not be empty");

    const EFstrType fstrKind = FromString<EFstrType>(type);
    if (fstrKind == EFstrType::FeatureImportance) {
        return CalcFstr(model, pool, threadCount);
    }
    if (fstrKind == EFstrType::Interaction) {
        return CalcInteraction(model, pool);
    }
    if (fstrKind == EFstrType::Doc) {
        return CalcFeatureImportancesForDocuments(model, pool, threadCount);
    }
    Y_UNREACHABLE();
}
bool TLearnContext::TryLoadProgress() { if (!OutputOptions.SaveSnapshot() || !NFs::Exists(Files.SnapshotFile)) { return false; } try { TProgressHelper(ToString(ETaskType::CPU)).CheckedLoad(Files.SnapshotFile, [&](TIFStream* in) { TLearnProgress LearnProgressRestored = LearnProgress; // use progress copy to avoid partial deserialization of corrupted progress file TProfileInfoData ProfileRestored; ::LoadMany(in, Rand, LearnProgressRestored, ProfileRestored); // fail here does nothing with real LearnProgress CB_ENSURE(IsParamsCompatible(LearnProgressRestored.SerializedTrainParams, LearnProgress.SerializedTrainParams), "Saved model's Params are different from current model's params"); CB_ENSURE(LearnProgressRestored.PoolCheckSum == LearnProgress.PoolCheckSum, "Current pool differs from the original pool"); LearnProgress = std::move(LearnProgressRestored); Profile.InitProfileInfo(std::move(ProfileRestored)); LearnProgress.SerializedTrainParams = ToString(Params); // substitute real MATRIXNET_INFO_LOG << "Loaded progress file containing " << LearnProgress.TreeStruct.size() << " trees" << Endl; }); return true; } catch (...) { MATRIXNET_WARNING_LOG << "Can't load progress from file: " << Files.SnapshotFile << " exception: " << CurrentExceptionMessage() << Endl; return false; } }
// Trains a model through NCatboostCuda::TrainModel.
//
// The custom objective / metric descriptors and `allowClearPool` are not used. Exactly one test
// pool is supported (CB_ENSURE) and there must be one eval result per test pool (Y_VERIFY).
// After training, the raw values of the first eval result are resized to the model's
// approx dimension.
void TrainModel(
    const NJson::TJsonValue& params,
    const NCatboostOptions::TOutputFilesOptions& outputOptions,
    const TMaybe<TCustomObjectiveDescriptor>& objectiveDescriptor,
    const TMaybe<TCustomMetricDescriptor>& evalMetricDescriptor,
    TPool& learnPool,
    bool allowClearPool,
    const TVector<const TPool*>& testPoolPtrs,
    TFullModel* model,
    const TVector<TEvalResult*>& evalResultPtrs) const override
{
    Y_UNUSED(objectiveDescriptor);
    Y_UNUSED(evalMetricDescriptor);
    Y_UNUSED(allowClearPool);

    CB_ENSURE(testPoolPtrs.size() == 1, "Multiple eval sets not supported for GPU");
    Y_VERIFY(evalResultPtrs.size() == testPoolPtrs.size());

    const TPool& testPool = *testPoolPtrs[0];
    NCatboostCuda::TrainModel(params, outputOptions, learnPool, testPool, model);

    auto&& rawValues = evalResultPtrs[0]->GetRawValuesRef();
    rawValues.resize(model->ObliviousTrees.ApproxDimension);
}
// Returns the feature effect as a dense vector indexed by the flat feature index of the pool.
//
// The pool must provide at least as many features as the model expects (CB_ENSURE). The effect
// computed by CalcFeatureEffect is converted to regular feature effects, and every score is
// stored at the index that TFeaturesLayout reports for the feature; positions without a
// reported effect stay 0.
TVector<double> CalcRegularFeatureEffect(const TFullModel& model, const TPool& pool, int threadCount/*= 1*/) {
    int featureCount = pool.Docs.GetFactorsCount();
    CB_ENSURE(static_cast<size_t>(featureCount) >= model.ObliviousTrees.GetFlatFeatureVectorExpectedSize(),
              "Insufficient features count in pool");

    const int catFeaturesCount = pool.CatFeatures.ysize();
    const int floatFeaturesCount = featureCount - catFeaturesCount;
    TFeaturesLayout layout(featureCount, pool.CatFeatures, pool.FeatureId);

    TVector<TFeatureEffect> regularEffect = CalcRegularFeatureEffect(
        CalcFeatureEffect(model, pool, threadCount),
        catFeaturesCount,
        floatFeaturesCount);

    TVector<double> effect(featureCount);
    for (const auto& scored : regularEffect) {
        const auto& feature = scored.Feature;
        int featureIdx = layout.GetFeature(feature.Index, feature.Type);
        Y_ASSERT(featureIdx < featureCount);
        effect[featureIdx] = scored.Score;
    }
    return effect;
}
// Reads a column description file.
//
// Every non-empty line has two or three tab-separated fields: column index, column type and an
// optional column id. The type names "QueryId" and "Target" are treated as "GroupId" and
// "Label". With defaults.UseDefaultType the result starts with defaults.ColumnCount columns of
// the default type and indices must stay below that count; without it, every column up to the
// largest index must be described in the file. Reading stops at the first line that cannot be
// split (a debug message is logged). Violations are reported through CB_ENSURE.
TVector<TColumn> ReadCD(const TString& fileName, const TCdParserDefaults& defaults) {
    CB_ENSURE(NFs::Exists(TString(fileName)), "column description file is not found");

    int columnsCount = defaults.UseDefaultType ? defaults.ColumnCount : 0;
    TVector<TColumn> columns(columnsCount, TColumn{defaults.DefaultColumnType, TString()});
    TSet<int> parsedColumns;

    TIFStream reader(fileName.c_str());
    TString line;
    while (reader.ReadLine(line)) {
        TVector<TString> tokens;
        try {
            Split(line, "\t", tokens);
        } catch (const yexception& e) {
            MATRIXNET_DEBUG_LOG << "Got exception " << e.what() << " while parsing feature descriptions line " << line << Endl;
            break;
        }
        if (tokens.empty()) {
            continue;
        }

        const int tokenCount = tokens.ysize();
        CB_ENSURE(tokenCount == 2 || tokenCount == 3, "Each line should have two or three columns. " << line);

        int index = FromString<int>(tokens[0]);
        CB_ENSURE(index >= 0, "Invalid column index " << index);
        if (defaults.UseDefaultType) {
            CB_ENSURE(index < columnsCount, "Invalid column index " << index);
        }
        CB_ENSURE(!parsedColumns.has(index), "column specified twice in cd file: " << index);
        parsedColumns.insert(index);

        // Grow the result when the index lies beyond the columns known so far.
        if (index >= columns.ysize()) {
            columns.resize(index + 1);
        }

        // Map the alternative type names onto the names understood by EColumn.
        TStringBuf type = tokens[1];
        if (type == "QueryId") {
            type = "GroupId";
        } else if (type == "Target") {
            type = "Label";
        }
        TColumn& column = columns[index];
        CB_ENSURE(TryFromString<EColumn>(type, column.Type), "unsupported column type " << type);
        if (tokenCount == 3) {
            column.Id = tokens[2];
        }
    }

    if (!defaults.UseDefaultType) {
        CheckAllFeaturesPresent(columns, parsedColumns);
    }
    return columns;
}
// Calculates the importances of train documents for the documents of the test pool.
//
// `topSize` is either -1 (replaced by the number of train documents) or a non-negative integer
// (CB_ENSURE). The update method, the document strength type and the importance values sign are
// parsed from their string forms; the importances are computed by TDocumentImportancesEvaluator
// and post-processed by GetFinalDocumentImportances.
TDStrResult GetDocumentImportances(
    const TFullModel& model,
    const TPool& trainPool,
    const TPool& testPool,
    const TString& dstrTypeStr,
    int topSize,
    const TString& updateMethodStr,
    const TString& importanceValuesSignStr,
    int threadCount
) {
    if (topSize != -1) {
        CB_ENSURE(topSize >= 0, "Top size should be nonnegative integer or -1 (for unlimited top size).");
    } else {
        topSize = trainPool.Docs.GetDocCount();
    }

    TUpdateMethod updateMethod = ParseUpdateMethod(updateMethodStr);
    const EDocumentStrengthType strengthType = FromString<EDocumentStrengthType>(dstrTypeStr);
    const EImportanceValuesSign valuesSign = FromString<EImportanceValuesSign>(importanceValuesSignStr);

    TDocumentImportancesEvaluator evaluator(model, trainPool, updateMethod, threadCount);
    const TVector<TVector<double>> rawImportances = evaluator.GetDocumentImportances(testPool);
    return GetFinalDocumentImportances(rawImportances, strengthType, topSize, valuesSign);
}
void PrepareAllFeaturesLearn(const THashSet<int>& categFeatures, const TVector<TFloatFeature>& floatFeatures, const TVector<int>& ignoredFeatures, bool ignoreRedundantCatFeatures, size_t oneHotMaxSize, ENanMode nanMode, bool clearPool, NPar::TLocalExecutor& localExecutor, const TVector<size_t>& selectedDocIndices, TDocumentStorage* learnDocStorage, TAllFeatures* learnFeatures) { if (learnDocStorage->GetDocCount() == 0) { return; } TBinarizer binarizer(learnDocStorage->GetFactorsCount(), categFeatures, floatFeatures, nanMode, localExecutor); binarizer.SetupToIgnoreFeatures(ignoredFeatures, ignoreRedundantCatFeatures); PrepareSlots(binarizer.GetCatFeatureCount(), binarizer.GetFloatFeatureCount(), learnFeatures); binarizer.Binarize(/*forLearn=*/true, learnDocStorage, selectedDocIndices, clearPool, learnFeatures); CleanupOneHotFeatures(oneHotMaxSize, learnFeatures); CB_ENSURE(learnFeatures->GetDocCount() > 0, "Train dataset is empty after binarization"); DumpMemUsage("Extract bools done"); }
// Restores the learn progress from stream `s`.
//
// The stream starts with the fold count, which must equal the current number of folds
// (CB_ENSURE). It is followed by the approxes of every fold, the approxes of the averaging fold
// and the remaining fields in the order of the LoadMany call below.
void TLearnProgress::Load(IInputStream* s) {
    ui64 storedFoldCount;
    ::Load(s, storedFoldCount);
    CB_ENSURE(storedFoldCount == Folds.size(), "Cannot load progress from file");

    for (auto& fold : Folds) {
        fold.LoadApproxes(s);
    }
    AveragingFold.LoadApproxes(s);

    ::LoadMany(s,
               AvrgApprox,
               TestApprox,
               BestTestApprox,
               CatFeatures,
               FloatFeatures,
               ApproxDimension,
               SerializedTrainParams,
               TreeStruct,
               TreeStats,
               LeafValues,
               LearnErrorsHistory,
               TestErrorsHistory,
               TimeHistory,
               UsedCtrSplits,
               PoolCheckSum);
}
// Creates the default counter ctr description for the current task type.
//
// CPU: a Counter ctr with the default Counter priors. GPU: a FeatureFreq ctr with the default
// FeatureFreq priors and a binarization with 15 borders whose selection type depends on the
// projection type (Median for tree ctrs, MinEntropy for simple ctrs). Any other projection type
// raises TCatboostException.
TCtrDescription TCatboostOptions::CreateDefaultCounter(EProjectionType projectionType) const {
    if (GetTaskType() == ETaskType::CPU) {
        return TCtrDescription(ECtrType::Counter, GetDefaultPriors(ECtrType::Counter));
    }
    CB_ENSURE(GetTaskType() == ETaskType::GPU);

    EBorderSelectionType borderSelectionType;
    if (projectionType == EProjectionType::TreeCtr) {
        borderSelectionType = EBorderSelectionType::Median;
    } else if (projectionType == EProjectionType::SimpleCtr) {
        borderSelectionType = EBorderSelectionType::MinEntropy;
    } else {
        ythrow TCatboostException() << "Unknown projection type " << projectionType;
    }

    return TCtrDescription(ECtrType::FeatureFreq,
                           GetDefaultPriors(ECtrType::FeatureFreq),
                           TBinarizationOptions(borderSelectionType, 15));
}
// Finalizes the data provider from the collected feature values. May be called only once
// (IsDone is checked at the start and set at the end).
//
// Steps, in the order they appear below:
//  1. Document order: identity by default; when the timestamps are not all zero, shuffling is
//     switched off and the order is built by timestamp.
//  2. Query ids are dropped when HasQueryIds() reports none. Pairs require query ids (CB_ENSURE)
//     and are passed to DataProvider.FillQueryPairs.
//  3. With ShuffleFlag set, the order is shuffled with Seed (query-consistent shuffle when query
//     ids exist) and the seed is stored in the data provider. The order is applied to the meta
//     columns when shuffling happened or timestamps are present.
//  4. Features are processed with NPar::ParallelFor on an executor with BuildThreads - 1
//     additional threads. Values are reordered by DataProvider.Order, then:
//       * categorical features are perfect-hashed and compressed; a column is created only when
//         the feature has more than one unique value, and for test data a feature without known
//         unique values is skipped;
//       * float features take their borders from FeaturesManager or, for non-test data, build
//         them; a feature that ends up without borders is skipped, otherwise it is binarized and
//         compressed.
//     The raw values of the feature are released afterwards.
//  5. Sequentially: borders of float features not yet registered in FeaturesManager are
//     registered, the non-null columns are moved into DataProvider.Features, the indices remap
//     is built, target borders are set for non-test data, feature names and categorical feature
//     ids are stored and, when class weights are given, the weights are recomputed by Reweight.
//
// NOTE(review): inside the parallel section only GetOrCreateNanMode is guarded by `lock`; the
// other FeaturesManager / CatFeaturesPerfectHashHelper calls are made without it - confirm they
// are safe to call concurrently.
void TDataProviderBuilder::Finish() { CB_ENSURE(!IsDone, "Error: can't finish more than once"); DataProvider.Features.reserve(FeatureValues.size()); DataProvider.Order.resize(DataProvider.Targets.size()); std::iota(DataProvider.Order.begin(), DataProvider.Order.end(), 0); if (!AreEqualTo<ui64>(DataProvider.Timestamp, 0)) { ShuffleFlag = false; DataProvider.Order = CreateOrderByKey(DataProvider.Timestamp); } bool hasQueryIds = HasQueryIds(DataProvider.QueryIds); if (!hasQueryIds) { DataProvider.QueryIds.resize(0); } //TODO(noxoomo): it's not safe here, if we change order with shuffle everything'll go wrong if (Pairs.size()) { //they are local, so we don't need shuffle CB_ENSURE(hasQueryIds, "Error: for GPU pairwise learning you should provide query id column. Query ids will be used to split data between devices and for dynamic boosting learning scheme."); DataProvider.FillQueryPairs(Pairs); } if (ShuffleFlag) { if (hasQueryIds) { //should not change order inside query for pairs consistency QueryConsistentShuffle(Seed, 1, DataProvider.QueryIds, &DataProvider.Order); } else { Shuffle(Seed, 1, DataProvider.Targets.size(), &DataProvider.Order); } DataProvider.SetShuffleSeed(Seed); } if (ShuffleFlag || !DataProvider.Timestamp.empty()) { DataProvider.ApplyOrderToMetaColumns(); } TVector<TString> featureNames; featureNames.resize(FeatureValues.size()); TAdaptiveLock lock; NPar::TLocalExecutor executor; executor.RunAdditionalThreads(BuildThreads - 1); TVector<TFeatureColumnPtr> featureColumns(FeatureValues.size()); if (!IsTest) { RegisterFeaturesInFeatureManager(featureColumns); } TVector<TVector<float>> grid; grid.resize(FeatureValues.size()); NPar::ParallelFor(executor, 0, FeatureValues.size(), [&](ui32 featureId) { auto featureName = GetFeatureName(featureId); featureNames[featureId] = featureName; if (FeatureValues[featureId].size() == 0) { return; } TVector<float> line(DataProvider.Order.size()); for (ui32 i = 0; i < DataProvider.Order.size(); ++i) { line[i] = 
FeatureValues[featureId][DataProvider.Order[i]]; } if (CatFeatureIds.has(featureId)) { static_assert(sizeof(float) == sizeof(ui32), "Error: float size should be equal to ui32 size"); const bool shouldSkip = IsTest && (CatFeaturesPerfectHashHelper.GetUniqueValues(featureId) == 0); if (!shouldSkip) { auto data = CatFeaturesPerfectHashHelper.UpdatePerfectHashAndBinarize(featureId, ~line, line.size()); const ui32 uniqueValues = CatFeaturesPerfectHashHelper.GetUniqueValues(featureId); if (uniqueValues > 1) { auto compressedData = CompressVector<ui64>(~data, line.size(), IntLog2(uniqueValues)); featureColumns[featureId] = MakeHolder<TCatFeatureValuesHolder>(featureId, line.size(), std::move(compressedData), uniqueValues, featureName); } } } else { auto floatFeature = MakeHolder<TFloatValuesHolder>(featureId, std::move(line), featureName); TVector<float>& borders = grid[featureId]; ENanMode nanMode = ENanMode::Forbidden; { TGuard<TAdaptiveLock> guard(lock); nanMode = FeaturesManager.GetOrCreateNanMode(*floatFeature); } if (FeaturesManager.HasFloatFeatureBorders(*floatFeature)) { borders = FeaturesManager.GetFloatFeatureBorders(*floatFeature); } if (borders.empty() && !IsTest) { const auto& floatValues = floatFeature->GetValues(); NCatboostOptions::TBinarizationOptions config = FeaturesManager.GetFloatFeatureBinarization(); config.NanMode = nanMode; borders = BuildBorders(floatValues, floatFeature->GetId(), config); } if (borders.ysize() == 0) { MATRIXNET_DEBUG_LOG << "Float Feature #" << featureId << " is empty" << Endl; return; } auto binarizedData = BinarizeLine(floatFeature->GetValues().data(), floatFeature->GetValues().size(), nanMode, borders); const int binCount = static_cast<const int>(borders.size() + 1 + (ENanMode::Forbidden != nanMode)); auto compressedLine = CompressVector<ui64>(binarizedData, IntLog2(binCount)); featureColumns[featureId] = MakeHolder<TBinarizedFloatValuesHolder>(featureId, floatFeature->GetValues().size(), nanMode, borders, 
std::move(compressedLine), featureName); } //Free memory { auto emptyVec = TVector<float>(); FeatureValues[featureId].swap(emptyVec); } }); for (ui32 featureId = 0; featureId < featureColumns.size(); ++featureId) { if (CatFeatureIds.has(featureId)) { if (featureColumns[featureId] == nullptr && (!IsTest)) { MATRIXNET_DEBUG_LOG << "Cat Feature #" << featureId << " is empty" << Endl; } } else if (featureColumns[featureId] != nullptr) { if (!FeaturesManager.HasFloatFeatureBordersForDataProviderFeature(featureId)) { FeaturesManager.SetFloatFeatureBordersForDataProviderId(featureId, std::move(grid[featureId])); } } if (featureColumns[featureId] != nullptr) { DataProvider.Features.push_back(std::move(featureColumns[featureId])); } } DataProvider.BuildIndicesRemap(); if (!IsTest) { TOnCpuGridBuilderFactory gridBuilderFactory; FeaturesManager.SetTargetBorders(TBordersBuilder(gridBuilderFactory, DataProvider.GetTargets())(FeaturesManager.GetTargetBinarizationDescription())); } DataProvider.FeatureNames = featureNames; DataProvider.CatFeatureIds = CatFeatureIds; if (ClassesWeights.size()) { Reweight(DataProvider.Targets, ClassesWeights, &DataProvider.Weights); } IsDone = true; }
void TCatboostOptions::SetLeavesEstimationDefault() { const auto& lossFunctionConfig = LossFunctionDescription.Get(); auto& treeConfig = ObliviousTreeOptions.Get(); ui32 defaultNewtonIterations = 1; ui32 defaultGradientIterations = 1; ELeavesEstimation defaultEstimationMethod = ELeavesEstimation::Newton; switch (lossFunctionConfig.GetLossFunction()) { case ELossFunction::RMSE: { defaultEstimationMethod = ELeavesEstimation::Newton; defaultNewtonIterations = 1; defaultGradientIterations = 1; break; } case ELossFunction::QueryRMSE: { defaultEstimationMethod = ELeavesEstimation::Gradient; defaultNewtonIterations = 1; defaultGradientIterations = 1; break; } case ELossFunction::MultiClass: case ELossFunction::MultiClassOneVsAll: { defaultEstimationMethod = ELeavesEstimation::Newton; defaultNewtonIterations = 1; defaultGradientIterations = 10; break; } case ELossFunction::Quantile: case ELossFunction::MAE: case ELossFunction::LogLinQuantile: case ELossFunction::MAPE: { defaultNewtonIterations = 1; defaultGradientIterations = 1; defaultEstimationMethod = ELeavesEstimation::Gradient; break; } case ELossFunction::PairLogit: { defaultEstimationMethod = ELeavesEstimation::Gradient; //TODO(noxoomo): update to 10 after options merge defaultNewtonIterations = 1; defaultGradientIterations = 1; break; } case ELossFunction::Poisson: { defaultEstimationMethod = ELeavesEstimation::Gradient; defaultNewtonIterations = 1; defaultGradientIterations = 1; break; } case ELossFunction::Logloss: case ELossFunction::CrossEntropy: { defaultNewtonIterations = 10; defaultGradientIterations = 100; defaultEstimationMethod = ELeavesEstimation::Newton; break; } case ELossFunction::YetiRank: { defaultEstimationMethod = ELeavesEstimation::Newton; defaultGradientIterations = 1; defaultNewtonIterations = 1; break; } case ELossFunction::UserPerObjErr: case ELossFunction::UserQuerywiseErr: case ELossFunction::Custom: { //skip defaultNewtonIterations = 1; defaultGradientIterations = 1; break; } default: { 
CB_ENSURE(false, "Unknown loss function " << lossFunctionConfig.GetLossFunction()); } } if (treeConfig.LeavesEstimationMethod.NotSet()) { treeConfig.LeavesEstimationMethod = defaultEstimationMethod; } if (treeConfig.LeavesEstimationIterations.NotSet()) { const ELeavesEstimation method = treeConfig.LeavesEstimationMethod; switch (method) { case ELeavesEstimation::Newton: { treeConfig.LeavesEstimationIterations = defaultNewtonIterations; break; } case ELeavesEstimation::Gradient: { treeConfig.LeavesEstimationIterations = defaultGradientIterations; break; } default: { ythrow TCatboostException() << "Unknown estimation type " << method; } } } if (treeConfig.L2Reg == 0.0f) { treeConfig.L2Reg = 1e-20f; } }
// Verifies that every column index in [0, columns.size()) is contained in `parsedColumns`;
// the first missing index is reported through CB_ENSURE.
inline void CheckAllFeaturesPresent(const TVector<TColumn>& columns, const TSet<int>& parsedColumns) {
    const int columnCount = columns.ysize();
    int columnIdx = 0;
    while (columnIdx < columnCount) {
        CB_ENSURE(parsedColumns.has(columnIdx), "column not present in cd file: " << columnIdx);
        ++columnIdx;
    }
}
void TCatboostOptions::ValidateCtr(const TCtrDescription& ctr, ELossFunction lossFunction, bool isTreeCtrs) const { if (ctr.TargetBinarization->BorderCount > 1) { CB_ENSURE(lossFunction == ELossFunction::RMSE || lossFunction == ELossFunction::Quantile || lossFunction == ELossFunction::LogLinQuantile || lossFunction == ELossFunction::Poisson || lossFunction == ELossFunction::MAPE || lossFunction == ELossFunction::MAE, "target-border-cnt is not supported for loss function " << lossFunction); } CB_ENSURE(ctr.GetPriors().size(), "Provide at least one prior for CTR" << ToString(*this)); const ETaskType taskType = GetTaskType(); const ECtrType ctrType = ctr.Type; if (taskType == ETaskType::GPU) { CB_ENSURE(IsSupportedOnGpu(ctrType), "Ctr type " << ctrType << " is not implemented on GPU yet"); CB_ENSURE(ctr.TargetBinarization.IsDefault(), "Error: GPU doesn't not support target binarization per CTR description currently. Please use target_borders option instead"); } else { CB_ENSURE(taskType == ETaskType::CPU); CB_ENSURE(IsSupportedOnCpu(ctrType), "Ctr type " << ctrType << " is not implemented on CPU yet"); CB_ENSURE(ctr.PriorEstimation == EPriorEstimation::No, "Error: CPU doesn't not support prior estimation currently"); } const EBorderSelectionType borderSelectionType = ctr.CtrBinarization->BorderSelectionType; if (taskType == ETaskType::CPU) { CB_ENSURE(borderSelectionType == EBorderSelectionType::Uniform, "Error: custom ctr binarization is not supported on CPU yet"); } else { CB_ENSURE(taskType == ETaskType::GPU); if (isTreeCtrs) { EBorderSelectionType borderType = borderSelectionType; CB_ENSURE(borderType == EBorderSelectionType::Uniform || borderType == EBorderSelectionType::Median, "Error: GPU supports Median and Uniform combinations-ctr binarization only"); CB_ENSURE(ctr.CtrBinarization->BorderCount <= GetMaxTreeCtrBinarizationForGpu(), "Error: max combinations-ctr binarization for GPU is " << GetMaxTreeCtrBinarizationForGpu()); CB_ENSURE(ctr.PriorEstimation == 
EPriorEstimation::No, "Error: prior estimation is not available for combinations-ctr"); } else { switch (ctrType) { case ECtrType::Borders: { break; } default: { CB_ENSURE(ctr.PriorEstimation == EPriorEstimation::No, "Error: prior estimation is not available for ctr type " << ctrType); } } } } if ((ctrType == ECtrType::FeatureFreq) && borderSelectionType == EBorderSelectionType::Uniform) { MATRIXNET_WARNING_LOG << "Uniform ctr binarization for featureFreq ctr is not good choice. Use MinEntropy for simpleCtrs and Median for combinations-ctrs instead" << Endl; } }