Ejemplo n.º 1
0
// Computes the effect of every model feature over the given pool and returns
// (effect, feature) pairs sorted by descending effect.
//
// model       - trained model; a model without trees yields an empty result.
// pool        - pool used to collect leaf statistics; must contain at least one
//               document and at least one feature.
// threadCount - written into the "system_options.thread_count" entry of the
//               model's stored training params before the context is built.
//
// Throws (via CB_ENSURE) if the pool has no documents or no features.
TVector<std::pair<double, TFeature>> CalcFeatureEffect(const TFullModel& model, const TPool& pool, int threadCount/*= 1*/) {
    CB_ENSURE(pool.Docs.GetDocCount() != 0, "Pool should not be empty");
    if (model.GetTreeCount() == 0) {
        return TVector<std::pair<double, TFeature>>();
    }
    const int featureCount = pool.Docs.GetFactorsCount();
    NJson::TJsonValue jsonParams = ReadTJsonValue(model.ModelInfo.at("params"));
    jsonParams["system_options"].InsertValue("thread_count", threadCount);
    // The context is not referenced below but is still constructed as before.
    // NOTE(review): presumably kept for the checks done by its constructor - confirm before removing.
    TCommonContext ctx(jsonParams, Nothing(), Nothing(), featureCount, pool.CatFeatures, pool.FeatureId);

    // An empty model has already been handled by the early return above, so only
    // the pool needs to be checked here.
    CB_ENSURE(featureCount > 0, "no features in pool");
    TVector<TFeature> features;
    TVector<TMxTree> trees = BuildMatrixnetTrees(model, &features);

    TVector<TVector<ui64>> leavesStatistics = CollectLeavesStatistics(pool, model);
    TVector<double> effect = CalcEffect(trees, leavesStatistics);

    // Pair every effect with its position in `features` so the feature can be
    // recovered after sorting.
    TVector<std::pair<double, int>> effectWithFeature;
    effectWithFeature.reserve(effect.size());
    for (int i = 0; i < effect.ysize(); ++i) {
        effectWithFeature.emplace_back(effect[i], i);
    }
    Sort(effectWithFeature.begin(), effectWithFeature.end(), std::greater<std::pair<double, int>>());

    TVector<std::pair<double, TFeature>> result;
    result.reserve(effectWithFeature.size());
    for (const auto& scored : effectWithFeature) {
        result.emplace_back(scored.first, features[scored.second]);
    }
    return result;
}
Ejemplo n.º 2
0
int mode_fit(const int argc, const char* argv[]) {
    ConfigureMalloc();

    NCatboostOptions::TPoolLoadParams poolLoadOptions;
    TString paramsFile;
    NJson::TJsonValue catBoostFlatJsonOptions;
    ParseCommandLine(argc, argv, &catBoostFlatJsonOptions, &paramsFile, &poolLoadOptions);
    NJson::TJsonValue catBoostJsonOptions;
    NJson::TJsonValue outputOptionsJson;
    if (!paramsFile.empty()) {
        CB_ENSURE(NFs::Exists(paramsFile), "Params file does not exists " << paramsFile);
        TIFStream in(paramsFile);
        NJson::TJsonValue fromFileParams;
        CB_ENSURE(NJson::ReadJsonTree(&in, &fromFileParams), "can't parse params file");
        NCatboostOptions::PlainJsonToOptions(fromFileParams, &catBoostJsonOptions, &outputOptionsJson);
    }
    NCatboostOptions::PlainJsonToOptions(catBoostFlatJsonOptions, &catBoostJsonOptions, &outputOptionsJson);
    poolLoadOptions.Validate();

    auto taskType = NCatboostOptions::GetTaskType(catBoostJsonOptions);
    THolder<IModelTrainer> modelTrainerHolder;
    NCatboostOptions::TOutputFilesOptions outputOptions(taskType);
    outputOptions.Load(outputOptionsJson);

    const bool isGpuDeviceType = taskType == ETaskType::GPU;
    if (isGpuDeviceType && TTrainerFactory::Has(ETaskType::GPU)) {
        modelTrainerHolder = TTrainerFactory::Construct(ETaskType::GPU);
    } else {
        CB_ENSURE(!isGpuDeviceType, "GPU Device not found.");

        modelTrainerHolder = TTrainerFactory::Construct(ETaskType::CPU);
    }
    modelTrainerHolder->TrainModel(poolLoadOptions, outputOptions, catBoostJsonOptions);
    return 0;
}
// Translates a feature id recorded in `map` into the id used by `featuresManager`.
//
// The id is looked up, in this order, among the ctr, float and categorical
// entries of `map`:
//   * ctr: the ctr is migrated; if the manager already knows it, the stored
//     borders must equal the manager's borders and the known id is returned,
//     otherwise the ctr is added with the stored borders;
//   * float feature: the stored borders must equal the manager's borders;
//   * categorical feature: the manager id for the data provider id is returned.
//
// Throws if the borders differ or if `featureId` is in none of the three maps.
ui32 NCatboostCuda::UpdateFeatureId(TBinarizedFeaturesManager& featuresManager,
                                    const TModelFeaturesMap& map,
                                    const ui32 featureId) {
    if (map.Ctrs.has(featureId)) {
        const auto& ctrInfo = map.Ctrs.at(featureId);
        TCtr migratedCtr = MigrateCtr(featuresManager, map, ctrInfo.Ctr);

        if (!featuresManager.IsKnown(migratedCtr)) {
            return featuresManager.AddCtr(migratedCtr,
                                          TVector<float>(ctrInfo.Borders));
        }

        const ui32 knownId = featuresManager.GetId(migratedCtr);
        CB_ENSURE(ctrInfo.Borders == featuresManager.GetBorders(knownId),
                  " tensor : " << migratedCtr.FeatureTensor << "  (ctr type "
                               << migratedCtr.Configuration.Type << "). Error: progress borders should be consistent: " << knownId << " / " << featureId << " " << Print(ctrInfo.Borders) << " vs " << Print(featuresManager.GetBorders(knownId)));
        return knownId;
    }

    if (map.FloatFeatures.has(featureId)) {
        const auto& floatFeatureInfo = map.FloatFeatures.at(featureId);
        const ui32 managerId = featuresManager.GetFeatureManagerIdForFloatFeature(floatFeatureInfo.DataProviderId);
        CB_ENSURE(floatFeatureInfo.Borders == featuresManager.GetBorders(managerId),
                  "Error: progress borders should be consistent");
        return managerId;
    }

    if (map.CatFeaturesMap.has(featureId)) {
        const ui32 providerId = map.CatFeaturesMap.at(featureId);
        return featuresManager.GetFeatureManagerIdForCatFeature(providerId);
    }

    ythrow yexception() << "Error: can't remap featureId #" << featureId;
}
Ejemplo n.º 4
0
    static inline void EstimatePriors(const TDataProvider& dataProvider,
                                      TBinarizedFeaturesManager& featureManager,
                                      NCatboostOptions::TCatFeatureParams& options) {
        CB_ENSURE(&(featureManager.GetCatFeatureOptions()) == &options, "Error: for consistent catFeature options should be equal to one in feature manager");

        bool needSimpleCtrsPriorEstimation = NeedPriorEstimation(options.SimpleCtrs);
        const auto& borders = featureManager.GetTargetBorders();
        if (borders.size() > 1) {
            return;
        }
        auto binarizedTarget = BinarizeLine<ui8>(dataProvider.GetTargets().data(), dataProvider.GetTargets().size(), ENanMode::Forbidden, borders);

        TVector<int> catFeatureIds(dataProvider.GetCatFeatureIds().begin(), dataProvider.GetCatFeatureIds().end());
        TAdaptiveLock lock;

        //TODO(noxoomo): locks here are ugly and error prone
        NPar::ParallelFor(0, catFeatureIds.size(), [&](int i) {
            ui32 catFeature = catFeatureIds[i];
            if (!dataProvider.HasFeatureId(catFeature)) {
                return;
            }
            const ICatFeatureValuesHolder& catFeatureValues = dynamic_cast<const ICatFeatureValuesHolder&>(dataProvider.GetFeatureById(catFeature));

            bool hasPerFeatureCtr = false;

            with_lock (lock) {
                if (needSimpleCtrsPriorEstimation && !options.PerFeatureCtrs->has(catFeature)) {
                    options.PerFeatureCtrs.Get()[catFeature] = options.SimpleCtrs;
                }
                hasPerFeatureCtr = options.PerFeatureCtrs->has(catFeature);
            }

            if (hasPerFeatureCtr) {
                TVector<NCatboostOptions::TCtrDescription> currentFeatureDescription;
                with_lock (lock) {
                    currentFeatureDescription = options.PerFeatureCtrs->at(catFeature);
                }
                if (!NeedPriorEstimation(currentFeatureDescription)) {
                    return;
                }
                auto values = catFeatureValues.ExtractValues();

                for (ui32 i = 0; i < currentFeatureDescription.size(); ++i) {
                    if (currentFeatureDescription[i].Type == ECtrType::Borders && options.TargetBorders->BorderCount == 1u) {
                        TBetaPriorEstimator::TBetaPrior prior = TBetaPriorEstimator::EstimateBetaPrior(binarizedTarget.data(),
                                                                                                       values.data(), values.size(), catFeatureValues.GetUniqueValues());

                        MATRIXNET_INFO_LOG << "Estimate borders-ctr prior for feature #" << catFeature << ": " << prior.Alpha << " / " << prior.Beta << Endl;
                        currentFeatureDescription[i].Priors = {{(float)prior.Alpha, (float)(prior.Alpha + prior.Beta)}};
                    } else {
                        CB_ENSURE(currentFeatureDescription[i].PriorEstimation == EPriorEstimation::No, "Error: auto prior estimation is not available for ctr type " << currentFeatureDescription[i].Type);
                    }
                }
                with_lock (lock) {
                    options.PerFeatureCtrs.Get()[catFeature] = currentFeatureDescription;
                }
            }
        });
Ejemplo n.º 5
0
// R entry point: converts raw approx values into the requested prediction type.
//
// approxParam      - real vector carrying a dim attribute; values are read
//                    sequentially, `columnCount` values per row.
// typeParam        - prediction type name parsed into EPredictionType.
// columnCountParam - number of prediction dimensions; must be positive.
// threadCountParam - total thread count; threadCount - 1 additional threads are started.
//
// Returns a freshly allocated real vector holding the converted predictions in
// the same row-by-row layout. Errors raised inside the R_API_BEGIN/R_API_END
// region are handled by those macros.
SEXP CatBoostPrepareEval_R(SEXP approxParam, SEXP typeParam, SEXP columnCountParam, SEXP threadCountParam) {
    SEXP result = NULL;
    R_API_BEGIN();
    // The column count is both a divisor and a container size below, so a zero,
    // negative or NA value must be rejected before it is used.
    const int columnCount = asInteger(columnCountParam);
    CB_ENSURE(columnCount > 0, "column count should be a positive integer");

    SEXP dataDim = getAttrib(approxParam, R_DimSymbol);
    const size_t dataRows = static_cast<size_t>(INTEGER(dataDim)[0]) / static_cast<size_t>(columnCount);
    TVector<TVector<double>> prediction(columnCount, TVector<double>(dataRows));
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            prediction[j][i] = static_cast<double>(REAL(approxParam)[k++]);
        }
    }

    NPar::TLocalExecutor executor;
    executor.RunAdditionalThreads(asInteger(threadCountParam) - 1);
    EPredictionType predictionType;
    CB_ENSURE(TryFromString<EPredictionType>(CHAR(asChar(typeParam)), predictionType),
              "unsupported prediction type: 'Probability', 'Class' or 'RawFormulaVal' was expected");
    prediction = PrepareEval(predictionType, prediction, &executor);

    // PrepareEval may return a different number of dimensions, so the output
    // size is derived from the converted prediction.
    size_t predictionSize = prediction.size() * dataRows;
    result = PROTECT(allocVector(REALSXP, predictionSize));
    for (size_t i = 0, k = 0; i < dataRows; ++i) {
        for (size_t j = 0; j < prediction.size(); ++j) {
            REAL(result)[k++] = prediction[j][i];
        }
    }
    R_API_END();
    UNPROTECT(1);
    return result;
}
Ejemplo n.º 6
0
// R entry point: applies a model to a pool and returns multi-dimensional predictions.
//
// modelParam / poolParam - external pointers to the model and pool handles.
// verboseParam           - logical flag forwarded to ApplyModelMulti.
// typeParam              - prediction type name parsed into EPredictionType.
// treeCountStartParam / treeCountEndParam / threadCountParam - integers
//                          forwarded to ApplyModelMulti.
//
// Returns a real vector of size dimensions * documents, written document by
// document with all dimensions of a document adjacent.
SEXP CatBoostPredictMulti_R(SEXP modelParam, SEXP poolParam, SEXP verboseParam,
                            SEXP typeParam, SEXP treeCountStartParam, SEXP treeCountEndParam, SEXP threadCountParam) {
    SEXP result = NULL;
    R_API_BEGIN();
    TFullModelHandle model = reinterpret_cast<TFullModelHandle>(R_ExternalPtrAddr(modelParam));
    TPoolHandle pool = reinterpret_cast<TPoolHandle>(R_ExternalPtrAddr(poolParam));

    EPredictionType predictionType;
    const bool typeParsed = TryFromString<EPredictionType>(CHAR(asChar(typeParam)), predictionType);
    CB_ENSURE(typeParsed,
              "unsupported prediction type: 'Probability', 'Class' or 'RawFormulaVal' was expected");

    TVector<TVector<double>> prediction = ApplyModelMulti(*model,
                                                          *pool,
                                                          asLogical(verboseParam),
                                                          predictionType,
                                                          asInteger(treeCountStartParam),
                                                          asInteger(treeCountEndParam),
                                                          asInteger(threadCountParam));

    const size_t dimensionCount = prediction.size();
    const size_t docCount = pool->Docs.GetDocCount();
    result = PROTECT(allocVector(REALSXP, dimensionCount * docCount));

    // Flatten document-major: for each document emit every dimension in turn.
    size_t outIdx = 0;
    for (size_t docIdx = 0; docIdx < docCount; ++docIdx) {
        for (size_t dim = 0; dim < dimensionCount; ++dim) {
            REAL(result)[outIdx] = prediction[dim][docIdx];
            ++outIdx;
        }
    }
    R_API_END();
    UNPROTECT(1);
    return result;
}
Ejemplo n.º 7
0
// Returns a copy of `oldFeaturePath` with the element at `eraseElementIdx`
// removed and the weights of the remaining elements recomputed.
//
// The result has one element fewer than the input. Feature and path fractions
// of the elements after the erased one are shifted down by one position; the
// Weight fields start from the first pathLength - 1 input elements and are then
// rewritten from the last position towards the first.
//
// Throws (via CB_ENSURE) if the input path is empty.
// NOTE(review): eraseElementIdx is not range-checked here - callers are
// presumably expected to pass an index inside the path.
static TVector<TFeaturePathElement> UnwindFeaturePath(const TVector<TFeaturePathElement>& oldFeaturePath, size_t eraseElementIdx) {
    const size_t pathLength = oldFeaturePath.size();
    CB_ENSURE(pathLength > 0, "Path to unwind must have at least one element");

    const size_t unwoundLength = pathLength - 1;
    TVector<TFeaturePathElement> unwound(oldFeaturePath.begin(), oldFeaturePath.begin() + unwoundLength);

    // Shift everything but the weights one slot down, starting at the erased position.
    for (size_t dst = eraseElementIdx; dst < unwoundLength; ++dst) {
        const TFeaturePathElement& src = oldFeaturePath[dst + 1];
        unwound[dst].Feature = src.Feature;
        unwound[dst].ZeroPathsFraction = src.ZeroPathsFraction;
        unwound[dst].OnePathsFraction = src.OnePathsFraction;
    }

    const TFeaturePathElement& erased = oldFeaturePath[eraseElementIdx];
    const double onePathsFraction = erased.OnePathsFraction;
    const double zeroPathsFraction = erased.ZeroPathsFraction;
    const int lastIdx = static_cast<int>(pathLength) - 2;

    if (FuzzyEquals(onePathsFraction, 0.0)) {
        // With a (fuzzy) zero one-fraction only the zero-fraction contributes,
        // so each weight is rescaled independently.
        for (int idx = lastIdx; idx >= 0; --idx) {
            unwound[idx].Weight *= pathLength / (zeroPathsFraction * (pathLength - idx - 1));
        }
        return unwound;
    }

    // General case: walk backwards, carrying the remainder of the previous step.
    double carriedWeight = oldFeaturePath[pathLength - 1].Weight;
    for (int idx = lastIdx; idx >= 0; --idx) {
        const double previousWeight = unwound[idx].Weight;
        unwound[idx].Weight = carriedWeight * pathLength / (onePathsFraction * (idx + 1));
        carriedWeight = previousWeight - unwound[idx].Weight * zeroPathsFraction * (pathLength - idx - 1) / pathLength;
    }

    return unwound;
}
Ejemplo n.º 8
0
// Computes SHAP values for the documents [start, end) of `pool`.
//
// model         - model whose oblivious trees are explained; must not use complex ctrs.
// pool          - source of the documents.
// start, end    - half-open range of document indices to process.
// localExecutor - executor used to process the documents of the block in parallel.
// dimension     - approx dimension forwarded to the per-tree helpers.
//
// Returns one row per document with (flat feature count + 1) entries; the last
// entry accumulates the per-tree mean values.
//
// Throws (via CB_ENSURE) if the model uses complex ctr features.
static TVector<TVector<double>> CalcShapValuesForDocumentBlock(const TFullModel& model,
                                                               const TPool& pool,
                                                               size_t start,
                                                               size_t end,
                                                               NPar::TLocalExecutor& localExecutor,
                                                               int dimension) {
    CB_ENSURE(!HasComplexCtrs(model.ObliviousTrees), "Model uses complex Ctr features. This is not allowed for SHAP values calculation");

    const TObliviousTrees& forest = model.ObliviousTrees;
    const size_t documentCount = end - start;

    TVector<ui8> allBinarizedFeatures = BinarizeFeatures(model, pool, start, end);
    TVector<TVector<ui8>> binarizedFeaturesByDocument = TransposeBinarizedFeatures(allBinarizedFeatures, documentCount);
    // The flat buffer is no longer needed once the per-document view exists.
    allBinarizedFeatures.clear();

    const int flatFeatureCount = pool.Docs.GetFactorsCount();
    TVector<int> binFeaturesMapping = MapFeatures(forest);
    TVector<TVector<double>> shapValues(documentCount, TVector<double>(flatFeatureCount + 1, 0.0));

    // Subtree sizes and the mean value are computed from the forest and the tree
    // index only, so they are prepared once per tree instead of once per
    // (document, tree) pair.
    const size_t treeCount = forest.GetTreeCount();
    TVector<TVector<TVector<size_t>>> subtreeSizesByTree(treeCount);
    TVector<double> meanValueByTree(treeCount);
    for (size_t treeIdx = 0; treeIdx < treeCount; ++treeIdx) {
        subtreeSizesByTree[treeIdx] = CalcSubtreeSizesForTree(forest, treeIdx);
        meanValueByTree[treeIdx] = CalcMeanValueForTree(forest, subtreeSizesByTree[treeIdx], treeIdx, dimension);
    }

    NPar::TLocalExecutor::TExecRangeParams blockParams(0, documentCount);
    localExecutor.ExecRange([&] (int documentIdx) {
        // Each task writes only to its own row of shapValues; the per-tree data is read-only.
        for (size_t treeIdx = 0; treeIdx < treeCount; ++treeIdx) {
            const TVector<TVector<size_t>>& subtreeSizes = subtreeSizesByTree[treeIdx];
            TVector<TFeaturePathElement> initialFeaturePath;
            CalcShapValuesRecursive(forest, binFeaturesMapping, binarizedFeaturesByDocument[documentIdx], treeIdx, /*depth*/ 0, subtreeSizes, dimension,
                                    /*nodeIdx*/ 0, initialFeaturePath, /*zeroPathFraction*/ 1, /*onePathFraction*/ 1, /*feature*/ -1,
                                    &shapValues[documentIdx]);

            // Added in tree order, exactly as before, to keep the summation order stable.
            shapValues[documentIdx][flatFeatureCount] += meanValueByTree[treeIdx];
        }
    }, blockParams, NPar::TLocalExecutor::WAIT_COMPLETE);

    return shapValues;
}
Ejemplo n.º 9
0
    void TCatboostOptions::Validate() const {
        SystemOptions.Get().Validate();
        BoostingOptions.Get().Validate();
        ObliviousTreeOptions.Get().Validate();

        ELossFunction lossFunction = LossFunctionDescription->GetLossFunction();
        {
            const ui32 classesCount = DataProcessingOptions->ClassesCount;
            if (classesCount != 0 ) {
                CB_ENSURE(IsMultiClassError(lossFunction), "classes_count parameter takes effect only with MultiClass/MultiClassOneVsAll loss functions");
                CB_ENSURE(classesCount > 1, "classes-count should be at least 2");
            }
            const auto& classWeights = DataProcessingOptions->ClassWeights.Get();
            if (!classWeights.empty()) {
                CB_ENSURE(lossFunction == ELossFunction::Logloss || IsMultiClassError(lossFunction),
                          "class weights takes effect only with Logloss, MultiClass and MultiClassOneVsAll loss functions");
                CB_ENSURE(IsMultiClassError(lossFunction) || (classWeights.size() == 2),
                          "if loss-function is Logloss, then class weights should be given for 0 and 1 classes");
                CB_ENSURE(classesCount == 0 || classesCount == classWeights.size(), "class weights should be specified for each class in range 0, ... , classes_count - 1");
            }
        }

        ELeavesEstimation leavesEstimation = ObliviousTreeOptions->LeavesEstimationMethod;
        if (lossFunction == ELossFunction::Quantile ||
            lossFunction == ELossFunction::MAE ||
            lossFunction == ELossFunction::LogLinQuantile ||
            lossFunction == ELossFunction::MAPE)
        {
            CB_ENSURE(leavesEstimation != ELeavesEstimation::Newton,
                      "Newton leave estimation method is not supported for " << lossFunction << " loss function");
            CB_ENSURE(ObliviousTreeOptions->LeavesEstimationIterations == 1U,
                      "gradient_iterations should equals 1 for this mode");
        }

        if (GetTaskType() == ETaskType::CPU) {
            CB_ENSURE(!(IsQuerywiseError(lossFunction) && leavesEstimation == ELeavesEstimation::Newton),
                      "This leaf estimation method is not supported for querywise error for CPU learning");

            CB_ENSURE(!(IsPairwiseError(lossFunction) && leavesEstimation == ELeavesEstimation::Newton),
                      "This leaf estimation method is not supported for pairwise error");
        }


        ValidateCtrs(CatFeatureParams->SimpleCtrs, lossFunction, false);
        for (const auto& perFeatureCtr : CatFeatureParams->PerFeatureCtrs.Get()) {
            ValidateCtrs(perFeatureCtr.second, lossFunction, false);
        }
        ValidateCtrs(CatFeatureParams->CombinationCtrs, lossFunction, true);
    }
Ejemplo n.º 10
0
// Resolves the paths of the training output files from `params`.
//
// When writing files is not allowed nothing is assigned (the path members
// checked below are asserted to be empty). Otherwise the train directory is
// created if it is named and missing, `namesPrefix` is stored, and each file
// path is built with AlignFilePath. The json log and profile log are aligned
// with an empty prefix; all other files use the stored prefix.
//
// Throws (via CB_ENSURE) if the time_left, learn_error, meta, json_log or
// profile_log filename is empty.
void TOutputFiles::InitializeFiles(const NCatboostOptions::TOutputFilesOptions& params, const TString& namesPrefix) {
    const bool mayWriteFiles = params.AllowWriteFiles();
    if (!mayWriteFiles) {
        Y_ASSERT(TimeLeftLogFile.empty());
        Y_ASSERT(LearnErrorLogFile.empty());
        Y_ASSERT(TestErrorLogFile.empty());
        Y_ASSERT(MetaFile.empty());
        Y_ASSERT(SnapshotFile.empty());
        return;
    }

    const auto& trainDir = params.GetTrainDir();
    TFsPath trainDirPath(trainDir);
    const bool trainDirMissing = !trainDir.empty() && !trainDirPath.Exists();
    if (trainDirMissing) {
        trainDirPath.MkDir();
    }
    NamesPrefix = namesPrefix;

    // Mandatory, prefixed files.
    const auto& timeLeftFilename = params.GetTimeLeftLogFilename();
    CB_ENSURE(!timeLeftFilename.empty(), "empty time_left filename");
    TimeLeftLogFile = TOutputFiles::AlignFilePath(trainDir, timeLeftFilename, NamesPrefix);

    const auto& learnErrorFilename = params.GetLearnErrorFilename();
    CB_ENSURE(!learnErrorFilename.empty(), "empty learn_error filename");
    LearnErrorLogFile = TOutputFiles::AlignFilePath(trainDir, learnErrorFilename, NamesPrefix);

    // Optional files: only set when requested.
    const auto& testErrorFilename = params.GetTestErrorFilename();
    if (testErrorFilename) {
        TestErrorLogFile = TOutputFiles::AlignFilePath(trainDir, testErrorFilename, NamesPrefix);
    }
    if (params.SaveSnapshot()) {
        SnapshotFile = TOutputFiles::AlignFilePath(trainDir, params.GetSnapshotFilename(), NamesPrefix);
    }

    const TString& metaFilename = params.GetMetaFileFilename();
    CB_ENSURE(!metaFilename.empty(), "empty meta filename");
    MetaFile = TOutputFiles::AlignFilePath(trainDir, metaFilename, NamesPrefix);

    // Mandatory files that are never prefixed.
    const TString& jsonLogName = params.GetJsonLogFilename();
    CB_ENSURE(!jsonLogName.empty(), "empty json_log filename");
    JsonLogFile = TOutputFiles::AlignFilePath(trainDir, jsonLogName, "");

    const TString& profileLogName = params.GetProfileLogFilename();
    CB_ENSURE(!profileLogName.empty(), "empty profile_log filename");
    ProfileLogFile = TOutputFiles::AlignFilePath(trainDir, profileLogName, "");
}
Ejemplo n.º 11
0
// Converts a flat feature-major buffer (all documents of feature 0, then all
// documents of feature 1, ...) into one vector of feature values per document.
//
// The feature count is allBinarizedFeatures.size() / documentCount (integer
// division); any trailing values beyond that are not read.
//
// Throws (via CB_ENSURE) if documentCount is zero.
static TVector<TVector<ui8>> TransposeBinarizedFeatures(const TVector<ui8>& allBinarizedFeatures, size_t documentCount) {
    CB_ENSURE(documentCount > 0, "Document block must be non-empty.");
    const size_t featuresCount = allBinarizedFeatures.size() / documentCount;

    TVector<TVector<ui8>> byDocument(documentCount, TVector<ui8>(featuresCount));

    // Walk the flat buffer sequentially; flatIdx == featureIdx * documentCount + documentIdx.
    size_t flatIdx = 0;
    for (size_t featureIdx = 0; featureIdx < featuresCount; ++featureIdx) {
        for (size_t documentIdx = 0; documentIdx < documentCount; ++documentIdx, ++flatIdx) {
            byDocument[documentIdx][featureIdx] = allBinarizedFeatures[flatIdx];
        }
    }

    return byDocument;
}
Ejemplo n.º 12
0
// Dispatches feature importance calculation by the type named in `type`.
//
// model       - model to analyse.
// pool        - pool to analyse against; must contain at least one document.
// type        - name of an EFstrType value; FeatureImportance, Interaction and
//               Doc are handled here.
// threadCount - forwarded to the calculations that accept it.
//
// Throws if the pool is empty, if `type` cannot be parsed, or if it parses to a
// value that is not handled here.
TVector<TVector<double>> GetFeatureImportances(const TFullModel& model, const TPool& pool, const TString& type, int threadCount){
    CB_ENSURE(pool.Docs.GetDocCount() != 0, "Pool should not be empty");
    const EFstrType fstrType = FromString<EFstrType>(type);
    switch (fstrType) {
        case EFstrType::FeatureImportance:
            return CalcFstr(model, pool, threadCount);
        case EFstrType::Interaction:
            return CalcInteraction(model, pool);
        case EFstrType::Doc:
            return CalcFeatureImportancesForDocuments(model, pool, threadCount);
        default:
            // `type` comes from the caller, so an enum value without a case above
            // must produce an error rather than reach an "unreachable" marker.
            ythrow TCatboostException() << "Unsupported feature importance type: " << type;
    }
}
Ejemplo n.º 13
0
bool TLearnContext::TryLoadProgress() {
    if (!OutputOptions.SaveSnapshot() || !NFs::Exists(Files.SnapshotFile)) {
        return false;
    }
    try {
        TProgressHelper(ToString(ETaskType::CPU)).CheckedLoad(Files.SnapshotFile, [&](TIFStream* in)
        {
            TLearnProgress LearnProgressRestored = LearnProgress; // use progress copy to avoid partial deserialization of corrupted progress file
            TProfileInfoData ProfileRestored;
            ::LoadMany(in, Rand, LearnProgressRestored, ProfileRestored); // fail here does nothing with real LearnProgress
            CB_ENSURE(IsParamsCompatible(LearnProgressRestored.SerializedTrainParams, LearnProgress.SerializedTrainParams), "Saved model's Params are different from current model's params");
            CB_ENSURE(LearnProgressRestored.PoolCheckSum == LearnProgress.PoolCheckSum, "Current pool differs from the original pool");
            LearnProgress = std::move(LearnProgressRestored);
            Profile.InitProfileInfo(std::move(ProfileRestored));
            LearnProgress.SerializedTrainParams = ToString(Params); // substitute real
            MATRIXNET_INFO_LOG << "Loaded progress file containing " <<  LearnProgress.TreeStruct.size() << " trees" << Endl;
        });
        return true;
    } catch (...) {
        MATRIXNET_WARNING_LOG << "Can't load progress from file: " << Files.SnapshotFile << " exception: " << CurrentExceptionMessage() << Endl;
        return false;
    }
}
Ejemplo n.º 14
0
 // Trainer entry point that forwards to NCatboostCuda::TrainModel.
 //
 // The custom objective, custom metric and allowClearPool arguments are not
 // used by this implementation. Exactly one test pool is required; after
 // training, the raw values container of the matching eval result is resized
 // to the model's approx dimension.
 //
 // Throws (via CB_ENSURE) unless exactly one test pool is given.
 void TrainModel(
     const NJson::TJsonValue& params,
     const NCatboostOptions::TOutputFilesOptions& outputOptions,
     const TMaybe<TCustomObjectiveDescriptor>& objectiveDescriptor,
     const TMaybe<TCustomMetricDescriptor>& evalMetricDescriptor,
     TPool& learnPool,
     bool allowClearPool,
     const TVector<const TPool*>& testPoolPtrs,
     TFullModel* model,
     const TVector<TEvalResult*>& evalResultPtrs) const override {
     Y_UNUSED(objectiveDescriptor);
     Y_UNUSED(evalMetricDescriptor);
     Y_UNUSED(allowClearPool);

     const size_t testPoolCount = testPoolPtrs.size();
     CB_ENSURE(testPoolCount == 1, "Multiple eval sets not supported for GPU");
     Y_VERIFY(evalResultPtrs.size() == testPoolCount);

     const TPool& testPool = *testPoolPtrs[0];
     NCatboostCuda::TrainModel(params, outputOptions, learnPool, testPool, model);

     TEvalResult* const evalResult = evalResultPtrs[0];
     evalResult->GetRawValuesRef().resize(model->ObliviousTrees.ApproxDimension);
 }
Ejemplo n.º 15
0
// Returns one effect score per flat pool feature.
//
// The feature effects computed by CalcFeatureEffect are converted to regular
// feature effects and then written into a vector indexed by the flat feature
// index obtained from the pool's feature layout. Entries that receive no score
// keep the value the vector was created with.
//
// Throws (via CB_ENSURE) if the pool has fewer features than the model expects.
TVector<double> CalcRegularFeatureEffect(const TFullModel& model, const TPool& pool, int threadCount/*= 1*/) {
    const int featureCount = pool.Docs.GetFactorsCount();
    const size_t expectedFeatureCount = model.ObliviousTrees.GetFlatFeatureVectorExpectedSize();
    CB_ENSURE(static_cast<size_t>(featureCount) >= expectedFeatureCount, "Insufficient features count in pool");

    const int catFeaturesCount = pool.CatFeatures.ysize();
    const int floatFeaturesCount = featureCount - catFeaturesCount;
    TFeaturesLayout layout(featureCount, pool.CatFeatures, pool.FeatureId);

    TVector<TFeatureEffect> regularEffect = CalcRegularFeatureEffect(CalcFeatureEffect(model, pool, threadCount),
                                                                     catFeaturesCount, floatFeaturesCount);

    TVector<double> scoreByFlatIndex(featureCount);
    for (size_t i = 0; i < regularEffect.size(); ++i) {
        const TFeatureEffect& item = regularEffect[i];
        // Map (index within its type, type) to the position in the flat feature vector.
        const int flatIdx = layout.GetFeature(item.Feature.Index, item.Feature.Type);
        Y_ASSERT(flatIdx < featureCount);
        scoreByFlatIndex[flatIdx] = item.Score;
    }

    return scoreByFlatIndex;
}
Ejemplo n.º 16
0
// Parses a column description file into a vector of columns.
//
// Each non-empty line holds two or three tab-separated fields: column index,
// column type and an optional id. "QueryId" is read as "GroupId" and "Target"
// as "Label". With defaults.UseDefaultType the result starts with
// defaults.ColumnCount columns of the default type and indices must stay below
// that count; otherwise the result grows to fit the largest index seen and
// CheckAllFeaturesPresent is run on it at the end.
//
// Parsing stops at the first line for which Split throws (logged at debug level).
//
// Throws (via CB_ENSURE) if the file is missing, a line has a wrong field
// count, an index is invalid or repeated, or a type is unknown.
TVector<TColumn> ReadCD(const TString& fileName, const TCdParserDefaults& defaults) {
    CB_ENSURE(NFs::Exists(TString(fileName)), "column description file is not found");

    const bool useDefaults = defaults.UseDefaultType;
    int columnsCount = useDefaults ? defaults.ColumnCount : 0;

    TVector<TColumn> columns(columnsCount, TColumn{defaults.DefaultColumnType, TString()});
    TSet<int> parsedColumns;

    TIFStream reader(fileName.c_str());
    TString line;
    while (reader.ReadLine(line)) {
        TVector<TString> tokens;
        try {
            Split(line, "\t", tokens);
        } catch (const yexception& e) {
            MATRIXNET_DEBUG_LOG << "Got exception " << e.what() << " while parsing feature descriptions line " << line << Endl;
            break;
        }
        if (tokens.empty()) {
            continue;
        }

        const int tokenCount = tokens.ysize();
        CB_ENSURE(tokenCount == 2 || tokenCount == 3, "Each line should have two or three columns. " << line);

        const int index = FromString<int>(tokens[0]);
        // The upper bound only applies when the column count is fixed by the defaults.
        const bool indexInRange = index >= 0 && (!useDefaults || index < columnsCount);
        CB_ENSURE(indexInRange, "Invalid column index " << index);

        const bool firstOccurrence = parsedColumns.insert(index).second;
        CB_ENSURE(firstOccurrence, "column specified twice in cd file: " << index);

        if (index >= columns.ysize()) {
            columns.resize(index + 1);
        }
        TColumn& column = columns[index];

        // Accept the alternative type names.
        TStringBuf typeName = tokens[1];
        if (typeName == "QueryId") {
            typeName = "GroupId";
        } else if (typeName == "Target") {
            typeName = "Label";
        }
        CB_ENSURE(TryFromString<EColumn>(typeName, column.Type), "unsupported column type " << typeName);

        if (tokenCount == 3) {
            column.Id = tokens[2];
        }
    }

    if (!useDefaults) {
        CheckAllFeaturesPresent(columns, parsedColumns);
    }

    return columns;
}
// Computes the importances of train documents for the documents of a test pool.
//
// topSize                 - number of top documents to keep; -1 means the
//                           train pool's document count.
// dstrTypeStr, updateMethodStr, importanceValuesSignStr
//                         - textual options parsed into their enum / struct forms.
// threadCount             - forwarded to the evaluator.
//
// Throws (via CB_ENSURE) if topSize is negative and not -1; the option parsers
// may throw as well.
TDStrResult GetDocumentImportances(
    const TFullModel& model,
    const TPool& trainPool,
    const TPool& testPool,
    const TString& dstrTypeStr,
    int topSize,
    const TString& updateMethodStr,
    const TString& importanceValuesSignStr,
    int threadCount
) {
    const bool unlimitedTop = (topSize == -1);
    if (!unlimitedTop) {
        CB_ENSURE(topSize >= 0, "Top size should be nonnegative integer or -1 (for unlimited top size).");
    } else {
        topSize = trainPool.Docs.GetDocCount();
    }

    // Parse the textual options in a fixed order: update method, type, sign.
    TUpdateMethod updateMethod = ParseUpdateMethod(updateMethodStr);
    const EDocumentStrengthType strengthType = FromString<EDocumentStrengthType>(dstrTypeStr);
    const EImportanceValuesSign valuesSign = FromString<EImportanceValuesSign>(importanceValuesSignStr);

    TDocumentImportancesEvaluator evaluator(model, trainPool, updateMethod, threadCount);
    const TVector<TVector<double>> rawImportances = evaluator.GetDocumentImportances(testPool);
    return GetFinalDocumentImportances(rawImportances, strengthType, topSize, valuesSign);
}
Ejemplo n.º 18
0
void PrepareAllFeaturesLearn(const THashSet<int>& categFeatures,
                             const TVector<TFloatFeature>& floatFeatures,
                             const TVector<int>& ignoredFeatures,
                             bool ignoreRedundantCatFeatures,
                             size_t oneHotMaxSize,
                             ENanMode nanMode,
                             bool clearPool,
                             NPar::TLocalExecutor& localExecutor,
                             const TVector<size_t>& selectedDocIndices,
                             TDocumentStorage* learnDocStorage,
                             TAllFeatures* learnFeatures) {
    if (learnDocStorage->GetDocCount() == 0) {
        return;
    }

    TBinarizer binarizer(learnDocStorage->GetFactorsCount(), categFeatures, floatFeatures, nanMode, localExecutor);
    binarizer.SetupToIgnoreFeatures(ignoredFeatures, ignoreRedundantCatFeatures);
    PrepareSlots(binarizer.GetCatFeatureCount(), binarizer.GetFloatFeatureCount(), learnFeatures);
    binarizer.Binarize(/*forLearn=*/true, learnDocStorage, selectedDocIndices, clearPool, learnFeatures);
    CleanupOneHotFeatures(oneHotMaxSize, learnFeatures);
    CB_ENSURE(learnFeatures->GetDocCount() > 0, "Train dataset is empty after binarization");
    DumpMemUsage("Extract bools done");
}
Ejemplo n.º 19
0
// Restores the learn progress from stream `s`.
//
// The stream starts with a fold count that must equal the number of folds
// already present in `Folds` (checked with CB_ENSURE); then approxes are loaded
// for every fold and for `AveragingFold`, followed by the remaining progress
// fields in the fixed order listed in the LoadMany call below.
void TLearnProgress::Load(IInputStream* s) {
    ui64 storedFoldCount = 0;
    ::Load(s, storedFoldCount);
    CB_ENSURE(storedFoldCount == Folds.size(), "Cannot load progress from file");

    // The check above guarantees Folds holds exactly storedFoldCount entries.
    for (auto& fold : Folds) {
        fold.LoadApproxes(s);
    }
    AveragingFold.LoadApproxes(s);

    ::LoadMany(
        s,
        AvrgApprox,
        TestApprox,
        BestTestApprox,
        CatFeatures,
        FloatFeatures,
        ApproxDimension,
        SerializedTrainParams,
        TreeStruct,
        TreeStats,
        LeafValues,
        LearnErrorsHistory,
        TestErrorsHistory,
        TimeHistory,
        UsedCtrSplits,
        PoolCheckSum
    );
}
Ejemplo n.º 20
0
 // Builds the default counter-style CTR description for the current task type.
 //
 // CPU: a Counter CTR with its default priors.
 // GPU: a FeatureFreq CTR with its default priors and a 15-border binarization
 //      whose border selection depends on `projectionType`
 //      (TreeCtr -> Median, SimpleCtr -> MinEntropy).
 // Any other task type fails the CB_ENSURE; any other projection type throws
 // TCatboostException.
 TCtrDescription TCatboostOptions::CreateDefaultCounter(EProjectionType projectionType) const {
     if (GetTaskType() == ETaskType::CPU) {
         return TCtrDescription(ECtrType::Counter, GetDefaultPriors(ECtrType::Counter));
     }

     CB_ENSURE(GetTaskType() == ETaskType::GPU);

     EBorderSelectionType gpuBorderSelection;
     if (projectionType == EProjectionType::TreeCtr) {
         gpuBorderSelection = EBorderSelectionType::Median;
     } else if (projectionType == EProjectionType::SimpleCtr) {
         gpuBorderSelection = EBorderSelectionType::MinEntropy;
     } else {
         ythrow TCatboostException() << "Unknown projection type " << projectionType;
     }

     const TBinarizationOptions ctrBinarization(gpuBorderSelection, 15);
     return TCtrDescription(ECtrType::FeatureFreq,
                            GetDefaultPriors(ECtrType::FeatureFreq),
                            ctrBinarization);
 }
Ejemplo n.º 21
0
    // Finalizes the builder and fills DataProvider with the processed dataset.
    //
    // Steps, in order:
    //   1. compute the document order (identity, by timestamp, or shuffled);
    //   2. validate/attach pairs and apply the order to the meta columns;
    //   3. in parallel, reorder each feature column and turn it into a compressed
    //      categorical column or a binarized float column;
    //   4. sequentially register float borders in FeaturesManager and move the
    //      built columns into DataProvider.Features;
    //   5. build index remaps, target borders (train only), copy feature names and
    //      categorical feature ids, and apply class weights if any were given.
    //
    // May only be called once: a second call fails the CB_ENSURE on IsDone.
    void TDataProviderBuilder::Finish() {
        CB_ENSURE(!IsDone, "Error: can't finish more than once");
        DataProvider.Features.reserve(FeatureValues.size());

        // Start from the identity permutation over all documents.
        DataProvider.Order.resize(DataProvider.Targets.size());
        std::iota(DataProvider.Order.begin(),
                  DataProvider.Order.end(), 0);

        // When timestamps are present (presumably: not all equal to 0 — confirm
        // AreEqualTo semantics), ordering by timestamp replaces shuffling.
        if (!AreEqualTo<ui64>(DataProvider.Timestamp, 0)) {
            ShuffleFlag = false;
            DataProvider.Order = CreateOrderByKey(DataProvider.Timestamp);
        }

        // Drop the query id column entirely when HasQueryIds reports none.
        bool hasQueryIds = HasQueryIds(DataProvider.QueryIds);
        if (!hasQueryIds) {
            DataProvider.QueryIds.resize(0);
        }

        //TODO(noxoomo): it's not safe here, if we change order with shuffle everything'll go wrong
        if (Pairs.size()) {
            //they are local, so we don't need shuffle
            CB_ENSURE(hasQueryIds, "Error: for GPU pairwise learning you should provide query id column. Query ids will be used to split data between devices and for dynamic boosting learning scheme.");
            DataProvider.FillQueryPairs(Pairs);
        }

        if (ShuffleFlag) {
            if (hasQueryIds) {
                //should not change order inside query for pairs consistency
                QueryConsistentShuffle(Seed, 1, DataProvider.QueryIds, &DataProvider.Order);
            } else {
                Shuffle(Seed, 1, DataProvider.Targets.size(), &DataProvider.Order);
            }
            DataProvider.SetShuffleSeed(Seed);
        }

        // The meta columns are reordered whenever the order may differ from identity.
        if (ShuffleFlag || !DataProvider.Timestamp.empty()) {
            DataProvider.ApplyOrderToMetaColumns();
        }

        TVector<TString> featureNames;
        featureNames.resize(FeatureValues.size());

        // Guards the GetOrCreateNanMode call made from the parallel loop below.
        TAdaptiveLock lock;

        NPar::TLocalExecutor executor;
        executor.RunAdditionalThreads(BuildThreads - 1);

        // One slot per feature; a slot stays null when no column is produced for it.
        TVector<TFeatureColumnPtr> featureColumns(FeatureValues.size());

        if (!IsTest) {
            RegisterFeaturesInFeatureManager(featureColumns);
        }

        // Per-feature float borders collected in the parallel loop and handed to
        // FeaturesManager in the sequential pass afterwards.
        TVector<TVector<float>> grid;
        grid.resize(FeatureValues.size());

        // Each iteration writes only to its own index of featureNames,
        // featureColumns, grid and FeatureValues.
        // NOTE(review): the CatFeaturesPerfectHashHelper calls and the
        // FeaturesManager.HasFloatFeatureBorders / GetFloatFeatureBorders /
        // GetFloatFeatureBinarization calls run outside `lock` — confirm they are
        // safe to call concurrently.
        NPar::ParallelFor(executor, 0, FeatureValues.size(), [&](ui32 featureId) {
            auto featureName = GetFeatureName(featureId);
            featureNames[featureId] = featureName;

            // No values were provided for this feature: leave its column null.
            if (FeatureValues[featureId].size() == 0) {
                return;
            }

            // Materialize the column in the final document order.
            TVector<float> line(DataProvider.Order.size());
            for (ui32 i = 0; i < DataProvider.Order.size(); ++i) {
                line[i] = FeatureValues[featureId][DataProvider.Order[i]];
            }

            if (CatFeatureIds.has(featureId)) {
                static_assert(sizeof(float) == sizeof(ui32), "Error: float size should be equal to ui32 size");
                // On test data, skip categorical features that have no known values yet.
                const bool shouldSkip = IsTest && (CatFeaturesPerfectHashHelper.GetUniqueValues(featureId) == 0);
                if (!shouldSkip) {
                    auto data = CatFeaturesPerfectHashHelper.UpdatePerfectHashAndBinarize(featureId,
                                                                                          ~line,
                                                                                          line.size());

                    const ui32 uniqueValues = CatFeaturesPerfectHashHelper.GetUniqueValues(featureId);

                    // A column is built only when there is more than one unique value.
                    if (uniqueValues > 1) {
                        auto compressedData = CompressVector<ui64>(~data, line.size(), IntLog2(uniqueValues));
                        featureColumns[featureId] = MakeHolder<TCatFeatureValuesHolder>(featureId,
                                                                                        line.size(),
                                                                                        std::move(compressedData),
                                                                                        uniqueValues,
                                                                                        featureName);
                    }
                }
            } else {
                auto floatFeature = MakeHolder<TFloatValuesHolder>(featureId,
                                                                   std::move(line),
                                                                   featureName);

                TVector<float>& borders = grid[featureId];

                ENanMode nanMode = ENanMode::Forbidden;
                {
                    TGuard<TAdaptiveLock> guard(lock);
                    nanMode = FeaturesManager.GetOrCreateNanMode(*floatFeature);
                }

                // Reuse borders already known to the manager, if any.
                if (FeaturesManager.HasFloatFeatureBorders(*floatFeature)) {
                    borders = FeaturesManager.GetFloatFeatureBorders(*floatFeature);
                }

                // New borders are only ever built from train data.
                if (borders.empty() && !IsTest) {
                    const auto& floatValues = floatFeature->GetValues();
                    NCatboostOptions::TBinarizationOptions config = FeaturesManager.GetFloatFeatureBinarization();
                    config.NanMode = nanMode;
                    borders = BuildBorders(floatValues, floatFeature->GetId(), config);
                }
                // NOTE(review): this early return (like the one for empty feature
                // values above) skips the "Free memory" block at the end of the lambda.
                if (borders.ysize() == 0) {
                    MATRIXNET_DEBUG_LOG << "Float Feature #" << featureId << " is empty" << Endl;
                    return;
                }

                auto binarizedData = BinarizeLine(floatFeature->GetValues().data(),
                                                  floatFeature->GetValues().size(),
                                                  nanMode,
                                                  borders);

                // borders.size() + 1 bins, plus one more when nanMode is not Forbidden.
                const int binCount = static_cast<const int>(borders.size() + 1 + (ENanMode::Forbidden != nanMode));
                auto compressedLine = CompressVector<ui64>(binarizedData, IntLog2(binCount));

                featureColumns[featureId] = MakeHolder<TBinarizedFloatValuesHolder>(featureId,
                                                                                    floatFeature->GetValues().size(),
                                                                                    nanMode,
                                                                                    borders,
                                                                                    std::move(compressedLine),
                                                                                    featureName);
            }

            //Free memory
            {
                auto emptyVec = TVector<float>();
                FeatureValues[featureId].swap(emptyVec);
            }
        });

        // Sequential pass: publish float borders and collect the non-null columns,
        // preserving feature id order.
        for (ui32 featureId = 0; featureId < featureColumns.size(); ++featureId) {
            if (CatFeatureIds.has(featureId)) {
                if (featureColumns[featureId] == nullptr && (!IsTest)) {
                    MATRIXNET_DEBUG_LOG << "Cat Feature #" << featureId << " is empty" << Endl;
                }
            } else if (featureColumns[featureId] != nullptr) {
                if (!FeaturesManager.HasFloatFeatureBordersForDataProviderFeature(featureId)) {
                    FeaturesManager.SetFloatFeatureBordersForDataProviderId(featureId,
                                                                            std::move(grid[featureId]));
                }
            }
            if (featureColumns[featureId] != nullptr) {
                DataProvider.Features.push_back(std::move(featureColumns[featureId]));
            }
        }

        DataProvider.BuildIndicesRemap();

        // Target borders are computed from train targets only.
        if (!IsTest) {
            TOnCpuGridBuilderFactory gridBuilderFactory;
            FeaturesManager.SetTargetBorders(TBordersBuilder(gridBuilderFactory,
                                                             DataProvider.GetTargets())(FeaturesManager.GetTargetBinarizationDescription()));
        }

        DataProvider.FeatureNames = featureNames;
        DataProvider.CatFeatureIds = CatFeatureIds;

        if (ClassesWeights.size()) {
            Reweight(DataProvider.Targets, ClassesWeights, &DataProvider.Weights);
        }
        IsDone = true;
    }
Ejemplo n.º 22
0
    void TCatboostOptions::SetLeavesEstimationDefault() {
        const auto& lossFunctionConfig = LossFunctionDescription.Get();

        auto& treeConfig = ObliviousTreeOptions.Get();
        ui32 defaultNewtonIterations = 1;
        ui32 defaultGradientIterations = 1;
        ELeavesEstimation defaultEstimationMethod = ELeavesEstimation::Newton;

        switch (lossFunctionConfig.GetLossFunction()) {
            case ELossFunction::RMSE: {
                defaultEstimationMethod = ELeavesEstimation::Newton;
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                break;
            }
            case ELossFunction::QueryRMSE: {
                defaultEstimationMethod = ELeavesEstimation::Gradient;
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                break;
            }
            case ELossFunction::MultiClass:
            case ELossFunction::MultiClassOneVsAll: {
                defaultEstimationMethod = ELeavesEstimation::Newton;
                defaultNewtonIterations = 1;
                defaultGradientIterations = 10;
                break;
            }
            case ELossFunction::Quantile:
            case ELossFunction::MAE:
            case ELossFunction::LogLinQuantile:
            case ELossFunction::MAPE: {
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                defaultEstimationMethod = ELeavesEstimation::Gradient;
                break;
            }
            case ELossFunction::PairLogit: {
                defaultEstimationMethod = ELeavesEstimation::Gradient;
                //TODO(noxoomo): update to 10 after options merge
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                break;
            }
            case ELossFunction::Poisson: {
                defaultEstimationMethod = ELeavesEstimation::Gradient;
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                break;
            }
            case ELossFunction::Logloss:
            case ELossFunction::CrossEntropy: {
                defaultNewtonIterations = 10;
                defaultGradientIterations = 100;
                defaultEstimationMethod = ELeavesEstimation::Newton;
                break;
            }
            case ELossFunction::YetiRank: {
                defaultEstimationMethod = ELeavesEstimation::Newton;
                defaultGradientIterations = 1;
                defaultNewtonIterations = 1;
                break;
            }
            case ELossFunction::UserPerObjErr:
            case ELossFunction::UserQuerywiseErr:
            case ELossFunction::Custom: {
                //skip
                defaultNewtonIterations = 1;
                defaultGradientIterations = 1;
                break;
            }
            default: {
                CB_ENSURE(false, "Unknown loss function " << lossFunctionConfig.GetLossFunction());
            }
        }

        if (treeConfig.LeavesEstimationMethod.NotSet()) {
            treeConfig.LeavesEstimationMethod = defaultEstimationMethod;
        }

        if (treeConfig.LeavesEstimationIterations.NotSet()) {
            const ELeavesEstimation method = treeConfig.LeavesEstimationMethod;
            switch (method) {
                case ELeavesEstimation::Newton: {
                    treeConfig.LeavesEstimationIterations = defaultNewtonIterations;
                    break;
                }
                case ELeavesEstimation::Gradient: {
                    treeConfig.LeavesEstimationIterations = defaultGradientIterations;
                    break;
                }
                default: {
                    ythrow TCatboostException() << "Unknown estimation type "
                                                << method;
                }
            }
        }

        if (treeConfig.L2Reg == 0.0f) {
            treeConfig.L2Reg = 1e-20f;
        }
    }
Ejemplo n.º 23
0
// Ensures that every column index in [0, columns.ysize()) is contained in
// `parsedColumns`; the first missing index fails CB_ENSURE with that index in
// the message.
inline void CheckAllFeaturesPresent(const TVector<TColumn>& columns, const TSet<int>& parsedColumns) {
    const int columnCount = columns.ysize();
    for (int columnIdx = 0; columnIdx < columnCount; ++columnIdx) {
        const bool wasParsed = parsedColumns.has(columnIdx);
        CB_ENSURE(wasParsed, "column not present in cd file: " << columnIdx);
    }
}
Ejemplo n.º 24
0
    void TCatboostOptions::ValidateCtr(const TCtrDescription& ctr, ELossFunction lossFunction, bool isTreeCtrs) const {
        if (ctr.TargetBinarization->BorderCount > 1) {
            CB_ENSURE(lossFunction == ELossFunction::RMSE || lossFunction == ELossFunction::Quantile ||
                          lossFunction == ELossFunction::LogLinQuantile || lossFunction == ELossFunction::Poisson ||
                          lossFunction == ELossFunction::MAPE || lossFunction == ELossFunction::MAE,
                      "target-border-cnt is not supported for loss function " << lossFunction);
        }
        CB_ENSURE(ctr.GetPriors().size(), "Provide at least one prior for CTR" << ToString(*this));

        const ETaskType taskType = GetTaskType();
        const ECtrType ctrType = ctr.Type;

        if (taskType == ETaskType::GPU) {
            CB_ENSURE(IsSupportedOnGpu(ctrType),
                      "Ctr type " << ctrType << " is not implemented on GPU yet");
            CB_ENSURE(ctr.TargetBinarization.IsDefault(), "Error: GPU doesn't not support target binarization per CTR description currently. Please use target_borders option instead");
        } else {
            CB_ENSURE(taskType == ETaskType::CPU);
            CB_ENSURE(IsSupportedOnCpu(ctrType),
                      "Ctr type " << ctrType << " is not implemented on CPU yet");
            CB_ENSURE(ctr.PriorEstimation == EPriorEstimation::No, "Error: CPU doesn't not support prior estimation currently");
        }

        const EBorderSelectionType borderSelectionType = ctr.CtrBinarization->BorderSelectionType;
        if (taskType == ETaskType::CPU) {
            CB_ENSURE(borderSelectionType == EBorderSelectionType::Uniform,
                      "Error: custom ctr binarization is not supported on CPU yet");
        } else {
            CB_ENSURE(taskType == ETaskType::GPU);
            if (isTreeCtrs) {
                EBorderSelectionType borderType = borderSelectionType;
                CB_ENSURE(borderType == EBorderSelectionType::Uniform || borderType == EBorderSelectionType::Median,
                          "Error: GPU supports Median and Uniform combinations-ctr binarization only");

                CB_ENSURE(ctr.CtrBinarization->BorderCount <= GetMaxTreeCtrBinarizationForGpu(), "Error: max combinations-ctr binarization for GPU is " << GetMaxTreeCtrBinarizationForGpu());
                CB_ENSURE(ctr.PriorEstimation == EPriorEstimation::No, "Error: prior estimation is not available for combinations-ctr");
            } else {
                switch (ctrType) {
                    case ECtrType::Borders: {
                        break;
                    }
                    default: {
                        CB_ENSURE(ctr.PriorEstimation == EPriorEstimation::No, "Error: prior estimation is not available for ctr type " << ctrType);
                    }
                }
            }
        }

        if ((ctrType == ECtrType::FeatureFreq) && borderSelectionType == EBorderSelectionType::Uniform) {
            MATRIXNET_WARNING_LOG << "Uniform ctr binarization for featureFreq ctr is not good choice. Use MinEntropy for simpleCtrs and Median for combinations-ctrs instead" << Endl;
        }
    }