Example #1
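// Excerpt from metapy's pybind11 binding code. The usual includes are assumed
// but not shown here: the pybind11 headers, <fstream>, <string>, cpptoml, the
// MeTA stats/topics headers, and the py_multinomial wrapper defined elsewhere
// in metapy.

// Bind the stats submodule: expose py_multinomial as metapy.stats.Multinomial,
// a multinomial distribution over arbitrary Python objects.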
void metapy_bind_stats(py::module& m)
{
    auto m_stats = m.def_submodule("stats");

    py::class_<py_multinomial>{m_stats, "Multinomial"}
        .def("increment", &py_multinomial::increment)
        .def("decrement", &py_multinomial::decrement)
        .def("counts", [](const py_multinomial& dist,
                          py::object obj) { return dist.counts(obj); })
        .def("counts", [](const py_multinomial& dist) { return dist.counts(); })
        .def("unique_events", &py_multinomial::unique_events)
        .def("each_seen_event", &py_multinomial::each_seen_event)
        .def("clear", &py_multinomial::clear)
        .def("probability", &py_multinomial::probability)
        .def("__repr__", [](const py_multinomial& mult) {
            const auto size = mult.unique_events();
            uint64_t i = 0;
            std::string result = "<metapy.stats.Multinomial {";
            mult.each_seen_event([&](const py::object& obj) {
                result += obj.attr("__repr__")().cast<std::string>();
                result += ": ";
                result += std::to_string(mult.probability(obj));
                if (++i != size)
                    result += ", ";
            });
            result += "}>";
            return result;
        });
}
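
// Bind the topics submodule: the LDA trainers, their inferencers, and the
// utilities for loading and inspecting saved topic models.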
void metapy_bind_topics(py::module& m)
{
    auto m_topics = m.def_submodule("topics");

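    // Base class shared by the LDA trainers below. run() releases the GIL so
    // long training loops do not block other Python threads, and the
    // *_distribution accessors wrap their results in metapy.stats.Multinomial.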
    py::class_<topics::lda_model>{m_topics, "LDAModel"}
        .def("run",
             [](topics::lda_model& model, uint64_t num_iters,
                double convergence) {
                 py::gil_scoped_release release;
                 model.run(num_iters, convergence);
             })
        .def("save_doc_topic_distributions",
             [](const topics::lda_model& model, const std::string& filename) {
                 std::ofstream output{filename, std::ios::binary};
                 model.save_doc_topic_distributions(output);
             })
        .def("save_topic_term_distributions",
             [](const topics::lda_model& model, const std::string& filename) {
                 std::ofstream output{filename, std::ios::binary};
                 model.save_topic_term_distributions(output);
             })
        .def("save", &topics::lda_model::save)
        .def("compute_term_topic_probability",
             &topics::lda_model::compute_term_topic_probability)
        .def("compute_doc_topic_probability",
             &topics::lda_model::compute_doc_topic_probability)
        .def("topic_distribution",
             [](const topics::lda_model& model, doc_id doc) {
                 return py_multinomial{model.topic_distribution(doc)};
             })
        .def("term_distribution",
             [](const topics::lda_model& model, topic_id k) {
                 return py_multinomial{model.term_distribution(k)};
             })
        .def("num_topics", &topics::lda_model::num_topics);

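    // Shared interface for the trained-model inferencers bound below.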
    py::class_<topics::inferencer>{m_topics, "LDAInferencer"}
        .def("term_distribution",
             [](const topics::inferencer& inf, topic_id k) {
                 return py_multinomial{inf.term_distribution(k)};
             },
             py::arg("k"))
        .def("num_topics", &topics::inferencer::num_topics);

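    // Collapsed variational Bayes trainer. keep_alive keeps the docs dataset
    // alive for as long as the model that references it exists.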
    py::class_<topics::lda_cvb, topics::lda_model>{m_topics, "LDACollapsedVB"}
        .def(py::init<const learn::dataset&, std::size_t, double, double>(),
             py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"),
             py::arg("alpha"), py::arg("beta"))
        .def("run",
             [](topics::lda_cvb& lda, uint64_t num_iters, double convergence) {
                 py::gil_scoped_release release;
                 lda.run(num_iters, convergence);
             },
             py::arg("num_iters"), py::arg("convergence") = 1e-3);

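    // Inferencer for CVB models. The old-style __init__ lambdas construct the
    // object in place (placement new) from either a config file or a saved
    // topics file plus an alpha prior.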
    py::class_<topics::lda_cvb::inferencer, topics::inferencer>{m_topics,
                                                                "CVBInferencer"}
        .def("__init__",
             [](topics::inferencer& inf, const std::string& cfgfile) {
                 py::gil_scoped_release release;
                 auto config = cpptoml::parse_file(cfgfile);
                 new (&inf) topics::inferencer(*config);
             },
             py::arg("cfg_file"))
        .def("__init__",
             [](topics::inferencer& inf, const std::string& topicsfile,
                double alpha) {
                 py::gil_scoped_release release;
                 std::ifstream topics_stream{topicsfile, std::ios::binary};
                 new (&inf) topics::inferencer(topics_stream, alpha);
             },
             py::arg("topics_file"), py::arg("alpha"))
        .def("infer",
             [](const topics::lda_cvb::inferencer& inf,
                const learn::feature_vector& doc, std::size_t max_iters,
                double convergence) {
                 return py_multinomial{inf(doc, max_iters, convergence)};
             },
             py::arg("doc"), py::arg("max_iters"), py::arg("convergence"));

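    // Gibbs sampling trainer; also serves as the base for the parallel variant
    // below.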
    py::class_<topics::lda_gibbs, topics::lda_model>{m_topics, "LDAGibbs"}
        .def(py::init<const learn::dataset&, std::size_t, double, double>(),
             py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"),
             py::arg("alpha"), py::arg("beta"))
        .def(
            "run",
            [](topics::lda_gibbs& lda, uint64_t num_iters, double convergence) {
                py::gil_scoped_release release;
                lda.run(num_iters, convergence);
            },
            py::arg("num_iters"), py::arg("convergence") = 1e-6);

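    // Inferencer for Gibbs models; infer() seeds a fresh xoroshiro128 RNG from
    // rng_seed on every call.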
    py::class_<topics::lda_gibbs::inferencer, topics::inferencer>{
        m_topics, "GibbsInferencer"}
        .def("__init__",
             [](topics::inferencer& inf, const std::string& cfgfile) {
                 py::gil_scoped_release release;
                 auto config = cpptoml::parse_file(cfgfile);
                 new (&inf) topics::inferencer(*config);
             },
             py::arg("cfg_file"))
        .def("__init__",
             [](topics::inferencer& inf, const std::string& topicsfile,
                double alpha) {
                 py::gil_scoped_release release;
                 std::ifstream topics_stream{topicsfile, std::ios::binary};
                 new (&inf) topics::inferencer(topics_stream, alpha);
             },
             py::arg("topics_file"), py::arg("alpha"))

        .def("infer",
             [](const topics::lda_gibbs::inferencer& inf,
                const learn::feature_vector& doc, std::size_t num_iters,
                std::size_t seed) {
                 random::xoroshiro128 rng{seed};
                 return py_multinomial{inf(doc, num_iters, rng)};
             },
             py::arg("doc"), py::arg("max_iters"), py::arg("rng_seed"));

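    // Parallelized Gibbs sampler; inherits run() and the rest of its interface
    // from LDAGibbs.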
    py::class_<topics::parallel_lda_gibbs, topics::lda_gibbs>{
        m_topics, "LDAParallelGibbs"}
        .def(py::init<const learn::dataset&, std::size_t, double, double>(),
             py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"),
             py::arg("alpha"), py::arg("beta"));

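    // Stochastic collapsed variational Bayes trainer; the minibatch_size
    // argument defaults to 100.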
    py::class_<topics::lda_scvb, topics::lda_model>{m_topics,
                                                    "LDAStochasticCVB"}
        .def(py::init<const learn::dataset&, std::size_t, double, double,
                      uint64_t>(),
             py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"),
             py::arg("alpha"), py::arg("beta"), py::arg("minibatch_size") = 100)
        .def("run",
             [](topics::lda_scvb& lda, uint64_t num_iters, double convergence) {
                 py::gil_scoped_release release;
                 lda.run(num_iters, convergence);
             },
             py::arg("num_iters"), py::arg("convergence") = 0);

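    // Read-only view over a saved model: loads <prefix>.theta.bin (document
    // topic probabilities) and <prefix>.phi.bin (topic term probabilities) and
    // throws if either file is missing.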
    py::class_<topics::topic_model>{m_topics, "TopicModel"}
        .def("__init__",
             [](topics::topic_model& model, const std::string& prefix) {
                 py::gil_scoped_release release;

                 std::ifstream theta{prefix + ".theta.bin", std::ios::binary};

                 if (!theta)
                 {
                     throw topics::topic_model_exception{
                         "missing document topic probabilities file: " + prefix
                         + ".theta.bin"};
                 }

                 std::ifstream phi{prefix + ".phi.bin", std::ios::binary};
                 if (!phi)
                 {
                     throw topics::topic_model_exception{
                         "missing topic term probabilities file: " + prefix
                         + ".phi.bin"};
                 }

                 new (&model) topics::topic_model(theta, phi);
             })
        .def("top_k",
             [](const topics::topic_model& model, topic_id tid, std::size_t k) {
                 return model.top_k(tid, k);
             },
             py::arg("tid"), py::arg("k") = 10)
        .def("top_k",
             [](const topics::topic_model& model, topic_id tid, std::size_t k,
                std::function<double(topic_id, term_id)> scorer) {
                 return model.top_k(tid, k, scorer);
             },
             py::arg("tid"), py::arg("k") = 10, py::arg("scorer"))
        .def("top_k",
             [](const topics::topic_model& model, topic_id tid, std::size_t k,
                const topics::bl_term_scorer& scorer) {
                 return model.top_k(tid, k, scorer);
             },
             py::arg("tid"), py::arg("k") = 10, py::arg("scorer"))
        .def("topic_distribution",
             [](const topics::topic_model& self, doc_id did) {
                 return py_multinomial{self.topic_distribution(did)};
             })
        .def("term_distribution",
             [](const topics::topic_model& self, topic_id k) {
                 return py_multinomial{self.term_distribution(k)};
             })
        .def("term_probability", &topics::topic_model::term_probability)
        .def("topic_probability", &topics::topic_model::topic_probability)
        .def("num_topics", &topics::topic_model::num_topics)
        .def("num_words", &topics::topic_model::num_words)
        .def("num_docs", &topics::topic_model::num_docs);

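    // Parse a MeTA config file and load the saved topic model it describes.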
    m_topics.def("load_topic_model", [](const std::string& config_path) {
        py::gil_scoped_release release;
        auto config = cpptoml::parse_file(config_path);
        return topics::load_topic_model(*config);
    });

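    // Term scorer that can be passed as the scorer argument of
    // TopicModel.top_k; keep_alive keeps the underlying TopicModel alive
    // while the scorer exists.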
    py::class_<topics::bl_term_scorer>{m_topics, "BLTermScorer"}
        .def(py::init<const topics::topic_model&>(), py::keep_alive<0, 1>())
        .def("__call__", &topics::bl_term_scorer::operator());
}
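
For reference, here is a minimal sketch of how these bindings are driven from Python. It is not part of the listing above: metapy.index.make_forward_index and metapy.learn.Dataset come from parts of metapy that are not shown in this excerpt, and 'config.toml' and the 'lda-model' prefix are placeholder names.

import metapy

# Assumed helpers from outside this excerpt: build a forward index and wrap it
# in a dataset so it can be passed as the `docs` argument bound above.
fidx = metapy.index.make_forward_index('config.toml')
docs = metapy.learn.Dataset(fidx)

# LDAGibbs(docs, num_topics, alpha, beta) and run(num_iters, convergence=1e-6)
# come from the bindings above; run() releases the GIL while it samples.
lda = metapy.topics.LDAGibbs(docs=docs, num_topics=10, alpha=1.0, beta=0.01)
lda.run(num_iters=1000)
lda.save('lda-model')  # assumed to write lda-model.theta.bin / lda-model.phi.bin

# TopicModel reads the <prefix>.theta.bin / <prefix>.phi.bin pair saved above;
# BLTermScorer is accepted by the scorer overload of top_k.
model = metapy.topics.TopicModel('lda-model')
scorer = metapy.topics.BLTermScorer(model)
for entry in model.top_k(tid=0, k=10, scorer=scorer):
    print(entry)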