/**
 * Cross-validates a classifier over every document of an index and asserts
 * that the reported accuracy is strictly greater than min_accuracy and
 * strictly less than 100.0.
 *
 * @param idx index whose docs() supplies the documents to cross-validate on
 * @param c classifier under test
 * @param min_accuracy exclusive lower bound the accuracy must exceed
 */
void check_cv(Index& idx, Classifier& c, double min_accuracy)
{
    std::vector<doc_id> all_docs = idx.docs();

    // 5 is passed as the second argument -- presumably the number of folds.
    classify::confusion_matrix results = c.cross_validate(all_docs, 5);

    ASSERT_GREATER(results.accuracy(), min_accuracy);
    // NOTE(review): the scale of accuracy() (fraction vs. percentage) is not
    // visible here; confirm that 100.0 is a meaningful upper bound.
    ASSERT_LESS(results.accuracy(), 100.0);
}
/**
 * Cross-validates a classifier over every document of an index, reports how
 * long it took on stderr, prints the resulting confusion matrix and its
 * statistics, and returns the matrix.
 *
 * @param idx index whose docs() supplies the documents to cross-validate on
 * @param c classifier to cross-validate
 * @param even forwarded unchanged as the third argument of cross_validate
 * @return the confusion matrix produced by the cross-validation run
 */
classify::confusion_matrix cv(Index& idx, Classifier& c, bool even)
{
    std::vector<doc_id> all_docs = idx.docs();
    classify::confusion_matrix result;

    // The work being timed: 5 is presumably the number of folds.
    auto run_cross_validation
        = [&]() { result = c.cross_validate(all_docs, 5, even); };

    auto elapsed = common::time<std::chrono::seconds>(run_cross_validation);
    std::cerr << "time elapsed: " << elapsed.count() << "s" << std::endl;

    result.print();
    result.print_stats();
    return result;
}
/**
 * Trains a classifier on one part of an index's documents, tests it on the
 * rest, and asserts that the accuracy is strictly greater than min_accuracy
 * and strictly less than 100.0.
 *
 * The documents are shuffled with a fixed-seed generator, so the split is the
 * same on every run. The first size/8 shuffled documents form the test set
 * and the remainder form the training set.
 *
 * @param idx index whose docs() supplies the documents to split
 * @param c classifier to train and test
 * @param min_accuracy exclusive lower bound the accuracy must exceed
 */
void check_split(Index& idx, Classifier& c, double min_accuracy)
{
    // Shuffle deterministically (fixed seed) before partitioning.
    std::vector<doc_id> shuffled = idx.docs();
    std::mt19937 rng(47);
    std::shuffle(shuffled.begin(), shuffled.end(), rng);

    // Everything before the boundary is held out for testing; everything
    // from the boundary onwards is used for training.
    size_t held_out = shuffled.size() / 8;
    auto boundary = shuffled.begin() + held_out;
    std::vector<doc_id> train_docs(boundary, shuffled.end());
    std::vector<doc_id> test_docs(shuffled.begin(), boundary);

    c.train(train_docs);
    classify::confusion_matrix results = c.test(test_docs);

    ASSERT_GREATER(results.accuracy(), min_accuracy);
    ASSERT_LESS(results.accuracy(), 100.0);
}
/**
 * Uses every document of an index as a query against that same index and
 * asserts that the ranker places the document at the top of its results.
 *
 * For each id returned by idx.docs(), a query document is built from that
 * document's path and id, given the supplied encoding, and scored with r.
 * The ranking must contain exactly 10 entries. The queried document must be
 * the first entry, or else the second entry with a score approximately equal
 * to the first entry's score.
 *
 * @param r ranker under test
 * @param idx index that is searched and that supplies the query documents
 * @param encoding encoding assigned to each query document
 */
void test_rank(Ranker& r, Index& idx, const std::string& encoding)
{
    // Fetch the id list once; it is reused for every query below.
    const auto docs = idx.docs();

    for (auto d_id : docs)
    {
        // The query carries the id of the document it was built from, so the
        // expectations below compare against the same id that was queried.
        corpus::document query{idx.doc_path(d_id), d_id};
        query.encoding(encoding);

        auto ranking = r.score(idx, query);
        ASSERT_EQUAL(ranking.size(), 10); // default is 10 docs

        // The query is a document that is already in the index, so it should
        // be ranked first. The corpus contains a few duplicate documents,
        // though, so also accept it in second place when it is tied with the
        // first result.
        if (ranking[0].first != d_id)
        {
            ASSERT_EQUAL(ranking[1].first, d_id);
            ASSERT_APPROX_EQUAL(ranking[0].second, ranking[1].second);
        }
    }
}