/**
 * Fetches the content of a document, passed through utf::to_utf8 together
 * with the document's reported encoding.
 *
 * @param doc The document whose content should be retrieved
 * @return the result of utf::to_utf8 applied to doc.content() and
 *  doc.encoding()
 * @throws analyzer_exception if doc.contains_content() is false
 */
std::string get_content(const corpus::document& doc)
{
    // Happy path first: content is present, so hand it to the converter.
    if (doc.contains_content())
        return utf::to_utf8(doc.content(), doc.encoding());

    // Nothing to analyze: the caller never populated the content.
    throw analyzer_exception{
        "document content was not populated for analysis"};
}
/**
 * Test helper: tokenizes a document with the given analyzer and asserts
 * that the resulting document matches the expected statistics and metadata.
 *
 * @param ana The analyzer whose tokenize() is applied to the document
 * @param doc The document to tokenize (taken by value, so the caller's
 *  copy is not touched by this helper)
 * @param num_unique The expected value of doc.counts().size() after
 *  tokenizing
 * @param length The expected value of doc.length() after tokenizing
 */
void check_analyzer_expected(Analyzer& ana, corpus::document doc,
                             uint64_t num_unique, uint64_t length)
{
    ana.tokenize(doc);

    // Token statistics produced by the analyzer.
    ASSERT_EQUAL(doc.counts().size(), num_unique);
    ASSERT_EQUAL(doc.length(), length);

    // The id is expected to be the fixed value 47 in every case.
    ASSERT_EQUAL(doc.id(), 47ul);

    // Expected path/name depend on whether the document carries content.
    if (!doc.contains_content())
    {
        ASSERT_EQUAL(doc.path(), "../data/sample-document.txt");
        ASSERT_EQUAL(doc.name(), "sample-document.txt");
    }
    else
    {
        ASSERT_EQUAL(doc.path(), "/home/person/filename.txt");
        ASSERT_EQUAL(doc.name(), "filename.txt");
    }
}