void testOrder() { InvertedIndex index; const Term term1("foo"); index.insert(term1, 0); index.insert(term1, 2); index.insert(term1, 1); index.insert(term1, 5); index.insert(term1, 4); index.insert(term1, 3); index.insert(term1, 6); const Postings &postings = index.getPostings(term1); CPPUNIT_ASSERT(postings.size() == 7); Postings::const_iterator iter = postings.begin(); DocId prevId = *iter; ++iter; for(; iter != postings.end(); ++iter) { CPPUNIT_ASSERT(prevId < *iter); prevId = *iter; } CPPUNIT_ASSERT(prevId == 6); }
void testInsert() { InvertedIndex index; const Term term1("foo"); index.insert(term1, 0); CPPUNIT_ASSERT(index.getPostings(term1).size() == 1); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0)); index.insert(term1, 1); CPPUNIT_ASSERT(index.getPostings(term1).size() == 2); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0)); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 1)); CPPUNIT_ASSERT(!isContain(index.getPostings(term1), 2)); const Term term2("bar"); index.insert(term2, 0); CPPUNIT_ASSERT(index.getPostings(term2).size() == 1); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0)); index.insert(term2, 1); CPPUNIT_ASSERT(index.getPostings(term2).size() == 2); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0)); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 1)); CPPUNIT_ASSERT(!isContain(index.getPostings(term2), 2)); }
// Tokenize the character range [it, endit) and feed the resulting tokens
// into `inverted_index` under `document_id`.
//
// Processing model (as visible in this body):
//  - Characters are classified through the `character_` map (membership =
//    word character; mapped value = its lowercase form) and the
//    `delimiter_` set. Anything in neither set is silently skipped.
//  - Words are accumulated into `token` (a deque-like word buffer) in
//    windows of up to 6 words; a delimiter or end-of-input ends a window.
//  - Stop words are stripped from the front via `stopword_.max_match`,
//    then the longest vocabulary prefix (`vocabulary.max_match`) is
//    inserted as one token; with no vocabulary match, the single first
//    word is inserted.
//  - NOTE(review): `vocabulary` lacks the trailing-underscore member
//    convention used by `character_`/`delimiter_`/`stopword_` — presumably
//    a member or file-scope object; confirm against the class definition.
//  - assumes `wchar` is a project typedef for a character type — TODO confirm.
void Tokenizer::execute(wchar const * it , wchar const * endit , InvertedIndex & inverted_index , unsigned document_id) {
    Token token;
    // Cache the end() sentinels so the inner loops compare against them
    // without recomputing per character.
    auto not_character = character_.end();
    auto not_delimiter = delimiter_.end();
    while (it != endit) {
        // Read a sequence of 6 words (minus whatever carried over in
        // `token` from the previous window).
        size_t counter = 6 - token.size();
        bool separated = false;
        for (; counter > 0; --counter) {
            // Eliminate all non-word characters before the next word.
            while (it != endit && character_.find(*it) == not_character) {
                if (delimiter_.find(*it) != not_delimiter) {
                    // If it is a delimiter, eliminate all of 'em in one run.
                    do {
                        ++it;
                    } while (it != endit && delimiter_.find(*it) != not_delimiter);
                    // A delimiter ends the current window: no more words
                    // may be read, because a delimiter cannot appear in
                    // the middle of a token.
                    separated = true;
                    break;
                }
                // Neither word character nor delimiter: skip it.
                ++it;
            }
            if (it == endit) {
                // End of input: treat like a delimiter so the remaining
                // buffered words are flushed below.
                separated = true;
                break;
            }
            if (separated)
                if (!token.empty())
                    // Delimiters found => must not read more words.
                    break;
                else {
                    // Delimiters found but the token buffer is still
                    // empty => nothing to flush; keep scanning. ++counter
                    // cancels the loop's --counter so this pass is free.
                    separated = false;
                    ++counter;
                    continue;
                }
            // Read the characters of one word.
            wchar const * begin = it;
            do {
                ++it;
            } while (it != endit && character_.find(*it) != not_character);
            wstring origin = wstring(begin , it - begin);
            size_t length = origin.length();
            // Lowercase the word: character_ maps each character to its
            // lowercase form (at() would throw for unmapped chars, but every
            // char here was found in character_ above).
            for (size_t i = 0; i < length; ++i)
                origin[i] = character_.at(origin[i]);
            token.push_back(origin);
        }
        // Consume the buffered words. Runs once when the window filled
        // without a separator (keeping leftovers for the next window);
        // loops until the buffer drains when a separator/EOF was hit.
        do {
            size_t counter;
            while (true) {
                // Eliminate stop words from the front, repeatedly.
                counter = stopword_.max_match(token);
                if (counter == 0)
                    break;
                else {
                    do {
                        token.pop_front();
                        --counter;
                    } while (counter > 0);
                    if (token.empty())
                        break;
                }
            }
            if (token.empty())
                break;
            counter = vocabulary.max_match(token);
            if (counter > 0) {
                // Longest vocabulary prefix found: insert it as one token,
                // then drop those words from the buffer.
                inverted_index.insert(token.cbegin() , token.cbegin() + counter , document_id);
                //token = Token(token.cbegin() + counter , token.cend());
                do {
                    token.pop_front();
                    --counter;
                } while (counter > 0);
            } else {
                // No vocabulary match: insert only the first word.
                inverted_index.insert(token.cbegin() , token.cbegin() + 1 , document_id);
                //token = Token(token.cbegin() + 1 , token.cend());
                token.pop_front();
            }
        } while (separated);
    }
}