void IndexTestCase::testKeywordIndex()
{
    DocumentSchema schema;
    schema.addUnIndexedField("PATH");
    schema.addField("Keyword", "KEYWORD", false);

    buildIndex(schema, "file1.txt, hello world.");

    tstring str = getTestPath();

    Index index;
    index.open(str, Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    CPPUNIT_ASSERT(pReader != NULL);

    TermReaderPtr pTermReader = pReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    TermIteratorPtr pTermIterator = pTermReader->termIterator("Keyword");
    CPPUNIT_ASSERT(pTermIterator != NULL);

    // Single document: every term occurs exactly once
    while (pTermIterator->hasNext())
    {
        TermIterator::TermEntry entry = pTermIterator->next();
        const TermMeta& termMeta = entry.postingIterator->getTermMeta();
        CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq());
        CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF());
    }

    // The KEYWORD field is unanalyzed: the whole value is one term
    Term term("Keyword", "hello world.");
    TermPostingIteratorPtr pPost = pTermReader->seek(&term);
    CPPUNIT_ASSERT(pPost);
    docid_t docId = pPost->skipTo(0);
    CPPUNIT_ASSERT_EQUAL((docid_t)0, docId);
    docId = pPost->skipTo(docId + 1);
    CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
}
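// INT32_IF field (presumably "indexed + forward"): exercises both the
// inverted view (term -> postings) and the forward view (docId -> value)
// of the same numeric field.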
void IndexTestCase::testInt32_IF()
{
    DocumentSchema schema;
    schema.addField("Int32", "INT32_IF", false);

    const static size_t NUM_DOCS = 1000;
    stringstream ss;
    for (size_t i = 0; i < NUM_DOCS; ++i)
    {
        ss << (i % 100) << ";";
    }

    GLOBAL_CONF().Build.buildThreadCount = 1;
    buildIndex(schema, ss.str());

    tstring str = getTestPath();

    Index index;
    index.open(str, Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    CPPUNIT_ASSERT(pReader != NULL);

    TermReaderPtr pTermReader = pReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    TermIteratorPtr pTermIterator = pTermReader->termIterator("Int32");
    CPPUNIT_ASSERT(pTermIterator != NULL);

    // 1000 docs cycle through 100 distinct values, so each term matches 10 docs
    while (pTermIterator->hasNext())
    {
        TermIterator::TermEntry entry = pTermIterator->next();
        const TermMeta& termMeta = entry.postingIterator->getTermMeta();
        CPPUNIT_ASSERT_EQUAL((df_t)10, termMeta.getDocFreq());
        CPPUNIT_ASSERT_EQUAL((ctf_t)10, termMeta.getCTF());
    }

    // Term "0" matches docs 0, 100, ..., 900; past 900 there are no more hits
    Term term("Int32", "0");
    TermPostingIteratorPtr pPost = pTermReader->seek(&term);
    CPPUNIT_ASSERT(pPost);
    docid_t docId = pPost->skipTo(0);
    CPPUNIT_ASSERT_EQUAL((docid_t)0, docId);
    docId = pPost->skipTo(901);
    CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);

    // The forward index must return value (docId % 100) for every doc
    ForwardIndexIteratorPtr pForIndexIt = pReader->forwardIndexReader("Int32");
    CPPUNIT_ASSERT(pForIndexIt != NULL);
    Int32ForwardIndexIteratorPtr pInt32ForIndexIt =
        pForIndexIt.cast<Int32ForwardIndexIterator>();
    CPPUNIT_ASSERT(pInt32ForIndexIt != NULL);

    int32_t value = 0;
    docId = 0;
    for (; docId < (docid_t)NUM_DOCS; ++docId)
    {
        CPPUNIT_ASSERT(pInt32ForIndexIt->seek(docId, value));
        CPPUNIT_ASSERT_EQUAL((int32_t)(docId % 100), value);
    }
    // Seeking past the last document must fail
    CPPUNIT_ASSERT(!pInt32ForIndexIt->seek(docId, value));
}
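// The date-time posting test below relies on a helper pair defined elsewhere
// in the test suite. A minimal sketch of what the usage implies (names and
// layout are inferred, not the authoritative definitions):
//
//     struct TimeAndDoc
//     {
//         int64_t nTime;   // value from DateTimeAnalyzer::makeTime()
//         docid_t docId;   // document carrying this timestamp
//     };
//
//     static bool timeLess(const TimeAndDoc& lhs, const TimeAndDoc& rhs)
//     {
//         return lhs.nTime < rhs.nTime;  // terms enumerate in ascending order
//     }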
void DateTimeIndexTestCase::testPosting()
{
    const size_t NUM_DOCS = 100;
    vector<TimeAndDoc> timeVec;
    stringstream ss;
    for (size_t i = 0; i < NUM_DOCS; ++i)
    {
        int y = 2011;
        int m = (i % 12) + 1;
        int d = (i % 27) + 1;
        int h = i % 24;
        int min = i % 60;
        int s = i % 60;
        ss << y << "-" << m << "-" << d << " "
           << h << ":" << min << ":" << s << ";";

        TimeAndDoc td;
        td.nTime = DateTimeAnalyzer::makeTime(y, m, d, h, min, s);
        td.docId = (docid_t)i;
        timeVec.push_back(td);
    }

    // Terms enumerate in ascending value order, so sort expectations by time
    sort(timeVec.begin(), timeVec.end(), timeLess);

    GLOBAL_CONF().Build.buildThreadCount = 1;
    buildDateTimeIndex(ss.str());

    Index index;
    index.open(getIndexPath(), Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    TermReaderPtr pTermReader = pReader->termReader();
    TermIteratorPtr pIterator = pTermReader->termIterator("DateTime1");
    CPPUNIT_ASSERT(pIterator);

    size_t j = 0;
    while (pIterator->hasNext())
    {
        TermIterator::TermEntry entry = pIterator->next();
        const Term* pTerm = entry.term;
        const Int64Term* pTermX = pTerm->cast<int64_t>();
        int64_t nCurTerm = pTermX->getValue();
        int64_t nExpTerm = timeVec[j].nTime;
        CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm);

        // Timestamps are distinct for i < 100, so each term has one posting
        TermPostingIteratorPtr pPostingIter = entry.postingIterator;
        CPPUNIT_ASSERT(pPostingIter != NULL);
        docid_t docId = pPostingIter->skipTo(0);
        CPPUNIT_ASSERT_EQUAL(timeVec[j].docId, docId);
        docId = pPostingIter->skipTo(docId + 1);
        CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
        j++;
    }
}
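// Primary-key index: every key is unique, so each term must have
// docFreq == CTF == 1 and seek() must return exactly one document.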
void IndexTestCase::testPrimaryKeyIndex()
{
    DocumentSchema schema;
    schema.addField("PK", "PRIMARY_KEY", false);

    const static size_t NUM_DOCS = 1000;
    stringstream ss;
    for (size_t i = 0; i < NUM_DOCS; ++i)
    {
        ss << i << ";";
    }

    GLOBAL_CONF().Build.buildThreadCount = 1;
    buildIndex(schema, ss.str());

    tstring str = getTestPath();

    Index index;
    index.open(str, Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    CPPUNIT_ASSERT(pReader != NULL);

    TermReaderPtr pTermReader = pReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    TermIteratorPtr pTermIterator = pTermReader->termIterator("PK");
    CPPUNIT_ASSERT(pTermIterator != NULL);

    while (pTermIterator->hasNext())
    {
        TermIterator::TermEntry entry = pTermIterator->next();
        const TermMeta& termMeta = entry.postingIterator->getTermMeta();
        CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq());
        CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF());
    }

    // Looking up key i must return doc i and nothing after it
    for (size_t i = 0; i < NUM_DOCS; ++i)
    {
        stringstream ss2;
        ss2 << i;
        Term term("PK", ss2.str());
        TermPostingIteratorPtr pPost = pTermReader->seek(&term);
        CPPUNIT_ASSERT(pPost);
        docid_t docId = pPost->skipTo(0);
        CPPUNIT_ASSERT_EQUAL((docid_t)i, docId);
        docId = pPost->skipTo(docId + 1);
        CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
    }
}
void IndexTestCase::testTextIndex()
{
    DocumentSchema schema;
    schema.addUnIndexedField("PATH");
    schema.addTextField("CONTENT");

    buildIndex(schema, "file1.txt, hello world.");

    tstring str = getTestPath();

    Index index;
    index.open(str, Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    CPPUNIT_ASSERT(pReader != NULL);

    TermReaderPtr pTermReader = pReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    TermIteratorPtr pTermIterator = pTermReader->termIterator("CONTENT");
    CPPUNIT_ASSERT(pTermIterator != NULL);

    // One document, so every token has docFreq == CTF == 1
    while (pTermIterator->hasNext())
    {
        TermIterator::TermEntry entry = pTermIterator->next();
        const TermMeta& termMeta = entry.postingIterator->getTermMeta();
        CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq());
        CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF());
    }

    // Unlike the KEYWORD case, the analyzed field is searchable by token
    Term term("CONTENT", "hello");
    TermPostingIteratorPtr pPost = pTermReader->seek(&term);
    CPPUNIT_ASSERT(pPost);
    docid_t docId = pPost->skipTo(0);
    CPPUNIT_ASSERT_EQUAL((docid_t)0, docId);
    docId = pPost->skipTo(docId + 1);
    CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);

    // Stored fields should be retrievable for doc 0
    StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader();
    CPPUNIT_ASSERT(pDocReader);
    FieldSelector selector(pReader->getDocSchema(), true, false);
    ResultDoc resultDoc(0);
    bool ret = pDocReader->getDocument(selector, resultDoc);
    CPPUNIT_ASSERT(ret);
    CPPUNIT_ASSERT(resultDoc.size() > 0);
}
void IndexTestCase::testDocumentDeletion()
{
    DocumentSchema schema;
    schema.addField("URL", "PRIMARY_KEY", true);
    schema.addTextField("BODY");
    schema.addField("MODIFIED", "INT64", true);

    stringstream ss1;
    const size_t NUM_DOCS = 1000;
    size_t i = 0;
    for (; i < NUM_DOCS; ++i)
    {
        ss1 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss1.str());

    stringstream ss2;
    for (; i < 2 * NUM_DOCS; ++i)
    {
        ss2 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss2.str(), true);

    // Every document's BODY contains the token "hot"
    StandardAnalyzerPtr sa(new StandardAnalyzer());
    sa->init();
    TokenViewPtr pTokens = sa->tokenize("hot", 3);
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator tokenIt = pTokens->iterator();
    TermPtr pTerm(new Term("BODY", tokenIt.next().getTextValue()));

    tstring str = getTestPath();
    std::set<docid_t> answer;

    {
        Index index;
        index.open(str, Index::RDWR, NULL);
        IndexWriterPtr pIndexWriter = index.acquireWriter();
        CPPUNIT_ASSERT(pIndexWriter != NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        // Delete four documents by primary key; collect the doc ids of the
        // surviving documents as the expected answer set
        for (size_t n = 0; n < 2 * NUM_DOCS; ++n)
        {
            stringstream ss;
            ss << "url" << n;
            if (n == 1000 || n == 1500 || n == 1505 || n == 1999)
            {
                pIndexWriter->deleteDocument(ss.str());
            }
            else
            {
                TermReaderPtr pTermReader = pIndexReader->termReader();
                TermPtr pUrlTerm(new Term("URL", ss.str()));
                TermPostingIteratorPtr pIt = pTermReader->seek(pUrlTerm.get());
                docid_t docId = pIt->skipTo(0);
                answer.insert(docId);
            }
        }

        // Deletion does not rewrite the posting list: the stored doc freq
        // still counts deleted documents, but iteration must skip them
        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS),
                             pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator answerIt = answer.begin();
        for (docid_t d = 0; d < (docid_t)(2 * NUM_DOCS); )
        {
            docid_t docId = pDocFreqs->skipTo(d);
            d = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*answerIt, docId);
            ++answerIt;
        }
        CPPUNIT_ASSERT(answerIt == answer.end());
    }

    {
        // Reopen read-only and verify the deletions persisted
        Index index;
        index.open(str, Index::READ, NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS),
                             pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator answerIt = answer.begin();
        for (docid_t d = 0; d < (docid_t)(2 * NUM_DOCS); )
        {
            docid_t docId = pDocFreqs->skipTo(d);
            d = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*answerIt, docId);
            ++answerIt;
        }
        CPPUNIT_ASSERT(answerIt == answer.end());
    }
}
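// As with TimeAndDoc above, the keyword posting test below assumes helper
// definitions along these lines (a hypothetical sketch inferred from usage):
//
//     struct HashAndDoc
//     {
//         uint64_t nHash;  // value from Hash::hashString64()
//         docid_t docId;
//     };
//
//     static bool hashLess(const HashAndDoc& lhs, const HashAndDoc& rhs)
//     {
//         return lhs.nHash < rhs.nHash;  // terms enumerate in ascending hash order
//     }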
void KeywordIndexTestCase::testPosting()
{
    const size_t NUM_DOCS = 100;
    vector<HashAndDoc> hashVec;
    stringstream ss;
    size_t i;
    for (i = 0; i < NUM_DOCS; ++i)
    {
        stringstream ss1;
        ss1 << "abc" << i;
        ss << ss1.str() << ", " << "abc" << ";";

        HashAndDoc hd;
        hd.nHash = Hash::hashString64(ss1.str().c_str());
        hd.docId = (docid_t)i;
        hashVec.push_back(hd);
    }

    GLOBAL_CONF().Build.buildThreadCount = 1;
    buildKeywordIndex(ss.str());

    Index index;
    index.open(getIndexPath(), Index::READ, NULL);
    IndexReaderPtr pReader = index.acquireReader();
    TermReaderPtr pTermReader = pReader->termReader();

    // Keyword1 holds the unique "abc<i>" values: one posting per term,
    // enumerated in ascending hash order
    TermIteratorPtr pIterator = pTermReader->termIterator("Keyword1");
    CPPUNIT_ASSERT(!pIterator.isNull());

    sort(hashVec.begin(), hashVec.end(), hashLess);

    int32_t j = 0;
    while (pIterator->hasNext())
    {
        TermIterator::TermEntry entry = pIterator->next();
        const Term* pTerm = entry.term;
        const UInt64Term* pTermX = pTerm->cast<uint64_t>();
        uint64_t nCurTerm = pTermX->getValue();
        uint64_t nExpTerm = hashVec[j].nHash;
        CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm);

        TermPostingIteratorPtr pPostingIter = entry.postingIterator;
        CPPUNIT_ASSERT(pPostingIter != NULL);
        docid_t docId = pPostingIter->skipTo(0);
        CPPUNIT_ASSERT_EQUAL(hashVec[j].docId, docId);
        docId = pPostingIter->skipTo(docId + 1);
        CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
        j++;
    }

    // Keyword2 holds the constant "abc": a single term matching all 100 docs
    TermIteratorPtr pIterator2 = pTermReader->termIterator("Keyword2");
    CPPUNIT_ASSERT(!pIterator2.isNull());

    i = 0;
    while (pIterator2->hasNext())
    {
        TermIterator::TermEntry entry = pIterator2->next();
        const Term* pTerm = entry.term;
        const UInt64Term* pTermX = pTerm->cast<uint64_t>();
        uint64_t nCurTerm = pTermX->getValue();
        uint64_t nExpTerm = Hash::hashString64("abc");
        CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm);

        TermPostingIteratorPtr pPostingIter = entry.postingIterator;
        CPPUNIT_ASSERT(pPostingIter != NULL);

        // Postings are contiguous: expect doc ids 0, 1, ..., 99
        docid_t docId = 0;
        docid_t docIdRet = 0;
        while ((docIdRet = pPostingIter->skipTo(docId)) != INVALID_DOCID)
        {
            CPPUNIT_ASSERT_EQUAL(docId, docIdRet);
            ++docId;
        }
        CPPUNIT_ASSERT_EQUAL((docid_t)100, docId);
        i++;
    }
    CPPUNIT_ASSERT_EQUAL((size_t)1, i);
}
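// A sketch of the conventional CppUnit wiring for the cases above; the real
// registration lives in the corresponding class declarations (names assumed):
//
//     CPPUNIT_TEST_SUITE(IndexTestCase);
//     CPPUNIT_TEST(testKeywordIndex);
//     CPPUNIT_TEST(testInt32_IF);
//     CPPUNIT_TEST(testPrimaryKeyIndex);
//     CPPUNIT_TEST(testTextIndex);
//     CPPUNIT_TEST(testDocumentDeletion);
//     CPPUNIT_TEST_SUITE_END();
//
// with CPPUNIT_TEST_SUITE_REGISTRATION(IndexTestCase); in the .cpp file.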