void IndexTestCase::testKeywordIndex() { DocumentSchema schema; schema.addUnIndexedField("PATH"); schema.addField("Keyword", "KEYWORD", false); buildIndex(schema, "file1.txt, hello world."); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Keyword"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } Term term("Keyword", "hello world."); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); }
void DateTimeIndexTestCase::testCTF() { const size_t NUM_DOCS = 100; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << 2009 << "-" << i % 12 + 1 << "-" << i % 27 + 1 << " " << i % 24 << ":" << i % 60 << ":" << i % 60 << ";"; } buildDateTimeIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("DateTime1"); CPPUNIT_ASSERT(pTermIterator != NULL); df_t ttf = 0; while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); ttf += termMeta.getDocFreq(); } CPPUNIT_ASSERT_EQUAL((df_t)100, ttf); }
void IndexContentTestCase::testIndexContent_DL() { Index* pIndex; IndexReaderPtr pReader; const Term* pTerm; TermIteratorPtr pTermIter; int docCount = 0; int termCount = 0; uint32_t i; uint32_t indexTermId; string fileName; //Check posting list Path indexPath = TestHelper::getTestDataPath(); indexPath.makeDirectory(); indexPath.pushDirectory(_T("test_dlindex")); pIndex = new Index(indexPath.toString().c_str(), Index::READ, NULL); auto_ptr<Index> indexPtr(pIndex); pReader = pIndex->acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); pTermIter = pTermReader->termIterator("BODY"); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); //Iterator all terms while(pTermIter->next()) { pTerm = pTermIter->term(); CPPUNIT_ASSERT(pTermReader->seek(pTerm)); indexTermId = (pTerm->cast<int32_t>())->getValue(); docCount = 0; TermPostingIteratorPtr pTermDocFreqs = pTermReader->termPostings(); while(pTermDocFreqs->nextDoc()) { DocumentPtr pDoc = pDocReader->document(pTermDocFreqs->doc()); docCount++; // 获取文件路径 fileName.assign(pDoc->getField("PATH")->getValue().c_str()); TermList* pTermIdList = m_pDocScanner->getTermListOfFile(fileName); CPPUNIT_ASSERT(pTermIdList != NULL); for(i = 0, termCount = 0; i < pTermIdList->getSize(); i++) { if(indexTermId == pTermIdList->getValue(i)) { termCount++; } } CPPUNIT_ASSERT_EQUAL((tf_t)termCount, pTermDocFreqs->freq()); }//end while nextDoc() CPPUNIT_ASSERT_EQUAL((df_t)docCount, pTermDocFreqs->getDocFreq()); } CPPUNIT_ASSERT(m_pDocScanner->getTotalTermCount() == pReader->getNumTerms()); }
void KeywordIndexTestCase::testCTF() { GLOBAL_CONF().Build.buildThreadCount = 2; buildKeywordIndex("1, 2; 3, 4; 5, 6; 7, 8; 9, 10; " "1, 2; 3, 4; 5, 6; 7, 8; 9, 10"); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Keyword1"); CPPUNIT_ASSERT(pTermIterator != NULL); try { while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)2, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)2, termMeta.getCTF()); } } catch(const FirteXException& e) { cout << "ERROR:" << e.what() << endl; CPPUNIT_ASSERT(false); } }
void IndexTestCase::testInt32_IF() { DocumentSchema schema; schema.addField("Int32", "INT32_IF", false); const static size_t NUM_DOCS = 1000; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << (i % 100) << ";"; } GLOBAL_CONF().Build.buildThreadCount = 1; buildIndex(schema, ss.str()); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Int32"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)10, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)10, termMeta.getCTF()); } Term term("Int32", "0"); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(901); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); ForwardIndexIteratorPtr pForIndexIt = pReader->forwardIndexReader("Int32"); CPPUNIT_ASSERT(pForIndexIt != NULL); Int32ForwardIndexIteratorPtr pInt32ForIndexIt = pForIndexIt.cast<Int32ForwardIndexIterator>(); CPPUNIT_ASSERT(pInt32ForIndexIt != NULL); int32_t value = 0; docId = 0; for (; docId < (docid_t)NUM_DOCS; ++docId) { CPPUNIT_ASSERT(pInt32ForIndexIt->seek(docId, value)); CPPUNIT_ASSERT_EQUAL((int32_t)(docId % 100), value); } CPPUNIT_ASSERT(!pInt32ForIndexIt->seek(docId, value)); }
void DateTimeIndexTestCase::testPosting() { const size_t NUM_DOCS = 100; vector<TimeAndDoc> timeVec; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { int y = 2011; int m = (i % 12) + 1; int d = (i % 27) + 1; int h = i % 24; int min = i % 60; int s = i % 60; ss << y << "-" << m << "-" << d << " " << h << ":" << min << ":" << s << ";"; TimeAndDoc td; td.nTime = DateTimeAnalyzer::makeTime(y, m, d, h, min, s); td.docId = (docid_t)i; timeVec .push_back(td); } sort(timeVec.begin(), timeVec.end(), timeLess); GLOBAL_CONF().Build.buildThreadCount = 1; buildDateTimeIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); TermIteratorPtr pIterator = pTermReader->termIterator("DateTime1"); CPPUNIT_ASSERT(pIterator); size_t j = 0; while (pIterator->hasNext()) { TermIterator::TermEntry entry = pIterator->next(); const Term* pTerm = entry.term; const Int64Term* pTermX = pTerm->cast<int64_t>(); int64_t nCurTerm = pTermX->getValue(); int64_t nExpTerm = timeVec[j].nTime; CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = pPostingIter->skipTo(0); CPPUNIT_ASSERT_EQUAL(timeVec[j].docId, docId); docId = pPostingIter->skipTo(docId + 1); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); j++; } }
QueryExecutorPtr TermQuery::createExecutor(IndexReaderPtr& pIndexReader,
        FeatureProviderPtr& pProvider, PoolPtr& pPool) const
{
    // Look up the posting list of this query's term; a miss means the term
    // does not exist in the index and no executor can be built.
    TermReaderPtr pTermReader = pIndexReader->termReader();
    TermPostingIteratorPtr pPostIter = pTermReader->seek(m_pTerm.get());
    if (pPostIter.isNull())
    {
        return QueryExecutorPtr();
    }
    return QueryExecutorPtr(new TermQueryExecutor(this, pPostIter, pProvider, pPool));
}
void IndexTestCase::testPrimaryKeyIndex() { DocumentSchema schema; schema.addField("PK", "PRIMARY_KEY", false); const static size_t NUM_DOCS = 1000; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << i << ";"; } GLOBAL_CONF().Build.buildThreadCount = 1; buildIndex(schema, ss.str()); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("PK"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } for (size_t i = 0; i < NUM_DOCS; ++i) { stringstream ss2; ss2 << i; Term term("PK", ss2.str()); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)i, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); } }
void IndexTestCase::testTextIndex() { DocumentSchema schema; schema.addUnIndexedField("PATH"); schema.addTextField("CONTENT"); buildIndex(schema, "file1.txt, hello world."); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("CONTENT"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } Term term("CONTENT", "hello"); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); CPPUNIT_ASSERT(pDocReader); FieldSelector selector(pReader->getDocSchema(), true, false); ResultDoc resultDoc(0); bool ret = pDocReader->getDocument(selector, resultDoc); CPPUNIT_ASSERT(ret); CPPUNIT_ASSERT(resultDoc.size() > 0); }
void IndexTestCase::checkDocFreq(IndexReaderPtr& pIndexReader,
                                 const tstring& sField, const tstring& sTerm,
                                 df_t expDf)
{
    // Helper: assert that the analyzed form of sTerm has the expected
    // document frequency in sField.
    TermReaderPtr pTermReader = pIndexReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    // Run the raw text through the analyzer so we seek the same token form
    // that was written at build time; exactly one token is expected.
    StandardAnalyzer sa;
    sa.init();
    TokenViewPtr pTokens = sa.tokenize(sTerm.c_str(), sTerm.length());
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);

    TokenView::Iterator tokenIt = pTokens->iterator();
    TermPtr pTerm(new Term(sField, tokenIt.next().getTextValue()));
    TermPostingIteratorPtr pPosting = pTermReader->seek(pTerm.get());
    CPPUNIT_ASSERT(pPosting);
    CPPUNIT_ASSERT_EQUAL(expDf, pPosting->getTermMeta().getDocFreq());
}
void IndexTestCase::testDocumentDeletion() { DocumentSchema schema; schema.addField("URL", "PRIMARY_KEY", true); schema.addTextField("BODY"); schema.addField("MODIFIED", "INT64", true); stringstream ss1; const size_t NUM_DOCS = 1000; size_t i = 0; for (; i < NUM_DOCS; ++i) { ss1 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";"; } buildIndex(schema, ss1.str()); stringstream ss2; for (; i < 2 * NUM_DOCS; ++i) { ss2 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";"; } buildIndex(schema, ss2.str(), true); StandardAnalyzerPtr sa(new StandardAnalyzer()); sa->init(); TokenViewPtr pTokens = sa->tokenize("hot", 3); CPPUNIT_ASSERT(pTokens); CPPUNIT_ASSERT(pTokens->getNumTokens() == 1); TokenView::Iterator it = pTokens->iterator(); TermPtr pTerm(new Term("BODY", it.next().getTextValue())); tstring str = getTestPath(); std::set<docid_t> answer; { Index index; index.open(str, Index::RDWR, NULL); IndexWriterPtr pIndexWriter = index.acquireWriter(); CPPUNIT_ASSERT(pIndexWriter != NULL); IndexReaderPtr pIndexReader = index.acquireReader(); CPPUNIT_ASSERT(pIndexReader != NULL); for (size_t i = 0; i < 2 * NUM_DOCS; ++i) { stringstream ss; ss << "url" << i; if (i == 1000 || i == 1500 || i == 1505 || i == 1999) { pIndexWriter->deleteDocument(ss.str()); } else { TermReaderPtr pTermReader = pIndexReader->termReader(); TermPtr pTerm(new Term("URL", ss.str())); TermPostingIteratorPtr pIt = pTermReader->seek(pTerm.get()); docid_t docId = pIt->skipTo(0); answer.insert(docId); } } TermReaderPtr pTermReader = pIndexReader->termReader(); TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get()); CPPUNIT_ASSERT(pDocFreqs); CPPUNIT_ASSERT_EQUAL((df_t)NUM_DOCS * 2, pDocFreqs->getTermMeta().getDocFreq()); std::set<docid_t>::const_iterator it = answer.begin(); for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); ) { docid_t docId = pDocFreqs->skipTo((docid_t)i); i = docId + 1; if (docId == INVALID_DOCID) { break; } CPPUNIT_ASSERT_EQUAL(*it, docId); ++it; } 
CPPUNIT_ASSERT(it == answer.end()); } { Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pIndexReader = index.acquireReader(); CPPUNIT_ASSERT(pIndexReader != NULL); TermReaderPtr pTermReader = pIndexReader->termReader(); TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get()); CPPUNIT_ASSERT(pDocFreqs); CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS), pDocFreqs->getTermMeta().getDocFreq()); std::set<docid_t>::const_iterator it = answer.begin(); for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); ) { docid_t docId = pDocFreqs->skipTo((docid_t)i); i = docId + 1; if (docId == INVALID_DOCID) { break; } CPPUNIT_ASSERT_EQUAL(*it, docId); ++it; } CPPUNIT_ASSERT(it == answer.end()); // for (std::set<docid_t>::const_iterator it = answer.begin(); // it != answer.end(); ++it) // { // docid_t docId = pDocFreqs->skipTo(*it); // CPPUNIT_ASSERT_EQUAL(*it, docId); // } // docid_t docId = pDocFreqs->skipTo(NUM_DOCS + 0); // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 1, docId); // docId = pDocFreqs->skipTo(NUM_DOCS + 500); // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 501, docId); // docId = pDocFreqs->skipTo(NUM_DOCS + 505); // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 506, docId); // docId = pDocFreqs->skipTo(2 * NUM_DOCS - 1); // CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); } }
void IndexContentTestCase::testIndexContent_WL() { Index* pIndex; IndexReaderPtr pReader; const Term* pTerm; TermIteratorPtr pTermIter; int docCount = 0; int termCount = 0; int pos = -1; uint32_t indexTermId; string fileName; //Check posting list Path indexPath = TestHelper::getTestDataPath(); indexPath.makeDirectory(); indexPath.pushDirectory(_T("test_wlindex")); pIndex = new Index(indexPath.toString().c_str(), Index::READ, NULL); auto_ptr<Index> indexPtr(pIndex); CPPUNIT_ASSERT(pIndex != NULL); pReader = pIndex->acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); pTermIter = pTermReader->termIterator("BODY"); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); //Iterator all terms while(pTermIter->next()) { pTerm = pTermIter->term(); CPPUNIT_ASSERT(pTermReader->seek(pTerm)); indexTermId = (pTerm->cast<int32_t>())->getValue(); TermPositionIteratorPtr pPositions = pTermReader->termPositions(); docCount = 0; while(pPositions->nextDoc()) { DocumentPtr pDoc = pDocReader->document(pPositions->doc()); docCount++; fileName.assign(pDoc->getField("PATH")->getValue().c_str()); TermList* pTermIdList = m_pDocScanner->getTermListOfFile(fileName); CPPUNIT_ASSERT(pTermIdList != NULL); pos = pPositions->nextPosition(); termCount = 0; while(pos != -1) { termCount++; CPPUNIT_ASSERT(indexTermId == pTermIdList->getValue(pos)); pos = pPositions->nextPosition(); } CPPUNIT_ASSERT(termCount == pPositions->freq()); }//end while nextDoc() CPPUNIT_ASSERT(docCount == pPositions->getDocFreq()); } CPPUNIT_ASSERT_EQUAL((int64_t)m_pDocScanner->getTotalTermCount(), (int64_t)pReader->getNumTerms()); }
void KeywordIndexTestCase::testPosting() { const size_t NUM_DOCS = 100; vector<HashAndDoc> hashVec; stringstream ss; size_t i; for (i = 0; i < NUM_DOCS; ++i) { stringstream ss1; ss1 << "abc" << i; ss << ss1.str() << ", " << "abc" << ";"; HashAndDoc hd; hd.nHash = Hash::hashString64(ss1.str().c_str()); hd.docId = (docid_t)i; hashVec.push_back(hd); } GLOBAL_CONF().Build.buildThreadCount = 1; buildKeywordIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); TermIteratorPtr pIterator = pTermReader->termIterator("Keyword1"); CPPUNIT_ASSERT(!pIterator.isNull()); sort(hashVec.begin(), hashVec.end(), hashLess); int32_t j = 0; while (pIterator->hasNext()) { TermIterator::TermEntry entry = pIterator->next(); const Term* pTerm = entry.term; const UInt64Term* pTermX = pTerm->cast<uint64_t>(); uint64_t nCurTerm = pTermX->getValue(); uint64_t nExpTerm = hashVec[j].nHash; CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = pPostingIter->skipTo(0); CPPUNIT_ASSERT_EQUAL(hashVec[j].docId, docId); docId = pPostingIter->skipTo(docId + 1); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); j++; } //test abc term TermIteratorPtr pIterator2 = pTermReader->termIterator("Keyword2"); CPPUNIT_ASSERT(!pIterator2.isNull()); i = 0; while (pIterator2->hasNext()) { TermIterator::TermEntry entry = pIterator2->next(); const Term* pTerm = entry.term; const UInt64Term* pTermX = pTerm->cast<uint64_t>(); uint64_t nCurTerm = pTermX->getValue(); uint64_t nExpTerm = Hash::hashString64("abc"); CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = 0; docid_t docIdRet = 0; while ((docIdRet = pPostingIter->skipTo(docId)) != INVALID_DOCID) { CPPUNIT_ASSERT_EQUAL(docId++, docIdRet); } 
CPPUNIT_ASSERT_EQUAL((docid_t)100, docId); i++; } CPPUNIT_ASSERT_EQUAL((size_t)1, i); }