void IndexContentTestCase::testIndexContent_DL() { Index* pIndex; IndexReaderPtr pReader; const Term* pTerm; TermIteratorPtr pTermIter; int docCount = 0; int termCount = 0; uint32_t i; uint32_t indexTermId; string fileName; //Check posting list Path indexPath = TestHelper::getTestDataPath(); indexPath.makeDirectory(); indexPath.pushDirectory(_T("test_dlindex")); pIndex = new Index(indexPath.toString().c_str(), Index::READ, NULL); auto_ptr<Index> indexPtr(pIndex); pReader = pIndex->acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); pTermIter = pTermReader->termIterator("BODY"); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); //Iterator all terms while(pTermIter->next()) { pTerm = pTermIter->term(); CPPUNIT_ASSERT(pTermReader->seek(pTerm)); indexTermId = (pTerm->cast<int32_t>())->getValue(); docCount = 0; TermPostingIteratorPtr pTermDocFreqs = pTermReader->termPostings(); while(pTermDocFreqs->nextDoc()) { DocumentPtr pDoc = pDocReader->document(pTermDocFreqs->doc()); docCount++; // 获取文件路径 fileName.assign(pDoc->getField("PATH")->getValue().c_str()); TermList* pTermIdList = m_pDocScanner->getTermListOfFile(fileName); CPPUNIT_ASSERT(pTermIdList != NULL); for(i = 0, termCount = 0; i < pTermIdList->getSize(); i++) { if(indexTermId == pTermIdList->getValue(i)) { termCount++; } } CPPUNIT_ASSERT_EQUAL((tf_t)termCount, pTermDocFreqs->freq()); }//end while nextDoc() CPPUNIT_ASSERT_EQUAL((df_t)docCount, pTermDocFreqs->getDocFreq()); } CPPUNIT_ASSERT(m_pDocScanner->getTotalTermCount() == pReader->getNumTerms()); }
void IndexTestCase::testKeywordIndex() { DocumentSchema schema; schema.addUnIndexedField("PATH"); schema.addField("Keyword", "KEYWORD", false); buildIndex(schema, "file1.txt, hello world."); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Keyword"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } Term term("Keyword", "hello world."); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); }
void DateTimeIndexTestCase::testCTF() { const size_t NUM_DOCS = 100; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << 2009 << "-" << i % 12 + 1 << "-" << i % 27 + 1 << " " << i % 24 << ":" << i % 60 << ":" << i % 60 << ";"; } buildDateTimeIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("DateTime1"); CPPUNIT_ASSERT(pTermIterator != NULL); df_t ttf = 0; while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); ttf += termMeta.getDocFreq(); } CPPUNIT_ASSERT_EQUAL((df_t)100, ttf); }
void KeywordIndexTestCase::testCTF() { GLOBAL_CONF().Build.buildThreadCount = 2; buildKeywordIndex("1, 2; 3, 4; 5, 6; 7, 8; 9, 10; " "1, 2; 3, 4; 5, 6; 7, 8; 9, 10"); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Keyword1"); CPPUNIT_ASSERT(pTermIterator != NULL); try { while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)2, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)2, termMeta.getCTF()); } } catch(const FirteXException& e) { cout << "ERROR:" << e.what() << endl; CPPUNIT_ASSERT(false); } }
void IndexTestCase::testInt32_IF() { DocumentSchema schema; schema.addField("Int32", "INT32_IF", false); const static size_t NUM_DOCS = 1000; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << (i % 100) << ";"; } GLOBAL_CONF().Build.buildThreadCount = 1; buildIndex(schema, ss.str()); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("Int32"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)10, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)10, termMeta.getCTF()); } Term term("Int32", "0"); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(901); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); ForwardIndexIteratorPtr pForIndexIt = pReader->forwardIndexReader("Int32"); CPPUNIT_ASSERT(pForIndexIt != NULL); Int32ForwardIndexIteratorPtr pInt32ForIndexIt = pForIndexIt.cast<Int32ForwardIndexIterator>(); CPPUNIT_ASSERT(pInt32ForIndexIt != NULL); int32_t value = 0; docId = 0; for (; docId < (docid_t)NUM_DOCS; ++docId) { CPPUNIT_ASSERT(pInt32ForIndexIt->seek(docId, value)); CPPUNIT_ASSERT_EQUAL((int32_t)(docId % 100), value); } CPPUNIT_ASSERT(!pInt32ForIndexIt->seek(docId, value)); }
void DateTimeIndexTestCase::testPosting() { const size_t NUM_DOCS = 100; vector<TimeAndDoc> timeVec; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { int y = 2011; int m = (i % 12) + 1; int d = (i % 27) + 1; int h = i % 24; int min = i % 60; int s = i % 60; ss << y << "-" << m << "-" << d << " " << h << ":" << min << ":" << s << ";"; TimeAndDoc td; td.nTime = DateTimeAnalyzer::makeTime(y, m, d, h, min, s); td.docId = (docid_t)i; timeVec .push_back(td); } sort(timeVec.begin(), timeVec.end(), timeLess); GLOBAL_CONF().Build.buildThreadCount = 1; buildDateTimeIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); TermIteratorPtr pIterator = pTermReader->termIterator("DateTime1"); CPPUNIT_ASSERT(pIterator); size_t j = 0; while (pIterator->hasNext()) { TermIterator::TermEntry entry = pIterator->next(); const Term* pTerm = entry.term; const Int64Term* pTermX = pTerm->cast<int64_t>(); int64_t nCurTerm = pTermX->getValue(); int64_t nExpTerm = timeVec[j].nTime; CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = pPostingIter->skipTo(0); CPPUNIT_ASSERT_EQUAL(timeVec[j].docId, docId); docId = pPostingIter->skipTo(docId + 1); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); j++; } }
void IndexTestCase::testPrimaryKeyIndex() { DocumentSchema schema; schema.addField("PK", "PRIMARY_KEY", false); const static size_t NUM_DOCS = 1000; stringstream ss; for (size_t i = 0; i < NUM_DOCS; ++i) { ss << i << ";"; } GLOBAL_CONF().Build.buildThreadCount = 1; buildIndex(schema, ss.str()); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("PK"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } for (size_t i = 0; i < NUM_DOCS; ++i) { stringstream ss2; ss2 << i; Term term("PK", ss2.str()); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)i, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); } }
// Checks a merged term iterator against the expected postings in `answer`.
//
// pTermIt: iterator over the merged terms; must be non-null and its size()
//          must equal answer.size().
// answer:  expected postings keyed by term hash. As used here, each mapped
//          value is an indexable sequence whose elements carry a doc id in
//          `.first` and an indexable sequence of positions in `.second`.
//
// For every term the document frequency, each document's term frequency, each
// position and the collection term frequency are compared with `answer`, and
// the number of terms visited must equal answer.size().
void TextIndexMergerTestCase::checkMergedResult(TermIteratorPtr& pTermIt, TermMap& answer)
{
    CPPUNIT_ASSERT(pTermIt);
    CPPUNIT_ASSERT_EQUAL((int64_t)answer.size(), pTermIt->size());

    size_t visitedTerms = 0;
    for (; pTermIt->hasNext(); ++visitedTerms)
    {
        TermIterator::TermEntry entry = pTermIt->next();

        const TextTermIterator::TermType* pTextTerm =
            dynamic_cast<const TextTermIterator::TermType*>(entry.term);
        CPPUNIT_ASSERT(pTextTerm != NULL);

        uint64_t hash = pTextTerm->getValue();
        FX_TRACE("Term hash: %llu", hash);

        // The term must be one of the expected ones.
        TermMap::const_iterator expected = answer.find(hash);
        CPPUNIT_ASSERT(expected != answer.end());
        CPPUNIT_ASSERT_EQUAL(expected->first, hash);

        const TermMeta& meta = entry.postingIterator->getTermMeta();
        size_t expectedDf = expected->second.size();
        CPPUNIT_ASSERT_EQUAL((df_t)expectedDf, meta.getDocFreq());

        // Walk the expected documents in order, accumulating the expected CTF.
        ctf_t expectedCtf = 0;
        for (size_t d = 0; d < expectedDf; ++d)
        {
            docid_t docId =
                entry.postingIterator->skipTo(expected->second[d].first);
            CPPUNIT_ASSERT_EQUAL(expected->second[d].first, docId);

            tf_t tf = expected->second[d].second.size();
            CPPUNIT_ASSERT_EQUAL(tf, entry.postingIterator->freq());

            for (size_t p = 0; p < (size_t)tf; ++p)
            {
                loc_t posExp = expected->second[d].second[p];
                loc_t pos = entry.postingIterator->skipToPosition(posExp);
                CPPUNIT_ASSERT_EQUAL(posExp, pos);
            }

            expectedCtf += expected->second[d].second.size();
        }
        CPPUNIT_ASSERT_EQUAL(expectedCtf, meta.getCTF());
    }

    CPPUNIT_ASSERT_EQUAL(answer.size(), visitedTerms);
}
void IndexTestCase::testTextIndex() { DocumentSchema schema; schema.addUnIndexedField("PATH"); schema.addTextField("CONTENT"); buildIndex(schema, "file1.txt, hello world."); tstring str = getTestPath(); Index index; index.open(str, Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); CPPUNIT_ASSERT(pReader != NULL); TermReaderPtr pTermReader = pReader->termReader(); CPPUNIT_ASSERT(pTermReader); TermIteratorPtr pTermIterator = pTermReader->termIterator("CONTENT"); CPPUNIT_ASSERT(pTermIterator != NULL); while (pTermIterator->hasNext()) { TermIterator::TermEntry entry = pTermIterator->next(); const TermMeta& termMeta = entry.postingIterator->getTermMeta(); CPPUNIT_ASSERT_EQUAL((df_t)1, termMeta.getDocFreq()); CPPUNIT_ASSERT_EQUAL((ctf_t)1, termMeta.getCTF()); } Term term("CONTENT", "hello"); TermPostingIteratorPtr pPost = pTermReader->seek(&term); CPPUNIT_ASSERT(pPost); docid_t docId = pPost->skipTo(0); CPPUNIT_ASSERT_EQUAL((docid_t)0, docId); docId = pPost->skipTo(++docId); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); CPPUNIT_ASSERT(pDocReader); FieldSelector selector(pReader->getDocSchema(), true, false); ResultDoc resultDoc(0); bool ret = pDocReader->getDocument(selector, resultDoc); CPPUNIT_ASSERT(ret); CPPUNIT_ASSERT(resultDoc.size() > 0); }
void IndexContentTestCase::testIndexContent_WL() { Index* pIndex; IndexReaderPtr pReader; const Term* pTerm; TermIteratorPtr pTermIter; int docCount = 0; int termCount = 0; int pos = -1; uint32_t indexTermId; string fileName; //Check posting list Path indexPath = TestHelper::getTestDataPath(); indexPath.makeDirectory(); indexPath.pushDirectory(_T("test_wlindex")); pIndex = new Index(indexPath.toString().c_str(), Index::READ, NULL); auto_ptr<Index> indexPtr(pIndex); CPPUNIT_ASSERT(pIndex != NULL); pReader = pIndex->acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); pTermIter = pTermReader->termIterator("BODY"); StoredFieldsReaderPtr pDocReader = pReader->createStoredFieldsReader(); //Iterator all terms while(pTermIter->next()) { pTerm = pTermIter->term(); CPPUNIT_ASSERT(pTermReader->seek(pTerm)); indexTermId = (pTerm->cast<int32_t>())->getValue(); TermPositionIteratorPtr pPositions = pTermReader->termPositions(); docCount = 0; while(pPositions->nextDoc()) { DocumentPtr pDoc = pDocReader->document(pPositions->doc()); docCount++; fileName.assign(pDoc->getField("PATH")->getValue().c_str()); TermList* pTermIdList = m_pDocScanner->getTermListOfFile(fileName); CPPUNIT_ASSERT(pTermIdList != NULL); pos = pPositions->nextPosition(); termCount = 0; while(pos != -1) { termCount++; CPPUNIT_ASSERT(indexTermId == pTermIdList->getValue(pos)); pos = pPositions->nextPosition(); } CPPUNIT_ASSERT(termCount == pPositions->freq()); }//end while nextDoc() CPPUNIT_ASSERT(docCount == pPositions->getDocFreq()); } CPPUNIT_ASSERT_EQUAL((int64_t)m_pDocScanner->getTotalTermCount(), (int64_t)pReader->getNumTerms()); }
// Debug helper: dumps every term of `pTermIt` to std::cout.
//
// For each term it prints the term hash, the document frequency and CTF from
// the term meta, then every decoded doc-id record with its matching tf record,
// and finally every decoded position record. Throws IndexCollapseException
// (via FIRTEX_THROW) when a tf record does not contain as many entries as the
// doc record decoded just before it.
void TextIndexMergerTestCase::printPositng(TermIteratorPtr& pTermIt)
{
    while (pTermIt->hasNext())
    {
        TermIterator::TermEntry entry = pTermIt->next();

        const TextTermIterator::TermType* pTextTerm =
            dynamic_cast<const TextTermIterator::TermType*>(entry.term);
        CPPUNIT_ASSERT(pTextTerm != NULL);

        uint64_t hash = pTextTerm->getValue();
        std::cout << "Term hash: " << hash << std::endl;

        const TermMeta& meta = entry.postingIterator->getTermMeta();
        std::cout << "Term meta: df: " << meta.getDocFreq()
                  << ", ctf: " << meta.getCTF() << std::endl;

        PostingDecoderPtr decoder = entry.postingIterator->getPostingDecoder();

        // Doc-id / tf records.
        docid_t docBuffer[RECORD_SIZE];
        tf_t tfBuffer[RECORD_SIZE];
        docid_t lastDocId = 0;
        for (;;)
        {
            uint32_t nDocs = decoder->decodeDocRecord(docBuffer, lastDocId);
            if (nDocs == 0)
            {
                break;
            }
            if (decoder->decodeTfRecord(tfBuffer) != nDocs)
            {
                FIRTEX_THROW(IndexCollapseException, "Doc and Tf record is inconsistant.");
            }

            std::cout << "doc list: " << std::endl;
            for (size_t k = 0; k < nDocs; ++k)
            {
                std::cout << docBuffer[k] << ", ";
            }
            std::cout << "end doc list: " << std::endl;

            std::cout << "tf list: " << std::endl;
            for (size_t k = 0; k < nDocs; ++k)
            {
                std::cout << tfBuffer[k] << ", ";
            }
            std::cout << "end tf list: " << std::endl;

            // The next record is decoded starting after the last doc id seen.
            lastDocId = docBuffer[nDocs - 1] + 1;
        }

        // Position records.
        loc_t posBuffer[RECORD_SIZE];
        uint32_t startOff = 0;
        uint32_t nTotalDecodedPos = 0;
        for (;;)
        {
            uint32_t nPositions =
                decoder->decodePosRecord(posBuffer, nTotalDecodedPos, startOff);
            if (nPositions == 0)
            {
                break;
            }

            std::cout << "pos list: " << std::endl;
            for (size_t k = 0; k < nPositions; ++k)
            {
                std::cout << posBuffer[k] << ", ";
            }
            std::cout << "end pos list: " << std::endl;

            nTotalDecodedPos += nPositions;
        }
    }
}
void KeywordIndexTestCase::testPosting() { const size_t NUM_DOCS = 100; vector<HashAndDoc> hashVec; stringstream ss; size_t i; for (i = 0; i < NUM_DOCS; ++i) { stringstream ss1; ss1 << "abc" << i; ss << ss1.str() << ", " << "abc" << ";"; HashAndDoc hd; hd.nHash = Hash::hashString64(ss1.str().c_str()); hd.docId = (docid_t)i; hashVec.push_back(hd); } GLOBAL_CONF().Build.buildThreadCount = 1; buildKeywordIndex(ss.str()); Index index; index.open(getIndexPath(), Index::READ, NULL); IndexReaderPtr pReader = index.acquireReader(); TermReaderPtr pTermReader = pReader->termReader(); TermIteratorPtr pIterator = pTermReader->termIterator("Keyword1"); CPPUNIT_ASSERT(!pIterator.isNull()); sort(hashVec.begin(), hashVec.end(), hashLess); int32_t j = 0; while (pIterator->hasNext()) { TermIterator::TermEntry entry = pIterator->next(); const Term* pTerm = entry.term; const UInt64Term* pTermX = pTerm->cast<uint64_t>(); uint64_t nCurTerm = pTermX->getValue(); uint64_t nExpTerm = hashVec[j].nHash; CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = pPostingIter->skipTo(0); CPPUNIT_ASSERT_EQUAL(hashVec[j].docId, docId); docId = pPostingIter->skipTo(docId + 1); CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId); j++; } //test abc term TermIteratorPtr pIterator2 = pTermReader->termIterator("Keyword2"); CPPUNIT_ASSERT(!pIterator2.isNull()); i = 0; while (pIterator2->hasNext()) { TermIterator::TermEntry entry = pIterator2->next(); const Term* pTerm = entry.term; const UInt64Term* pTermX = pTerm->cast<uint64_t>(); uint64_t nCurTerm = pTermX->getValue(); uint64_t nExpTerm = Hash::hashString64("abc"); CPPUNIT_ASSERT_EQUAL(nExpTerm, nCurTerm); TermPostingIteratorPtr pPostingIter = entry.postingIterator; CPPUNIT_ASSERT(pPostingIter != NULL); docid_t docId = 0; docid_t docIdRet = 0; while ((docIdRet = pPostingIter->skipTo(docId)) != INVALID_DOCID) { CPPUNIT_ASSERT_EQUAL(docId++, docIdRet); } 
CPPUNIT_ASSERT_EQUAL((docid_t)100, docId); i++; } CPPUNIT_ASSERT_EQUAL((size_t)1, i); }