TEST_F(SegmentTermEnumTest, testPrevTermAtEnd) {
    DirectoryPtr dir = newLucene<MockRAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    addDoc(writer, L"aaa bbb");
    writer->close();
    SegmentReaderPtr reader = SegmentReader::getOnlySegmentReader(dir);
    SegmentTermEnumPtr termEnum = boost::dynamic_pointer_cast<SegmentTermEnum>(reader->terms());
    EXPECT_TRUE(termEnum->next());
    EXPECT_EQ(L"aaa", termEnum->term()->text());
    EXPECT_TRUE(termEnum->next());
    EXPECT_EQ(L"aaa", termEnum->prev()->text());
    EXPECT_EQ(L"bbb", termEnum->term()->text());
    EXPECT_TRUE(!termEnum->next());
    EXPECT_EQ(L"bbb", termEnum->prev()->text());
}
MultiThreadTermVectorsFixture() {
    directory = newLucene<RAMDirectory>();
    numDocs = 100;
    numThreads = 3;
    IndexWriterPtr writer = newLucene<IndexWriter>(directory, newLucene<SimpleAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    for (int32_t i = 0; i < numDocs; ++i) {
        DocumentPtr doc = newLucene<Document>();
        FieldablePtr fld = newLucene<Field>(L"field", intToEnglish(i), Field::STORE_YES, Field::INDEX_NOT_ANALYZED, Field::TERM_VECTOR_YES);
        doc->add(fld);
        writer->addDocument(doc);
    }
    writer->close();
}
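// intToEnglish() above comes from the shared test utilities (a port of Lucene's
// English.intToEnglish); it spells an integer out in words so each document body carries
// several terms. A simplified, hypothetical sketch covering only the values 0..99 that
// this fixture needs (the real helper handles arbitrary integers):
static String intToEnglish(int32_t i) {
    static const wchar_t* ones[] = {
        L"zero", L"one", L"two", L"three", L"four", L"five", L"six", L"seven", L"eight", L"nine",
        L"ten", L"eleven", L"twelve", L"thirteen", L"fourteen", L"fifteen", L"sixteen",
        L"seventeen", L"eighteen", L"nineteen"
    };
    static const wchar_t* tens[] = {
        L"", L"", L"twenty", L"thirty", L"forty", L"fifty", L"sixty", L"seventy", L"eighty", L"ninety"
    };
    if (i < 20) {
        return ones[i];
    }
    String result = tens[i / 10];
    if (i % 10 != 0) {
        result += String(L" ") + ones[i % 10];
    }
    return result;
}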
/// One-time setup to initialise static members
void setup() {
    // set the theoretical maximum term count for 8 bit (see docs for the number)
    BooleanQuery::setMaxClauseCount(3 * 255 * 2 + 255);
    directory = newLucene<RAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(directory, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthUNLIMITED);
    NumericFieldPtr field8 = newLucene<NumericField>(L"field8", 8, Field::STORE_YES, true);
    NumericFieldPtr field4 = newLucene<NumericField>(L"field4", 4, Field::STORE_YES, true);
    NumericFieldPtr field2 = newLucene<NumericField>(L"field2", 2, Field::STORE_YES, true);
    NumericFieldPtr fieldNoTrie = newLucene<NumericField>(L"field" + StringUtils::toString(INT_MAX), INT_MAX, Field::STORE_YES, true);
    NumericFieldPtr ascfield8 = newLucene<NumericField>(L"ascfield8", 8, Field::STORE_NO, true);
    NumericFieldPtr ascfield4 = newLucene<NumericField>(L"ascfield4", 4, Field::STORE_NO, true);
    NumericFieldPtr ascfield2 = newLucene<NumericField>(L"ascfield2", 2, Field::STORE_NO, true);
    DocumentPtr doc = newLucene<Document>();
    // add fields that have a distance to test general functionality
    doc->add(field8);
    doc->add(field4);
    doc->add(field2);
    doc->add(fieldNoTrie);
    // add ascending fields with a distance of 1, beginning at -noDocs/2, to test the correct
    // splitting of range and inclusive/exclusive bounds
    doc->add(ascfield8);
    doc->add(ascfield4);
    doc->add(ascfield2);
    // add a series of noDocs docs with increasing int values
    for (int32_t l = 0; l < noDocs; ++l) {
        int32_t val = distance * l + startOffset;
        field8->setIntValue(val);
        field4->setIntValue(val);
        field2->setIntValue(val);
        fieldNoTrie->setIntValue(val);
        val = l - (noDocs / 2);
        ascfield8->setIntValue(val);
        ascfield4->setIntValue(val);
        ascfield2->setIntValue(val);
        writer->addDocument(doc);
    }
    writer->optimize();
    writer->close();
    searcher = newLucene<IndexSearcher>(directory, true);
}
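// A minimal sketch of how this fixture is typically exercised, assuming the Lucene++
// NumericRangeQuery factory API mirrors the Java original; the test name and the inclusive
// bounds below are illustrative, not part of the original fixture:
TEST_F(NumericRangeQuery32Test, testFullRangeSketch) {
    int32_t lower = startOffset;
    int32_t upper = startOffset + distance * (noDocs - 1);
    // precisionStep 8 matches the "field8" trie field built in setup()
    NumericRangeQueryPtr q = NumericRangeQuery::newIntRange(L"field8", 8, lower, upper, true, true);
    TopDocsPtr topDocs = searcher->search(q, FilterPtr(), noDocs);
    EXPECT_EQ(noDocs, topDocs->totalHits); // every document falls inside the inclusive range
}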
static void addDocs2(IndexWriterPtr writer, int32_t numDocs) {
    for (int32_t i = 0; i < numDocs; ++i) {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(L"content", L"bbb", Field::STORE_NO, Field::INDEX_ANALYZED));
        writer->addDocument(doc);
    }
}
double checkPhraseQuery(DocumentPtr doc, PhraseQueryPtr query, int32_t slop, int32_t expectedNumResults) {
    query->setSlop(slop);
    RAMDirectoryPtr ramDir = newLucene<RAMDirectory>();
    WhitespaceAnalyzerPtr analyzer = newLucene<WhitespaceAnalyzer>();
    IndexWriterPtr writer = newLucene<IndexWriter>(ramDir, analyzer, IndexWriter::MaxFieldLengthUNLIMITED);
    writer->addDocument(doc);
    writer->close();
    IndexSearcherPtr searcher = newLucene<IndexSearcher>(ramDir, true);
    TopDocsPtr td = searcher->search(query, FilterPtr(), 10);
    BOOST_CHECK_EQUAL(expectedNumResults, td->totalHits);
    searcher->close();
    ramDir->close();
    return td->maxScore;
}
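// Hypothetical usage of checkPhraseQuery(): index one document and verify that an exact
// phrase matches with slop 0. The field name and text below are illustrative only:
DocumentPtr doc = newLucene<Document>();
doc->add(newLucene<Field>(L"field", L"one two three four five", Field::STORE_YES, Field::INDEX_ANALYZED));
PhraseQueryPtr query = newLucene<PhraseQuery>();
query->add(newLucene<Term>(L"field", L"two"));
query->add(newLucene<Term>(L"field", L"three"));
double maxScore = checkPhraseQuery(doc, query, 0, 1); // expect exactly one hit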
void doBody(int32_t j, Collection<DirectoryPtr> dirs) {
    switch (j % 4) {
    case 0:
        mainWriter->addIndexesNoOptimize(dirs);
        mainWriter->optimize();
        break;
    case 1:
        mainWriter->addIndexesNoOptimize(dirs);
        numAddIndexesNoOptimize->incrementAndGet();
        break;
    case 2:
        mainWriter->addIndexes(readers);
        break;
    case 3:
        mainWriter->commit();
        break;
    }
    count->addAndGet(dirs.size() * NUM_INIT_DOCS);
}
virtual void doWork() {
    // Update all 100 docs
    for (int32_t i = 0; i < 100; ++i) {
        DocumentPtr d = newLucene<Document>();
        d->add(newLucene<Field>(L"id", StringUtils::toString(i), Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        d->add(newLucene<Field>(L"contents", intToEnglish(i), Field::STORE_NO, Field::INDEX_ANALYZED));
        writer->updateDocument(newLucene<Term>(L"id", StringUtils::toString(i)), d);
    }
}
virtual void run() {
    try {
        for (int32_t j = 0; j < numDocs; ++j) {
            DocumentPtr d = newLucene<Document>();
            d->add(newLucene<Field>(L"test", newLucene<PoolingPayloadTokenStream>(pool)));
            writer->addDocument(d);
        }
    } catch (LuceneException& e) {
        FAIL() << "Unexpected exception: " << e.getError();
    }
}
void createIndex(const DirectoryPtr& dir, bool multiSegment) {
    IndexWriter::unlock(dir);
    IndexWriterPtr w = newLucene<IndexWriter>(dir, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);
    w->setMergePolicy(newLucene<LogDocMergePolicy>(w));
    for (int32_t i = 0; i < 100; ++i) {
        w->addDocument(createDocument(i, 4));
        if (multiSegment && (i % 10) == 0) {
            w->commit();
        }
    }
    if (!multiSegment) {
        w->optimize();
    }
    w->close();
    IndexReaderPtr r = IndexReader::open(dir, false);
    if (multiSegment) {
        EXPECT_TRUE(r->getSequentialSubReaders().size() > 1);
    } else {
        EXPECT_EQ(r->getSequentialSubReaders().size(), 1);
    }
    r->close();
}
PrefixInBooleanQueryFixture() {
    directory = newLucene<RAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(directory, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    for (int32_t i = 0; i < 5137; ++i) {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(FIELD, L"meaninglessnames", Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        writer->addDocument(doc);
    }
    {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(FIELD, L"tangfulin", Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        writer->addDocument(doc);
    }
    for (int32_t i = 5138; i < 11377; ++i) {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(FIELD, L"meaninglessnames", Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        writer->addDocument(doc);
    }
    {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(FIELD, L"tangfulin", Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        writer->addDocument(doc);
    }
    writer->close();
}
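// A hypothetical test built on this fixture: a PrefixQuery wrapped in a BooleanQuery should
// still find both "tangfulin" documents among the surrounding filler docs. The test name is
// illustrative; FIELD and directory are the fixture members used above:
TEST_F(PrefixInBooleanQueryTest, testPrefixQueryInBooleanQuery) {
    BooleanQueryPtr query = newLucene<BooleanQuery>();
    query->add(newLucene<PrefixQuery>(newLucene<Term>(FIELD, L"tang")), BooleanClause::SHOULD);
    IndexSearcherPtr searcher = newLucene<IndexSearcher>(directory, true);
    EXPECT_EQ(2, searcher->search(query, FilterPtr(), 1000)->totalHits);
    searcher->close();
}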
void createIndex(int32_t numHits) {
    int32_t numDocs = 500;
    DirectoryPtr directory = newLucene<SeekCountingDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(directory, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    writer->setUseCompoundFile(false);
    writer->setMaxBufferedDocs(10);
    for (int32_t i = 0; i < numDocs; ++i) {
        DocumentPtr doc = newLucene<Document>();
        String content;
        if (i % (numDocs / numHits) == 0) {
            // add a document that matches the query "term1 term2"
            content = term1 + L" " + term2;
        } else if (i % 15 == 0) {
            // add a document that only contains term1
            content = term1 + L" " + term1;
        } else {
            // add a document that contains term2 but not term1
            content = term3 + L" " + term2;
        }
        doc->add(newLucene<Field>(field, content, Field::STORE_YES, Field::INDEX_ANALYZED));
        writer->addDocument(doc);
    }
    // make sure the index has only a single segment
    writer->optimize();
    writer->close();
    SegmentReaderPtr reader = SegmentReader::getOnlySegmentReader(directory);
    searcher = newLucene<IndexSearcher>(reader);
}
FieldCacheSanityCheckerTestFixture() {
    RAMDirectoryPtr dirA = newLucene<RAMDirectory>();
    RAMDirectoryPtr dirB = newLucene<RAMDirectory>();
    IndexWriterPtr wA = newLucene<IndexWriter>(dirA, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    IndexWriterPtr wB = newLucene<IndexWriter>(dirB, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
    int64_t theLong = LLONG_MAX;
    double theDouble = DBL_MAX;
    uint8_t theByte = UCHAR_MAX;
    int32_t theInt = INT_MAX;
    for (int32_t i = 0; i < NUM_DOCS; ++i) {
        DocumentPtr doc = newLucene<Document>();
        doc->add(newLucene<Field>(L"theLong", StringUtils::toString(theLong--), Field::STORE_NO, Field::INDEX_NOT_ANALYZED));
        doc->add(newLucene<Field>(L"theDouble", StringUtils::toString(theDouble--), Field::STORE_NO, Field::INDEX_NOT_ANALYZED));
        doc->add(newLucene<Field>(L"theByte", StringUtils::toString(theByte--), Field::STORE_NO, Field::INDEX_NOT_ANALYZED));
        doc->add(newLucene<Field>(L"theInt", StringUtils::toString(theInt--), Field::STORE_NO, Field::INDEX_NOT_ANALYZED));
        if (i % 3 == 0) {
            wA->addDocument(doc);
        } else {
            wB->addDocument(doc);
        }
    }
    wA->close();
    wB->close();
    readerA = IndexReader::open(dirA, true);
    readerB = IndexReader::open(dirB, true);
    readerX = newLucene<MultiReader>(newCollection<IndexReaderPtr>(readerA, readerB));
}
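// A minimal sanity-check sketch against the readers built above, assuming the Lucene++
// FieldCacheSanityChecker API mirrors the Java original (a static checkSanity over the
// default cache). Distinct fields populated on one reader should not be flagged:
FieldCachePtr cache = FieldCache::DEFAULT();
cache->purgeAllCaches();
cache->getInts(readerX, L"theInt");
cache->getDoubles(readerX, L"theDouble");
Collection<InsanityPtr> insanity = FieldCacheSanityChecker::checkSanity(cache);
EXPECT_EQ(0, insanity.size());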
TEST_F(IndexWriterReaderTest, testAddIndexes2) {
    bool optimize = false;
    DirectoryPtr dir1 = newLucene<MockRAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir1, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);

    DirectoryPtr dir2 = newLucene<MockRAMDirectory>();
    IndexWriterPtr writer2 = newLucene<IndexWriter>(dir2, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);
    createIndexNoClose(!optimize, L"index2", writer2);
    writer2->close();

    Collection<DirectoryPtr> dirs = newCollection<DirectoryPtr>(dir2);
    writer->addIndexesNoOptimize(dirs);
    writer->addIndexesNoOptimize(dirs);
    writer->addIndexesNoOptimize(dirs);
    writer->addIndexesNoOptimize(dirs);
    writer->addIndexesNoOptimize(dirs);

    IndexReaderPtr r1 = writer->getReader();
    EXPECT_EQ(500, r1->maxDoc());
    r1->close();
    writer->close();
    dir1->close();
}
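// Hypothetical sketch of the createIndexNoClose() helper used by these tests: it fills the
// writer with 100 docs (via the createDocument() helper sketched further below) and, unlike
// createIndex(), leaves the writer open so the caller can keep using it:
void createIndexNoClose(bool multiSegment, const String& indexName, const IndexWriterPtr& w) {
    for (int32_t i = 0; i < 100; ++i) {
        w->addDocument(createDocument(i, indexName, 4));
    }
    if (!multiSegment) {
        w->optimize();
    }
}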
// Run one indexer and two searchers against a single index as a stress test.
static void runTest(DirectoryPtr directory) {
    Collection<TimedThreadPtr> threads(Collection<TimedThreadPtr>::newInstance(4));
    AnalyzerPtr analyzer = newLucene<SimpleAnalyzer>();
    IndexWriterPtr writer = newLucene<MockIndexWriter>(directory, analyzer, true, IndexWriter::MaxFieldLengthUNLIMITED);
    writer->setMaxBufferedDocs(7);
    writer->setMergeFactor(3);

    // Establish a base index of 100 docs
    for (int32_t i = 0; i < 100; ++i) {
        DocumentPtr d = newLucene<Document>();
        d->add(newLucene<Field>(L"id", StringUtils::toString(i), Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
        d->add(newLucene<Field>(L"contents", intToEnglish(i), Field::STORE_NO, Field::INDEX_ANALYZED));
        if ((i - 1) % 7 == 0) {
            writer->commit();
        }
        writer->addDocument(d);
    }
    writer->commit();

    IndexReaderPtr r = IndexReader::open(directory, true);
    BOOST_CHECK_EQUAL(100, r->numDocs());
    r->close();

    IndexerThreadPtr indexerThread1 = newLucene<IndexerThread>(writer);
    threads[0] = indexerThread1;
    indexerThread1->start();

    IndexerThreadPtr indexerThread2 = newLucene<IndexerThread>(writer);
    threads[1] = indexerThread2;
    indexerThread2->start();

    SearcherThreadPtr searcherThread1 = newLucene<SearcherThread>(directory);
    threads[2] = searcherThread1;
    searcherThread1->start();

    SearcherThreadPtr searcherThread2 = newLucene<SearcherThread>(directory);
    threads[3] = searcherThread2;
    searcherThread2->start();

    indexerThread1->join();
    indexerThread2->join();
    searcherThread1->join();
    searcherThread2->join();

    writer->close();

    BOOST_CHECK(!indexerThread1->failed); // hit unexpected exception in indexer1
    BOOST_CHECK(!indexerThread2->failed); // hit unexpected exception in indexer2
    BOOST_CHECK(!searcherThread1->failed); // hit unexpected exception in search1
    BOOST_CHECK(!searcherThread2->failed); // hit unexpected exception in search2
}
void addDocs(const DirectoryPtr& dir, int32_t ndocs, bool compound) {
    IndexWriterPtr iw = newLucene<IndexWriter>(dir, anlzr, false, IndexWriter::MaxFieldLengthLIMITED);
    iw->setMaxBufferedDocs(5);
    iw->setMergeFactor(3);
    iw->setSimilarity(similarityOne);
    iw->setUseCompoundFile(compound);
    for (int32_t i = 0; i < ndocs; ++i) {
        iw->addDocument(newDoc());
    }
    iw->close();
}
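// Hypothetical sketch of the newDoc() helper assumed above; nextNorm() (also assumed here)
// would supply a varying per-document boost so that the stored norms differ across docs:
DocumentPtr newDoc() {
    DocumentPtr d = newLucene<Document>();
    double boost = nextNorm(); // assumed fixture helper producing the next boost in a sequence
    for (int32_t i = 0; i < 10; ++i) {
        FieldPtr f = newLucene<Field>(L"f" + StringUtils::toString(i), L"v" + StringUtils::toString(i), Field::STORE_NO, Field::INDEX_NOT_ANALYZED);
        f->setBoost(boost);
        d->add(f);
    }
    return d;
}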
void createIndex(const DirectoryPtr& dir) {
    IndexWriterPtr iw = newLucene<IndexWriter>(dir, anlzr, true, IndexWriter::MaxFieldLengthLIMITED);
    iw->setMaxBufferedDocs(5);
    iw->setMergeFactor(3);
    iw->setSimilarity(similarityOne);
    iw->setUseCompoundFile(true);
    iw->close();
}
TEST_F(IndexWriterReaderTest, testUpdateDocument) {
    bool optimize = true;
    DirectoryPtr dir1 = newLucene<MockRAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir1, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);

    // create the index
    createIndexNoClose(!optimize, L"index1", writer);

    // get a reader
    IndexReaderPtr r1 = writer->getReader();
    EXPECT_TRUE(r1->isCurrent());

    String id10 = r1->document(10)->getField(L"id")->stringValue();

    DocumentPtr newDoc = r1->document(10);
    newDoc->removeField(L"id");
    newDoc->add(newLucene<Field>(L"id", StringUtils::toString(8000), Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
    writer->updateDocument(newLucene<Term>(L"id", id10), newDoc);
    EXPECT_TRUE(!r1->isCurrent());

    IndexReaderPtr r2 = writer->getReader();
    EXPECT_TRUE(r2->isCurrent());
    EXPECT_EQ(0, count(newLucene<Term>(L"id", id10), r2));
    EXPECT_EQ(1, count(newLucene<Term>(L"id", StringUtils::toString(8000)), r2));

    r1->close();
    writer->close();
    EXPECT_TRUE(r2->isCurrent());

    IndexReaderPtr r3 = IndexReader::open(dir1, true);
    EXPECT_TRUE(r3->isCurrent());
    EXPECT_TRUE(r2->isCurrent());
    EXPECT_EQ(0, count(newLucene<Term>(L"id", id10), r3));
    EXPECT_EQ(1, count(newLucene<Term>(L"id", StringUtils::toString(8000)), r3));

    writer = newLucene<IndexWriter>(dir1, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);
    DocumentPtr doc = newLucene<Document>();
    doc->add(newLucene<Field>(L"field", L"a b c", Field::STORE_NO, Field::INDEX_ANALYZED));
    writer->addDocument(doc);
    EXPECT_TRUE(r2->isCurrent());
    EXPECT_TRUE(r3->isCurrent());

    writer->close();
    EXPECT_TRUE(!r2->isCurrent());
    EXPECT_TRUE(!r3->isCurrent());

    r2->close();
    r3->close();
    dir1->close();
}
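// Hypothetical sketch of the count() helper used above: walk the TermDocs enumeration for a
// term and count how many non-deleted documents contain it:
static int32_t count(const TermPtr& t, const IndexReaderPtr& r) {
    int32_t count = 0;
    TermDocsPtr td = r->termDocs(t);
    while (td->next()) {
        ++count;
    }
    td->close();
    return count;
}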
/// Tests whether the DocumentWriter and SegmentMerger correctly enable the payload bit in the FieldInfo
TEST_F(PayloadsTest, testPayloadFieldBit) {
    DirectoryPtr ram = newLucene<RAMDirectory>();
    PayloadAnalyzerPtr analyzer = newLucene<PayloadAnalyzer>();
    IndexWriterPtr writer = newLucene<IndexWriter>(ram, analyzer, true, IndexWriter::MaxFieldLengthLIMITED);
    DocumentPtr d = newLucene<Document>();

    // this field won't have any payloads
    d->add(newLucene<Field>(L"f1", L"This field has no payloads", Field::STORE_NO, Field::INDEX_ANALYZED));

    // this field will have payloads in all docs, however not for all term positions,
    // so this field is used to check if the DocumentWriter correctly enables the payloads bit
    // even if only some term positions have payloads
    d->add(newLucene<Field>(L"f2", L"This field has payloads in all docs", Field::STORE_NO, Field::INDEX_ANALYZED));
    d->add(newLucene<Field>(L"f2", L"This field has payloads in all docs", Field::STORE_NO, Field::INDEX_ANALYZED));

    // this field is used to verify if the SegmentMerger enables payloads for a field if it has
    // payloads enabled in only some documents
    d->add(newLucene<Field>(L"f3", L"This field has payloads in some docs", Field::STORE_NO, Field::INDEX_ANALYZED));

    // only add payload data for field f2
    ByteArray someData(ByteArray::newInstance(8));
    uint8_t input[8] = { 's', 'o', 'm', 'e', 'd', 'a', 't', 'a' };
    std::memcpy(someData.get(), input, 8);
    analyzer->setPayloadData(L"f2", 1, someData, 0, 1);

    writer->addDocument(d);
    // flush
    writer->close();

    SegmentReaderPtr reader = SegmentReader::getOnlySegmentReader(ram);
    FieldInfosPtr fi = reader->fieldInfos();
    EXPECT_TRUE(!fi->fieldInfo(L"f1")->storePayloads);
    EXPECT_TRUE(fi->fieldInfo(L"f2")->storePayloads);
    EXPECT_TRUE(!fi->fieldInfo(L"f3")->storePayloads);
    reader->close();

    // now we add another document which has payloads for field f3 and verify if the SegmentMerger
    // enabled payloads for that field
    writer = newLucene<IndexWriter>(ram, analyzer, true, IndexWriter::MaxFieldLengthLIMITED);
    d = newLucene<Document>();
    d->add(newLucene<Field>(L"f1", L"This field has no payloads", Field::STORE_NO, Field::INDEX_ANALYZED));
    d->add(newLucene<Field>(L"f2", L"This field has payloads in all docs", Field::STORE_NO, Field::INDEX_ANALYZED));
    d->add(newLucene<Field>(L"f2", L"This field has payloads in all docs", Field::STORE_NO, Field::INDEX_ANALYZED));
    d->add(newLucene<Field>(L"f3", L"This field has payloads in some docs", Field::STORE_NO, Field::INDEX_ANALYZED));

    // add payload data for field f2 and f3
    analyzer->setPayloadData(L"f2", someData, 0, 1);
    analyzer->setPayloadData(L"f3", someData, 0, 3);

    writer->addDocument(d);
    // force merge
    writer->optimize();
    // flush
    writer->close();

    reader = SegmentReader::getOnlySegmentReader(ram);
    fi = reader->fieldInfos();
    EXPECT_TRUE(!fi->fieldInfo(L"f1")->storePayloads);
    EXPECT_TRUE(fi->fieldInfo(L"f2")->storePayloads);
    EXPECT_TRUE(fi->fieldInfo(L"f3")->storePayloads);
    reader->close();
}
AddDirectoriesThreads(int32_t numDirs, const IndexWriterPtr& mainWriter) {
    this->numDirs = numDirs;
    this->mainWriter = mainWriter;
    threads = Collection<LuceneThreadPtr>::newInstance(NUM_THREADS);
    failures = Collection<LuceneException>::newInstance();
    didClose = false;
    count = newLucene<HeavyAtomicInt>(0);
    numAddIndexesNoOptimize = newLucene<HeavyAtomicInt>(0);
    addDir = newLucene<MockRAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(addDir, newLucene<WhitespaceAnalyzer>(), IndexWriter::MaxFieldLengthLIMITED);
    writer->setMaxBufferedDocs(2);
    for (int32_t i = 0; i < NUM_INIT_DOCS; ++i) {
        DocumentPtr doc = createDocument(i, L"addindex", 4);
        writer->addDocument(doc);
    }
    writer->close();
    readers = Collection<IndexReaderPtr>::newInstance(numDirs);
    for (int32_t i = 0; i < numDirs; ++i) {
        readers[i] = IndexReader::open(addDir, false);
    }
}
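// Hypothetical sketch of the createDocument() helper used by this constructor (and by
// createIndexNoClose() above); the field names and the "a.../b..." body text are illustrative:
static DocumentPtr createDocument(int32_t n, const String& indexName, int32_t numFields) {
    DocumentPtr doc = newLucene<Document>();
    doc->add(newLucene<Field>(L"id", StringUtils::toString(n), Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
    doc->add(newLucene<Field>(L"indexname", indexName, Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
    StringStream sb;
    sb << L"a" << n;
    doc->add(newLucene<Field>(L"field1", sb.str(), Field::STORE_YES, Field::INDEX_ANALYZED));
    for (int32_t i = 1; i < numFields; ++i) {
        sb << L" b" << n;
        doc->add(newLucene<Field>(L"field" + StringUtils::toString(i + 1), sb.str(), Field::STORE_YES, Field::INDEX_ANALYZED));
    }
    return doc;
}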
MultiSearcherRankingFixture() {
    // create MultiSearcher from two separate searchers
    DirectoryPtr d1 = newLucene<RAMDirectory>();
    IndexWriterPtr iw1 = newLucene<IndexWriter>(d1, newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT), true, IndexWriter::MaxFieldLengthLIMITED);
    addCollection1(iw1);
    iw1->close();

    DirectoryPtr d2 = newLucene<RAMDirectory>();
    IndexWriterPtr iw2 = newLucene<IndexWriter>(d2, newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT), true, IndexWriter::MaxFieldLengthLIMITED);
    addCollection2(iw2);
    iw2->close();

    Collection<SearchablePtr> s = newCollection<SearchablePtr>(newLucene<IndexSearcher>(d1, true), newLucene<IndexSearcher>(d2, true));
    multiSearcher = newLucene<MultiSearcher>(s);

    // create IndexSearcher which contains all documents
    DirectoryPtr d = newLucene<RAMDirectory>();
    IndexWriterPtr iw = newLucene<IndexWriter>(d, newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT), true, IndexWriter::MaxFieldLengthLIMITED);
    addCollection1(iw);
    addCollection2(iw);
    iw->close();
    singleSearcher = newLucene<IndexSearcher>(d, true);
}
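// Hypothetical sketch of the addCollection1()/addCollection2() helpers: each feeds a handful
// of documents into one sub-index through the add() helper shown further below. The document
// bodies are illustrative, not the original test data:
void addCollection1(const IndexWriterPtr& iw) {
    add(L"one blah three", iw);
    add(L"one foo three multiOne", iw);
    add(L"one foobar three multiThree", iw);
    add(L"blueberry pie", iw);
}

void addCollection2(const IndexWriterPtr& iw) {
    add(L"two blah three", iw);
    add(L"two foo xxx multiTwo", iw);
    add(L"two foobar xxx multiThree", iw);
    add(L"blueberry strudel", iw);
}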
void ConcurrentMergeScheduler::merge(const IndexWriterPtr& writer) {
    BOOST_ASSERT(!writer->holdsLock());
    this->_writer = writer;
    initMergeThreadPriority();
    dir = writer->getDirectory();

    // First, quickly run through the newly proposed merges and add any orthogonal merges (i.e. a
    // merge not involving segments already pending to be merged) to the queue. If we are way
    // behind on merging, many of these newly proposed merges will likely already be registered.
    message(L"now merge");
    message(L"  index: " + writer->segString());

    // Iterate, pulling from the IndexWriter's queue of pending merges, until it's empty
    while (true) {
        OneMergePtr merge(writer->getNextMerge());
        if (!merge) {
            message(L"  no more merges pending; now return");
            return;
        }

        // We do this with the primary thread to keep deterministic assignment of segment names
        writer->mergeInit(merge);

        bool success = false;
        LuceneException finally;
        try {
            SyncLock syncLock(this);
            MergeThreadPtr merger;
            while (mergeThreadCount() >= maxThreadCount) {
                message(L"  too many merge threads running; stalling...");
                wait(1000);
            }
            message(L"  consider merge " + merge->segString(dir));
            BOOST_ASSERT(mergeThreadCount() < maxThreadCount);

            // OK to spawn a new merge thread to handle this merge
            merger = getMergeThread(writer, merge);
            mergeThreads.add(merger);
            message(L"  launch new thread");
            merger->start();
            success = true;
        } catch (LuceneException& e) {
            finally = e;
        }
        if (!success) {
            writer->mergeFinish(merge);
        }
        finally.throwException();
    }
}
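// ConcurrentMergeScheduler::merge() is driven by the IndexWriter it is installed on; a minimal
// usage sketch (the directory, analyzer, and small merge factor below are illustrative choices
// to make background merges fire frequently):
DirectoryPtr dir = newLucene<RAMDirectory>();
IndexWriterPtr writer = newLucene<IndexWriter>(dir, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);
writer->setMergeScheduler(newLucene<ConcurrentMergeScheduler>());
writer->setMergeFactor(2); // low factor so merges are proposed early and handled on merge threads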
virtual void run() {
    try {
        for (int32_t j = 0; j < numIter; ++j) {
            writerFinal->optimize(false);
            for (int32_t k = 0; k < 17 * (1 + iFinal); ++k) {
                DocumentPtr d = newLucene<Document>();
                d->add(newLucene<Field>(L"id", StringUtils::toString(iterFinal) + L"_" + StringUtils::toString(iFinal) + L"_" + StringUtils::toString(j) + L"_" + StringUtils::toString(k), Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
                d->add(newLucene<Field>(L"contents", intToEnglish(iFinal + k), Field::STORE_NO, Field::INDEX_ANALYZED));
                writerFinal->addDocument(d);
            }
            for (int32_t k = 0; k < 9 * (1 + iFinal); ++k) {
                writerFinal->deleteDocuments(newLucene<Term>(L"id", StringUtils::toString(iterFinal) + L"_" + StringUtils::toString(iFinal) + L"_" + StringUtils::toString(j) + L"_" + StringUtils::toString(k)));
            }
            writerFinal->optimize();
        }
    } catch (LuceneException& e) {
        BOOST_FAIL("Unexpected exception: " << e.getError());
    }
}
DateSortTest() {
    // Create an index writer.
    directory = newLucene<RAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(directory, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);

    // oldest doc:
    // Add the first document.  text = "Document 1"  dateTime = Oct 10 03:25:22 EDT 2007
    writer->addDocument(createDocument(L"Document 1", 1192001122000LL));
    // Add the second document.  text = "Document 2"  dateTime = Oct 10 03:25:26 EDT 2007
    writer->addDocument(createDocument(L"Document 2", 1192001126000LL));
    // Add the third document.  text = "Document 3"  dateTime = Oct 11 07:12:13 EDT 2007
    writer->addDocument(createDocument(L"Document 3", 1192101133000LL));
    // Add the fourth document.  text = "Document 4"  dateTime = Oct 11 08:02:09 EDT 2007
    writer->addDocument(createDocument(L"Document 4", 1192104129000LL));
    // latest doc:
    // Add the fifth document.  text = "Document 5"  dateTime = Oct 12 13:25:43 EDT 2007
    writer->addDocument(createDocument(L"Document 5", 1192209943000LL));

    writer->optimize();
    writer->close();
}
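// Hypothetical sketch of the createDocument() helper assumed by this constructor: it stores
// the text and an indexable date string derived from the millisecond timestamp. The field
// names and the use of DateTools::RESOLUTION_SECOND are assumptions, not the original code:
static DocumentPtr createDocument(const String& text, int64_t time) {
    DocumentPtr document = newLucene<Document>();
    document->add(newLucene<Field>(L"text", text, Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
    String dateTime = DateTools::timeToString(time, DateTools::RESOLUTION_SECOND);
    document->add(newLucene<Field>(L"datetime", dateTime, Field::STORE_YES, Field::INDEX_NOT_ANALYZED));
    return document;
}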
void add(const String& value, IndexWriterPtr iw) {
    DocumentPtr d = newLucene<Document>();
    d->add(newLucene<Field>(FIELD_NAME, value, Field::STORE_YES, Field::INDEX_ANALYZED));
    iw->addDocument(d);
}
void addDoc(const IndexWriterPtr& writer, const String& value) {
    DocumentPtr doc = newLucene<Document>();
    doc->add(newLucene<Field>(L"content", value, Field::STORE_NO, Field::INDEX_ANALYZED));
    writer->addDocument(doc);
}
void checkSkipTo(int32_t indexDivisor) {
    DirectoryPtr dir = newLucene<RAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);

    TermPtr ta = newLucene<Term>(L"content", L"aaa");
    for (int32_t i = 0; i < 10; ++i) {
        addDoc(writer, L"aaa aaa aaa aaa");
    }

    TermPtr tb = newLucene<Term>(L"content", L"bbb");
    for (int32_t i = 0; i < 16; ++i) {
        addDoc(writer, L"bbb bbb bbb bbb");
    }

    TermPtr tc = newLucene<Term>(L"content", L"ccc");
    for (int32_t i = 0; i < 50; ++i) {
        addDoc(writer, L"ccc ccc ccc ccc");
    }

    // assure that we deal with a single segment
    writer->optimize();
    writer->close();

    IndexReaderPtr reader = IndexReader::open(dir, IndexDeletionPolicyPtr(), true, indexDivisor);
    TermDocsPtr tdocs = reader->termDocs();

    // without optimization (assumption skipInterval == 16)

    // with next
    tdocs->seek(ta);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(0, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(1, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(0));
    EXPECT_EQ(2, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(4));
    EXPECT_EQ(4, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(9));
    EXPECT_EQ(9, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(10));

    // without next
    tdocs->seek(ta);
    EXPECT_TRUE(tdocs->skipTo(0));
    EXPECT_EQ(0, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(4));
    EXPECT_EQ(4, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(9));
    EXPECT_EQ(9, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(10));

    // exactly skipInterval documents and therefore with optimization

    // with next
    tdocs->seek(tb);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(10, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(11, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(12, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(15));
    EXPECT_EQ(15, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(24));
    EXPECT_EQ(24, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(25));
    EXPECT_EQ(25, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(26));

    // without next
    tdocs->seek(tb);
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(10, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(15));
    EXPECT_EQ(15, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(24));
    EXPECT_EQ(24, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(25));
    EXPECT_EQ(25, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(26));

    // much more than skipInterval documents and therefore with optimization

    // with next
    tdocs->seek(tc);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(26, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(27, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(28, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(40));
    EXPECT_EQ(40, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(57));
    EXPECT_EQ(57, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(74));
    EXPECT_EQ(74, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(75));
    EXPECT_EQ(75, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(76));

    // without next
    tdocs->seek(tc);
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(26, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(40));
    EXPECT_EQ(40, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(57));
    EXPECT_EQ(57, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(74));
    EXPECT_EQ(74, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(75));
    EXPECT_EQ(75, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(76));

    tdocs->close();
    reader->close();
    dir->close();
}
void IndexTestCase::testDocumentDeletion() {
    DocumentSchema schema;
    schema.addField("URL", "PRIMARY_KEY", true);
    schema.addTextField("BODY");
    schema.addField("MODIFIED", "INT64", true);

    stringstream ss1;
    const size_t NUM_DOCS = 1000;
    size_t i = 0;
    for (; i < NUM_DOCS; ++i) {
        ss1 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss1.str());

    stringstream ss2;
    for (; i < 2 * NUM_DOCS; ++i) {
        ss2 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss2.str(), true);

    StandardAnalyzerPtr sa(new StandardAnalyzer());
    sa->init();
    TokenViewPtr pTokens = sa->tokenize("hot", 3);
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term("BODY", it.next().getTextValue()));

    tstring str = getTestPath();
    std::set<docid_t> answer;
    {
        Index index;
        index.open(str, Index::RDWR, NULL);
        IndexWriterPtr pIndexWriter = index.acquireWriter();
        CPPUNIT_ASSERT(pIndexWriter != NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        for (size_t i = 0; i < 2 * NUM_DOCS; ++i) {
            stringstream ss;
            ss << "url" << i;
            if (i == 1000 || i == 1500 || i == 1505 || i == 1999) {
                pIndexWriter->deleteDocument(ss.str());
            } else {
                TermReaderPtr pTermReader = pIndexReader->termReader();
                TermPtr pTerm(new Term("URL", ss.str()));
                TermPostingIteratorPtr pIt = pTermReader->seek(pTerm.get());
                docid_t docId = pIt->skipTo(0);
                answer.insert(docId);
            }
        }

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)NUM_DOCS * 2, pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); ) {
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID) {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());
    }

    {
        Index index;
        index.open(str, Index::READ, NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS), pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); ) {
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID) {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());

        // for (std::set<docid_t>::const_iterator it = answer.begin();
        //      it != answer.end(); ++it) {
        //     docid_t docId = pDocFreqs->skipTo(*it);
        //     CPPUNIT_ASSERT_EQUAL(*it, docId);
        // }
        // docid_t docId = pDocFreqs->skipTo(NUM_DOCS + 0);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 1, docId);
        // docId = pDocFreqs->skipTo(NUM_DOCS + 500);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 501, docId);
        // docId = pDocFreqs->skipTo(NUM_DOCS + 505);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 506, docId);
        // docId = pDocFreqs->skipTo(2 * NUM_DOCS - 1);
        // CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
    }
}
/// Builds an index with payloads in the given Directory and performs different
/// tests to verify the payload encoding
static void encodingTest(const DirectoryPtr& dir) {
    PayloadAnalyzerPtr analyzer = newLucene<PayloadAnalyzer>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir, analyzer, true, IndexWriter::MaxFieldLengthLIMITED);

    // should be in sync with value in TermInfosWriter
    int32_t skipInterval = 16;
    int32_t numTerms = 5;
    String fieldName = L"f1";
    int32_t numDocs = skipInterval + 1;

    // create content for the test documents with just a few terms
    Collection<TermPtr> terms = generateTerms(fieldName, numTerms);
    StringStream sb;
    for (Collection<TermPtr>::iterator term = terms.begin(); term != terms.end(); ++term) {
        sb << (*term)->text() << L" ";
    }
    String content = sb.str();

    int32_t payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
    ByteArray payloadData = generateRandomData(payloadDataLength);

    DocumentPtr d = newLucene<Document>();
    d->add(newLucene<Field>(fieldName, content, Field::STORE_NO, Field::INDEX_ANALYZED));

    // add the same document multiple times to have the same payload lengths for all
    // occurrences within two consecutive skip intervals
    int32_t offset = 0;
    for (int32_t i = 0; i < 2 * numDocs; ++i) {
        analyzer->setPayloadData(fieldName, payloadData, offset, 1);
        offset += numTerms;
        writer->addDocument(d);
    }

    // make sure we create more than one segment to test merging
    writer->commit();

    for (int32_t i = 0; i < numDocs; ++i) {
        analyzer->setPayloadData(fieldName, payloadData, offset, i);
        offset += i * numTerms;
        writer->addDocument(d);
    }

    writer->optimize();
    // flush
    writer->close();

    // Verify the index
    IndexReaderPtr reader = IndexReader::open(dir, true);

    ByteArray verifyPayloadData(ByteArray::newInstance(payloadDataLength));
    offset = 0;
    Collection<TermPositionsPtr> tps = Collection<TermPositionsPtr>::newInstance(numTerms);
    for (int32_t i = 0; i < numTerms; ++i) {
        tps[i] = reader->termPositions(terms[i]);
    }

    while (tps[0]->next()) {
        for (int32_t i = 1; i < numTerms; ++i) {
            tps[i]->next();
        }
        int32_t freq = tps[0]->freq();
        for (int32_t i = 0; i < freq; ++i) {
            for (int32_t j = 0; j < numTerms; ++j) {
                tps[j]->nextPosition();
                tps[j]->getPayload(verifyPayloadData, offset);
                offset += tps[j]->getPayloadLength();
            }
        }
    }

    for (int32_t i = 0; i < numTerms; ++i) {
        tps[i]->close();
    }

    EXPECT_TRUE(payloadData.equals(verifyPayloadData));

    // test lazy skipping
    TermPositionsPtr tp = reader->termPositions(terms[0]);
    tp->next();
    tp->nextPosition();
    // now we don't read this payload
    tp->nextPosition();
    EXPECT_EQ(1, tp->getPayloadLength());
    ByteArray payload = tp->getPayload(ByteArray(), 0);
    EXPECT_EQ(payload[0], payloadData[numTerms]);
    tp->nextPosition();

    // we don't read this payload and skip to a different document
    tp->skipTo(5);
    tp->nextPosition();
    EXPECT_EQ(1, tp->getPayloadLength());
    payload = tp->getPayload(ByteArray(), 0);
    EXPECT_EQ(payload[0], payloadData[5 * numTerms]);

    // Test different lengths at skip points
    tp->seek(terms[1]);
    tp->next();
    tp->nextPosition();
    EXPECT_EQ(1, tp->getPayloadLength());
    tp->skipTo(skipInterval - 1);
    tp->nextPosition();
    EXPECT_EQ(1, tp->getPayloadLength());
    tp->skipTo(2 * skipInterval - 1);
    tp->nextPosition();
    EXPECT_EQ(1, tp->getPayloadLength());
    tp->skipTo(3 * skipInterval - 1);
    tp->nextPosition();
    EXPECT_EQ(3 * skipInterval - 2 * numDocs - 1, tp->getPayloadLength());

    // Test multiple calls of getPayload()
    tp->getPayload(ByteArray(), 0);

    // it is forbidden to call getPayload() more than once without calling nextPosition()
    try {
        tp->getPayload(ByteArray(), 0);
    } catch (IOException& e) {
        EXPECT_TRUE(check_exception(LuceneException::IO)(e));
    }

    reader->close();

    // test long payload
    analyzer = newLucene<PayloadAnalyzer>();
    writer = newLucene<IndexWriter>(dir, analyzer, true, IndexWriter::MaxFieldLengthLIMITED);
    String singleTerm = L"lucene";

    d = newLucene<Document>();
    d->add(newLucene<Field>(fieldName, singleTerm, Field::STORE_NO, Field::INDEX_ANALYZED));
    // add a payload whose length is greater than the buffer size of BufferedIndexOutput
    payloadData = generateRandomData(2000);
    analyzer->setPayloadData(fieldName, payloadData, 100, 1500);
    writer->addDocument(d);

    writer->optimize();
    // flush
    writer->close();

    reader = IndexReader::open(dir, true);
    tp = reader->termPositions(newLucene<Term>(fieldName, singleTerm));
    tp->next();
    tp->nextPosition();

    verifyPayloadData.resize(tp->getPayloadLength());
    tp->getPayload(verifyPayloadData, 0);
    ByteArray portion(ByteArray::newInstance(1500));
    MiscUtils::arrayCopy(payloadData.get(), 100, portion.get(), 0, 1500);

    EXPECT_TRUE(portion.equals(verifyPayloadData));
    reader->close();
}
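// Hypothetical sketches of the generateTerms()/generateRandomData() helpers assumed by
// encodingTest(); the zero-padding keeps the generated terms in sorted order, which the
// verification loop above relies on. Names and padding scheme are assumptions:
static Collection<TermPtr> generateTerms(const String& fieldName, int32_t n) {
    int32_t maxDigits = (int32_t)StringUtils::toString(n - 1).length();
    Collection<TermPtr> terms = Collection<TermPtr>::newInstance(n);
    for (int32_t i = 0; i < n; ++i) {
        StringStream sb;
        sb << L"t";
        int32_t padding = maxDigits - (int32_t)StringUtils::toString(i).length();
        for (int32_t j = 0; j < padding; ++j) {
            sb << L"0";
        }
        sb << i;
        terms[i] = newLucene<Term>(fieldName, sb.str());
    }
    return terms;
}

static ByteArray generateRandomData(int32_t n) {
    ByteArray data(ByteArray::newInstance(n));
    RandomPtr random = newLucene<Random>();
    for (int32_t i = 0; i < n; ++i) {
        data[i] = (uint8_t)random->nextInt(256);
    }
    return data;
}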
void close(bool doWait) {
    didClose = true;
    mainWriter->close(doWait);
}
TEST_F(SimpleExplanationsOfNonMatchesTest, testTermQueryMultiSearcherExplain) {
    // creating two directories for indices
    DirectoryPtr indexStoreA = newLucene<MockRAMDirectory>();
    DirectoryPtr indexStoreB = newLucene<MockRAMDirectory>();

    DocumentPtr lDoc = newLucene<Document>();
    lDoc->add(newLucene<Field>(L"handle", L"1 2", Field::STORE_YES, Field::INDEX_ANALYZED));
    DocumentPtr lDoc2 = newLucene<Document>();
    lDoc2->add(newLucene<Field>(L"handle", L"1 2", Field::STORE_YES, Field::INDEX_ANALYZED));
    DocumentPtr lDoc3 = newLucene<Document>();
    lDoc3->add(newLucene<Field>(L"handle", L"1 2", Field::STORE_YES, Field::INDEX_ANALYZED));

    IndexWriterPtr writerA = newLucene<IndexWriter>(indexStoreA, newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT), true, IndexWriter::MaxFieldLengthLIMITED);
    IndexWriterPtr writerB = newLucene<IndexWriter>(indexStoreB, newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT), true, IndexWriter::MaxFieldLengthLIMITED);

    writerA->addDocument(lDoc);
    writerA->addDocument(lDoc2);
    writerA->optimize();
    writerA->close();

    writerB->addDocument(lDoc3);
    writerB->close();

    QueryParserPtr parser = newLucene<QueryParser>(LuceneVersion::LUCENE_CURRENT, L"fulltext", newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_CURRENT));
    QueryPtr query = parser->parse(L"handle:1");

    Collection<SearchablePtr> searchers = newCollection<SearchablePtr>(
        newLucene<IndexSearcher>(indexStoreB, true),
        newLucene<IndexSearcher>(indexStoreA, true)
    );
    SearcherPtr mSearcher = newLucene<MultiSearcher>(searchers);
    Collection<ScoreDocPtr> hits = mSearcher->search(query, FilterPtr(), 1000)->scoreDocs;
    EXPECT_EQ(3, hits.size());

    ExplanationPtr explain = mSearcher->explain(query, hits[0]->doc);
    String exp = explain->toString();
    EXPECT_TRUE(exp.find(L"maxDocs=3") != String::npos);
    EXPECT_TRUE(exp.find(L"docFreq=3") != String::npos);

    query = parser->parse(L"handle:\"1 2\"");
    hits = mSearcher->search(query, FilterPtr(), 1000)->scoreDocs;
    EXPECT_EQ(3, hits.size());

    explain = mSearcher->explain(query, hits[0]->doc);
    exp = explain->toString();
    EXPECT_TRUE(exp.find(L"1=3") != String::npos);
    EXPECT_TRUE(exp.find(L"2=3") != String::npos);

    query = newLucene<SpanNearQuery>(newCollection<SpanQueryPtr>(
        newLucene<SpanTermQuery>(newLucene<Term>(L"handle", L"1")),
        newLucene<SpanTermQuery>(newLucene<Term>(L"handle", L"2"))
    ), 0, true);
    hits = mSearcher->search(query, FilterPtr(), 1000)->scoreDocs;
    EXPECT_EQ(3, hits.size());

    explain = mSearcher->explain(query, hits[0]->doc);
    exp = explain->toString();
    EXPECT_TRUE(exp.find(L"1=3") != String::npos);
    EXPECT_TRUE(exp.find(L"2=3") != String::npos);

    mSearcher->close();
}