boost::any StringCache::createValue(IndexReaderPtr reader, EntryPtr key) {
    EntryPtr entry(key);
    String field(entry->field);
    Collection<String> retArray(Collection<String>::newInstance(reader->maxDoc()));
    TermDocsPtr termDocs(reader->termDocs());
    TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
    LuceneException finally;
    try {
        do {
            TermPtr term(termEnum->term());
            if (!term || term->field() != field)
                break;
            String termval(term->text());
            termDocs->seek(termEnum);
            while (termDocs->next())
                retArray[termDocs->doc()] = termval;
        } while (termEnum->next());
    } catch (LuceneException& e) {
        finally = e;
    }
    termDocs->close();
    termEnum->close();
    finally.throwException();
    return retArray;
}
boost::any StringIndexCache::createValue(IndexReaderPtr reader, EntryPtr key) {
    EntryPtr entry(key);
    String field(entry->field);
    Collection<int32_t> retArray(Collection<int32_t>::newInstance(reader->maxDoc()));
    Collection<String> mterms(Collection<String>::newInstance(reader->maxDoc() + 1));
    TermDocsPtr termDocs(reader->termDocs());
    TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));

    int32_t t = 0; // current term number

    // an entry for documents that have no terms in this field.
    // Should a document with no terms be at top or bottom? This puts them at
    // the top - if it is changed, FieldDocSortedHitQueue needs to change as well.
    mterms[t++] = L"";

    LuceneException finally;
    try {
        do {
            TermPtr term(termEnum->term());
            if (!term || term->field() != field || t >= mterms.size())
                break;

            // store term text
            mterms[t] = term->text();

            termDocs->seek(termEnum);
            while (termDocs->next())
                retArray[termDocs->doc()] = t;

            ++t;
        } while (termEnum->next());
    } catch (LuceneException& e) {
        finally = e;
    }
    termDocs->close();
    termEnum->close();
    finally.throwException();

    if (t == 0) {
        // if there are no terms, make the term array have a single null entry
        mterms = Collection<String>::newInstance(1);
    } else if (t < mterms.size()) {
        // if there are fewer terms than documents, trim off the dead array space
        mterms.resize(t);
    }

    return newLucene<StringIndex>(retArray, mterms);
}
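// Illustrative sketch only (not part of the source): the StringIndex built
// above pairs a per-document term ordinal array (retArray) with an
// ordinal-to-text table (mterms), so sorting by a string field reduces to
// integer comparisons. The order/lookup member names are assumed to mirror
// Java Lucene's FieldCache.StringIndex; this helper is hypothetical.
static bool lessByStringField(const StringIndexPtr& index, int32_t docA, int32_t docB) {
    // ordinal 0 is the reserved "no term" entry, so documents without a value sort first
    return index->order[docA] < index->order[docB];
}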
// Counts the number of documents in reader r that contain term t.
static int32_t count(const TermPtr& t, const IndexReaderPtr& r) {
    int32_t count = 0;
    TermDocsPtr td = r->termDocs(t);
    while (td->next()) {
        td->doc();
        ++count;
    }
    td->close();
    return count;
}
// Opens a reader on dir and checks that term matches exactly numDocs documents.
static void verifyTermDocs(DirectoryPtr dir, TermPtr term, int32_t numDocs) {
    IndexReaderPtr reader = IndexReader::open(dir, true);
    TermDocsPtr termDocs = reader->termDocs(term);
    int32_t count = 0;
    while (termDocs->next())
        ++count;
    BOOST_CHECK_EQUAL(count, numDocs);
    reader->close();
}
boost::any DoubleCache::createValue(IndexReaderPtr reader, EntryPtr key) {
    EntryPtr entry(key);
    String field(entry->field);
    DoubleParserPtr parser(VariantUtils::get<DoubleParserPtr>(entry->custom));
    if (!parser) {
        FieldCachePtr wrapper(_wrapper);
        boost::any doubles;
        try {
            doubles = wrapper->getDoubles(reader, field, FieldCache::DEFAULT_DOUBLE_PARSER());
        } catch (NumberFormatException&) {
            doubles = wrapper->getDoubles(reader, field, FieldCache::NUMERIC_UTILS_DOUBLE_PARSER());
        }
        return doubles;
    }
    Collection<double> retArray;
    TermDocsPtr termDocs(reader->termDocs());
    TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
    LuceneException finally;
    try {
        do {
            TermPtr term(termEnum->term());
            if (!term || term->field() != field)
                break;
            double termval = parser->parseDouble(term->text());
            if (!retArray) // late init
                retArray = Collection<double>::newInstance(reader->maxDoc());
            termDocs->seek(termEnum);
            while (termDocs->next())
                retArray[termDocs->doc()] = termval;
        } while (termEnum->next());
    } catch (StopFillCacheException&) {
    } catch (LuceneException& e) {
        finally = e;
    }
    termDocs->close();
    termEnum->close();
    finally.throwException();
    if (!retArray) // no values
        retArray = Collection<double>::newInstance(reader->maxDoc());
    return retArray;
}
boost::any ByteCache::createValue(IndexReaderPtr reader, EntryPtr key) {
    EntryPtr entry(key);
    String field(entry->field);
    ByteParserPtr parser(VariantUtils::get<ByteParserPtr>(entry->custom));
    if (!parser)
        return FieldCachePtr(_wrapper)->getBytes(reader, field, FieldCache::DEFAULT_BYTE_PARSER());
    Collection<uint8_t> retArray(Collection<uint8_t>::newInstance(reader->maxDoc()));
    TermDocsPtr termDocs(reader->termDocs());
    TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
    LuceneException finally;
    try {
        do {
            TermPtr term(termEnum->term());
            if (!term || term->field() != field)
                break;
            uint8_t termval = parser->parseByte(term->text());
            termDocs->seek(termEnum);
            while (termDocs->next())
                retArray[termDocs->doc()] = termval;
        } while (termEnum->next());
    } catch (StopFillCacheException&) {
    } catch (LuceneException& e) {
        finally = e;
    }
    termDocs->close();
    termEnum->close();
    finally.throwException();
    return retArray;
}
void checkSkipTo(int32_t indexDivisor) {
    DirectoryPtr dir = newLucene<RAMDirectory>();
    IndexWriterPtr writer = newLucene<IndexWriter>(dir, newLucene<WhitespaceAnalyzer>(), true, IndexWriter::MaxFieldLengthLIMITED);

    TermPtr ta = newLucene<Term>(L"content", L"aaa");
    for (int32_t i = 0; i < 10; ++i)
        addDoc(writer, L"aaa aaa aaa aaa");

    TermPtr tb = newLucene<Term>(L"content", L"bbb");
    for (int32_t i = 0; i < 16; ++i)
        addDoc(writer, L"bbb bbb bbb bbb");

    TermPtr tc = newLucene<Term>(L"content", L"ccc");
    for (int32_t i = 0; i < 50; ++i)
        addDoc(writer, L"ccc ccc ccc ccc");

    // assure that we deal with a single segment
    writer->optimize();
    writer->close();

    IndexReaderPtr reader = IndexReader::open(dir, IndexDeletionPolicyPtr(), true, indexDivisor);
    TermDocsPtr tdocs = reader->termDocs();

    // without optimization (assumption skipInterval == 16)

    // with next
    tdocs->seek(ta);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(0, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(1, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(0));
    EXPECT_EQ(2, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(4));
    EXPECT_EQ(4, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(9));
    EXPECT_EQ(9, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(10));

    // without next
    tdocs->seek(ta);
    EXPECT_TRUE(tdocs->skipTo(0));
    EXPECT_EQ(0, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(4));
    EXPECT_EQ(4, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(9));
    EXPECT_EQ(9, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(10));

    // exactly skipInterval documents and therefore with optimization

    // with next
    tdocs->seek(tb);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(10, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(11, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(12, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(15));
    EXPECT_EQ(15, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(24));
    EXPECT_EQ(24, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(25));
    EXPECT_EQ(25, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(26));

    // without next
    tdocs->seek(tb);
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(10, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(15));
    EXPECT_EQ(15, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(24));
    EXPECT_EQ(24, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(25));
    EXPECT_EQ(25, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(26));

    // much more than skipInterval documents and therefore with optimization

    // with next
    tdocs->seek(tc);
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(26, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->next());
    EXPECT_EQ(27, tdocs->doc());
    EXPECT_EQ(4, tdocs->freq());
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(28, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(40));
    EXPECT_EQ(40, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(57));
    EXPECT_EQ(57, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(74));
    EXPECT_EQ(74, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(75));
    EXPECT_EQ(75, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(76));

    // without next
    tdocs->seek(tc);
    EXPECT_TRUE(tdocs->skipTo(5));
    EXPECT_EQ(26, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(40));
    EXPECT_EQ(40, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(57));
    EXPECT_EQ(57, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(74));
    EXPECT_EQ(74, tdocs->doc());
    EXPECT_TRUE(tdocs->skipTo(75));
    EXPECT_EQ(75, tdocs->doc());
    EXPECT_TRUE(!tdocs->skipTo(76));

    tdocs->close();
    reader->close();
    dir->close();
}
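// Hedged usage sketch: checkSkipTo() takes the term-index divisor as a
// parameter so the same skip-list assertions can be run against both the
// default term index and a sparser one. The fixture name and divisor values
// below are illustrative assumptions, not taken from the source.
TEST_F(SegmentTermDocsTest, testSkipTo) {
    checkSkipTo(1); // default divisor: every indexed term is loaded
    checkSkipTo(3); // coarser term index: only every third term is loaded
}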