Example 1
 boost::any StringCache::createValue(IndexReaderPtr reader, EntryPtr key)
 {
     EntryPtr entry(key);
     String field(entry->field);
     Collection<String> retArray(Collection<String>::newInstance(reader->maxDoc()));
     TermDocsPtr termDocs(reader->termDocs());
     TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
     LuceneException finally;
     try
     {
         do
         {
             TermPtr term(termEnum->term());
             if (!term || term->field() != field)
                 break;
             String termval(term->text());
             termDocs->seek(termEnum);
             while (termDocs->next())
                 retArray[termDocs->doc()] = termval;
         }
         while (termEnum->next());
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     termDocs->close();
     termEnum->close();
     finally.throwException();
     return retArray;
 }
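Nearly every example in this listing leans on the same idiom to emulate Java's try/finally, which C++ lacks: a thrown exception is caught and parked in a local LuceneException, the cleanup calls (close() and friends) run unconditionally, and finally.throwException() rethrows only if something was caught. A minimal standalone sketch of the same shape, using std::exception_ptr as a stand-in for LuceneException:

    #include <exception>
    #include <iostream>
    #include <stdexcept>

    void readAndClose() {
        std::exception_ptr finally;             // plays the role of "LuceneException finally"
        try {
            throw std::runtime_error("boom");   // the work that may throw
        } catch (...) {
            finally = std::current_exception(); // park it, like "finally = e" above
        }
        std::cout << "cleanup runs either way\n"; // the close() calls go here
        if (finally)
            std::rethrow_exception(finally);    // finally.throwException()
    }

    int main() {
        try {
            readAndClose();
        } catch (const std::exception& e) {
            std::cout << "rethrown: " << e.what() << "\n";
        }
    }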
Example 2
 void SegmentInfos::write(DirectoryPtr directory)
 {
     String segmentFileName(getNextSegmentFileName());
     
     // always advance the generation on write
     if (generation == -1)
         generation = 1;
     else
         ++generation;
     
     ChecksumIndexOutputPtr segnOutput(newLucene<ChecksumIndexOutput>(directory->createOutput(segmentFileName)));
     
     bool success = false;
     LuceneException finally;
     try
     {
         segnOutput->writeInt(CURRENT_FORMAT); // write FORMAT
         segnOutput->writeLong(++version); // every write changes the index
         segnOutput->writeInt(counter); // write counter
         segnOutput->writeInt(segmentInfos.size()); // write infos
         for (Collection<SegmentInfoPtr>::iterator seginfo = segmentInfos.begin(); seginfo != segmentInfos.end(); ++seginfo)
             (*seginfo)->write(segnOutput);
         segnOutput->writeStringStringMap(userData);
         segnOutput->prepareCommit();
         success = true;
         pendingSegnOutput = segnOutput;
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     
     if (!success)
     {
         // We hit an exception above; try to close the file but suppress any exception
         try
         {
             segnOutput->close();
         }
         catch (...)
         {
             // Suppress so we keep throwing the original exception
         }
         
         try
         {
             // try not to leave a truncated segments_n file in the index
             directory->deleteFile(segmentFileName);
         }
         catch (...)
         {
             // Suppress so we keep throwing the original exception
         }
     }
     
     finally.throwException();
 }
Example 3
void ConcurrentMergeScheduler::merge(const IndexWriterPtr& writer) {
    BOOST_ASSERT(!writer->holdsLock());

    this->_writer = writer;

    initMergeThreadPriority();

    dir = writer->getDirectory();

    // First, quickly run through the newly proposed merges and add any orthogonal merges (i.e. a merge not
    // involving segments already pending to be merged) to the queue.  If we are way behind on merging,
    // many of these newly proposed merges will likely already be registered.
    message(L"now merge");
    message(L"  index: " + writer->segString());

    // Iterate, pulling from the IndexWriter's queue of pending merges, until it's empty
    while (true) {
        OneMergePtr merge(writer->getNextMerge());
        if (!merge) {
            message(L"  no more merges pending; now return");
            return;
        }

        // We do this with the primary thread to keep deterministic assignment of segment names
        writer->mergeInit(merge);

        bool success = false;
        LuceneException finally;
        try {
            SyncLock syncLock(this);
            MergeThreadPtr merger;
            while (mergeThreadCount() >= maxThreadCount) {
                message(L"    too many merge threads running; stalling...");
                wait(1000);
            }

            message(L"  consider merge " + merge->segString(dir));

            BOOST_ASSERT(mergeThreadCount() < maxThreadCount);

            // OK to spawn a new merge thread to handle this merge
            merger = getMergeThread(writer, merge);
            mergeThreads.add(merger);
            message(L"    launch new thread");

            merger->start();
            success = true;
        } catch (LuceneException& e) {
            finally = e;
        }
        if (!success) {
            writer->mergeFinish(merge);
        }
        finally.throwException();
    }
}
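The stall loop above parks the primary thread with wait(1000) until a merge thread finishes and calls notifyAll() (see MergeThread::run in Example 5). A rough standard-library equivalent of that handshake, with illustrative names in place of SyncLock/wait/notifyAll:

    #include <condition_variable>
    #include <mutex>

    std::mutex mergeMutex;
    std::condition_variable mergeCond;
    int activeMergeThreads = 0;
    const int maxThreadCount = 3;

    void waitForMergeSlot() {
        std::unique_lock<std::mutex> lock(mergeMutex);
        // equivalent of: while (mergeThreadCount() >= maxThreadCount) wait(1000);
        mergeCond.wait(lock, [] { return activeMergeThreads < maxThreadCount; });
        ++activeMergeThreads;
    }

    void releaseMergeSlot() {
        {
            std::lock_guard<std::mutex> lock(mergeMutex);
            --activeMergeThreads;
        }
        mergeCond.notify_all(); // the exiting merge thread's notifyAll()
    }

    int main() {
        waitForMergeSlot();   // would block if all slots were taken
        releaseMergeSlot();
    }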
Example 4
 boost::any StringIndexCache::createValue(IndexReaderPtr reader, EntryPtr key)
 {
     EntryPtr entry(key);
     String field(entry->field);
     Collection<int32_t> retArray(Collection<int32_t>::newInstance(reader->maxDoc()));
     Collection<String> mterms(Collection<String>::newInstance(reader->maxDoc() + 1));
     TermDocsPtr termDocs(reader->termDocs());
     TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
     int32_t t = 0; // current term number
     
     // An entry for documents that have no terms in this field.  Should a document with no terms be
     // at the top or the bottom?  This puts them at the top - if it is changed, FieldDocSortedHitQueue
     // needs to change as well.
     mterms[t++] = L"";
     
     LuceneException finally;
     try
     {
         do
         {
             TermPtr term(termEnum->term());
             if (!term || term->field() != field || t >= mterms.size())
                 break;
             
             // store term text
             mterms[t] = term->text();
             
             termDocs->seek(termEnum);
             while (termDocs->next())
                 retArray[termDocs->doc()] = t;
             
             ++t;
         }
         while (termEnum->next());
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     termDocs->close();
     termEnum->close();
     finally.throwException();
     
     if (t == 0)
     {
         // if there are no terms, make the term array have a single null entry
         mterms = Collection<String>::newInstance(1);
     }
     else if (t < mterms.size())
     {
         // if there are less terms than documents, trim off the dead array space
         mterms.resize(t);
     }
     
     return newLucene<StringIndex>(retArray, mterms);
 }
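The value built here is a pair of parallel arrays: retArray maps each document to the ordinal of its term, mterms maps ordinals back to term text, and ordinal 0 is reserved for documents with no term in the field. A small self-contained illustration of reading such a string index back, with hypothetical data:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::wstring> mterms = {L"", L"apple", L"pear"}; // ordinal 0 = no term
        std::vector<int> retArray = {1, 0, 2, 1};                    // one ordinal per doc

        for (size_t doc = 0; doc < retArray.size(); ++doc)
            std::wcout << L"doc " << doc << L" -> '" << mterms[retArray[doc]] << L"'\n";

        // Because mterms holds the terms in sorted order, comparing two documents'
        // ordinals sorts them by term without ever touching the strings again.
        return 0;
    }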
Example 5
 void MergeThread::run()
 {
     // First time through the while loop we do the merge that we were started with
     OneMergePtr merge(this->startMerge);
     ConcurrentMergeSchedulerPtr merger(_merger);
     
     LuceneException finally;
     try
     {
         merger->message(L"  merge thread: start");
         IndexWriterPtr writer(_writer);
         
         while (true)
         {
             setRunningMerge(merge);
             merger->doMerge(merge);
             
             // Subsequent times through the loop we do any new merge that writer says is necessary
             merge = writer->getNextMerge();
             if (merge)
             {
                 writer->mergeInit(merge);
                 merger->message(L"  merge thread: do another merge " + merge->segString(merger->dir));
             }
             else
                 break;
         }
         
         merger->message(L"  merge thread: done");
     }
     catch (MergeAbortedException&)
     {
         // Ignore the exception if it was due to abort
     }
     catch (LuceneException& e)
     {
         if (!merger->suppressExceptions)
         {
             // suppressExceptions is normally only set during testing.
             merger->anyExceptions = true;
             merger->handleMergeException(e);
         }
         else
             finally = e;
     }
     
     {
         SyncLock syncLock(merger);
         merger->notifyAll();
         
         bool removed = merger->mergeThreads.remove(shared_from_this());
         BOOST_ASSERT(removed);
     }
     finally.throwException();
 }
Example 6
 boost::any DoubleCache::createValue(IndexReaderPtr reader, EntryPtr key)
 {
     EntryPtr entry(key);
     String field(entry->field);
     DoubleParserPtr parser(VariantUtils::get<DoubleParserPtr>(entry->custom));
     if (!parser)
     {
         FieldCachePtr wrapper(_wrapper);
         boost::any doubles;
         try
         {
             doubles = wrapper->getDoubles(reader, field, FieldCache::DEFAULT_DOUBLE_PARSER());
         }
         catch (NumberFormatException&)
         {
             doubles = wrapper->getDoubles(reader, field, FieldCache::NUMERIC_UTILS_DOUBLE_PARSER());
         }
         return doubles;
     }
     Collection<double> retArray;
     TermDocsPtr termDocs(reader->termDocs());
     TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
     LuceneException finally;
     try
     {
         do
         {
             TermPtr term(termEnum->term());
             if (!term || term->field() != field)
                 break;
             double termval = parser->parseDouble(term->text());
             if (!retArray) // late init
                 retArray = Collection<double>::newInstance(reader->maxDoc());
             termDocs->seek(termEnum);
             while (termDocs->next())
                 retArray[termDocs->doc()] = termval;
         }
         while (termEnum->next());
     }
     catch (StopFillCacheException&)
     {
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     termDocs->close();
     termEnum->close();
     finally.throwException();
     if (!retArray) // no values
         retArray = Collection<double>::newInstance(reader->maxDoc());
     return retArray;
 }
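Two details here are easy to miss: retArray is allocated lazily, only once the first matching term proves the field is populated, and when no parser is supplied the cache tries the plain-text parser first and falls back to the NumericField parser on NumberFormatException. A toy version of that fallback shape, with std::stod standing in for parseDouble (the real code retries the whole cache fill with a different parser object; the fallback value below is purely illustrative):

    #include <iostream>
    #include <stdexcept>
    #include <string>

    double parseWithFallback(const std::wstring& text) {
        try {
            return std::stod(text);  // first attempt: the "default" parser
        } catch (const std::invalid_argument&) {
            // Here the real code switches to NUMERIC_UTILS_DOUBLE_PARSER and
            // starts over; returning a sentinel keeps this sketch small.
            return 0.0;
        }
    }

    int main() {
        std::wcout << parseWithFallback(L"3.14") << L"\n";         // 3.14
        std::wcout << parseWithFallback(L"not-a-number") << L"\n"; // 0 (fallback)
    }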
Example 7
void runReadBytesAndClose(IndexInputPtr input, int32_t bufferSize)
{
    LuceneException finally;
    try
    {
        runReadBytes(input, bufferSize);
    }
    catch (LuceneException& e)
    {
        finally = e;
    }
    input->close();
    finally.throwException();
}
Example 8
 void FieldInfos::write(DirectoryPtr d, const String& name)
 {
     IndexOutputPtr output(d->createOutput(name));
     LuceneException finally;
     try
     {
         write(output);
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     output->close();
     finally.throwException();
 }
Example 9
QueryPtr FuzzyQuery::rewrite(const IndexReaderPtr& reader) {
    if (!termLongEnough) { // can only match if it's exact
        return newLucene<TermQuery>(term);
    }

    int32_t maxSize = BooleanQuery::getMaxClauseCount();
    ScoreTermQueuePtr stQueue(newLucene<ScoreTermQueue>(maxSize + 1));
    FilteredTermEnumPtr enumerator(getEnum(reader));
    LuceneException finally;
    try {
        ScoreTermPtr st = newLucene<ScoreTerm>();
        do {
            TermPtr t(enumerator->term());
            if (!t) {
                break;
            }
            double score = enumerator->difference();
            // ignore uncompetitive hits
            if (stQueue->size() >= maxSize && score <= stQueue->top()->score) {
                continue;
            }
            // add new entry in PQ
            st->term = t;
            st->score = score;
            stQueue->add(st);
            // possibly drop entries from queue
            st = (stQueue->size() > maxSize) ? stQueue->pop() : newLucene<ScoreTerm>();
        } while (enumerator->next());
    } catch (LuceneException& e) {
        finally = e;
    }
    enumerator->close();
    finally.throwException();

    BooleanQueryPtr query(newLucene<BooleanQuery>(true));
    int32_t size = stQueue->size();
    for (int32_t i = 0; i < size; ++i) {
        ScoreTermPtr st(stQueue->pop());
        TermQueryPtr tq(newLucene<TermQuery>(st->term)); // found a match
        tq->setBoost(getBoost() * st->score); // set the boost
        query->add(tq, BooleanClause::SHOULD); // add to query
    }

    return query;
}
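The queue handling above keeps only the maxSize best-scoring terms: clearly uncompetitive scores are skipped outright, and once the queue overflows, the weakest entry is popped (and its ScoreTerm object recycled for the next candidate). The same bounded selection in isolation, using a standard min-heap of plain scores:

    #include <functional>
    #include <iostream>
    #include <queue>
    #include <vector>

    int main() {
        const size_t maxSize = 3;
        // std::greater puts the *worst* kept score on top, ready for eviction
        std::priority_queue<double, std::vector<double>, std::greater<double>> best;

        for (double score : {0.9, 0.2, 0.7, 0.95, 0.4, 0.8}) {
            if (best.size() >= maxSize && score <= best.top())
                continue;       // uncompetitive, same as the early 'continue' above
            best.push(score);
            if (best.size() > maxSize)
                best.pop();     // drop the weakest entry
        }
        while (!best.empty()) { // prints 0.8 0.9 0.95
            std::cout << best.top() << ' ';
            best.pop();
        }
    }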
Example 10
 FieldInfos::FieldInfos(DirectoryPtr d, const String& name)
 {
     format = 0;
     byNumber = Collection<FieldInfoPtr>::newInstance();
     byName = MapStringFieldInfo::newInstance();
     IndexInputPtr input(d->openInput(name));
     LuceneException finally;
     try
     {
         try
         {
             read(input, name);
         }
         catch (IOException& e)
         {
             if (format == FORMAT_PRE)
             {
                 input->seek(0);
                 input->setModifiedUTF8StringsMode();
                 byNumber.clear();
                 byName.clear();
                 try
                 {
                     read(input, name);
                 }
                 catch (...)
                 {
                     // Ignore any new exception & throw original IOE
                     finally = e;
                 }
             }
             else
                 finally = e;
         }
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     input->close();
     finally.throwException();
 }
Example 11
int32_t IndexReader::deleteDocuments(const TermPtr& term) {
    ensureOpen();
    TermDocsPtr docs(termDocs(term));
    if (!docs) {
        return 0;
    }
    int32_t n = 0;
    LuceneException finally;
    try {
        while (docs->next()) {
            deleteDocument(docs->doc());
            ++n;
        }
    } catch (LuceneException& e) {
        finally = e;
    }
    docs->close();
    finally.throwException();
    return n;
}
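Typical usage of this helper, sketched as a fragment under the assumption of the Lucene 3.x-style API that LucenePlusPlus mirrors (the directory, field name, and value below are hypothetical):

    // Open a writable (non read-only) reader, delete every document whose
    // "id" field contains "42", then close the reader to commit the deletions.
    IndexReaderPtr reader(IndexReader::open(dir, false));
    int32_t removed = reader->deleteDocuments(newLucene<Term>(L"id", L"42"));
    std::wcout << removed << L" document(s) deleted\n";
    reader->close();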
Example 12
 boost::any ByteCache::createValue(IndexReaderPtr reader, EntryPtr key)
 {
     EntryPtr entry(key);
     String field(entry->field);
     ByteParserPtr parser(VariantUtils::get<ByteParserPtr>(entry->custom));
     if (!parser)
         return FieldCachePtr(_wrapper)->getBytes(reader, field, FieldCache::DEFAULT_BYTE_PARSER());
     Collection<uint8_t> retArray(Collection<uint8_t>::newInstance(reader->maxDoc()));
     TermDocsPtr termDocs(reader->termDocs());
     TermEnumPtr termEnum(reader->terms(newLucene<Term>(field)));
     LuceneException finally;
     try
     {
         do
         {
             TermPtr term(termEnum->term());
             if (!term || term->field() != field)
                 break;
             uint8_t termval = parser->parseByte(term->text());
             termDocs->seek(termEnum);
             while (termDocs->next())
                 retArray[termDocs->doc()] = termval;
         }
         while (termEnum->next());
     }
     catch (StopFillCacheException&)
     {
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     termDocs->close();
     termEnum->close();
     finally.throwException();
     return retArray;
 }
Example 13
 void handle(const LuceneException& t) {
     SyncLock syncLock(&failures);
     failures.add(t); // record first: gtest's FAIL() returns from the function
     FAIL() << t.getError();
 }
Example 14
void IndexReader::main(Collection<String> args) {
    String filename;
    bool extract = false;

    for (Collection<String>::iterator arg = args.begin(); arg != args.end(); ++arg) {
        if (*arg == L"-extract") {
            extract = true;
        } else if (filename.empty()) {
            filename = *arg;
        }
    }

    if (filename.empty()) {
        std::wcout << L"Usage: IndexReader [-extract] <cfsfile>";
        return;
    }

    DirectoryPtr dir;
    CompoundFileReaderPtr cfr;

    LuceneException finally;
    try {
        String dirname(FileUtils::extractPath(filename));
        filename = FileUtils::extractFile(filename); // strip the path, keeping just the compound file name
        dir = FSDirectory::open(dirname);
        cfr = newLucene<CompoundFileReader>(dir, filename);

        HashSet<String> _files(cfr->listAll());
        Collection<String> files(Collection<String>::newInstance(_files.begin(), _files.end()));
        std::sort(files.begin(), files.end()); // sort the array of filenames so that the output is more readable

        for (Collection<String>::iterator file = files.begin(); file != files.end(); ++file) {
            int64_t len = cfr->fileLength(*file);

            if (extract) {
                std::wcout << L"extract " << *file << L" with " << len << L" bytes to local directory...";
                IndexInputPtr ii(cfr->openInput(*file));

                boost::filesystem::ofstream f(*file, std::ios::binary | std::ios::out);

                // read and write with a small buffer, which is more effective than reading byte by byte
                ByteArray buffer(ByteArray::newInstance(1024));

                int32_t chunk = buffer.size();
                while (len > 0) {
                    int32_t bufLen = std::min(chunk, (int32_t)len);
                    ii->readBytes(buffer.get(), 0, bufLen);
                    f.write((char*)buffer.get(), bufLen);
                    len -= bufLen;
                }
                ii->close();
            } else {
                std::wcout << *file << L": " << len << L" bytes\n";
            }
        }
    } catch (LuceneException& e) {
        finally = e;
    }

    if (dir) {
        dir->close();
    }
    if (cfr) {
        cfr->close();
    }

    finally.throwException();
}
Example 15
 void SegmentInfos::finishCommit(DirectoryPtr dir)
 {
     if (!pendingSegnOutput)
         boost::throw_exception(IllegalStateException(L"prepareCommit was not called"));
     
     bool success = false;
     LuceneException finally;
     try
     {
         pendingSegnOutput->finishCommit();
         pendingSegnOutput->close();
         pendingSegnOutput.reset();
         success = true;
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     
     if (!success)
         rollbackCommit(dir);
     finally.throwException();
     
     // NOTE: if we crash here, we have left a segments_N file in the directory in a possibly corrupt state (if
     // some bytes made it to stable storage and others didn't).  But, the segments_N file includes checksum
     // at the end, which should catch this case.  So when a reader tries to read it, it will throw a 
     // CorruptIndexException, which should cause the retry logic in SegmentInfos to kick in and load the last
     // good (previous) segments_N-1 file.
     
     String fileName(IndexFileNames::fileNameFromGeneration(IndexFileNames::SEGMENTS(), L"", generation));
     
     success = false;
     try
     {
         dir->sync(fileName);
         success = true;
     }
     catch (...)
     {
     }
     
     if (!success)
         dir->deleteFile(fileName);
     
     lastGeneration = generation;
     IndexOutputPtr genOutput;
     try
     {
         genOutput = dir->createOutput(IndexFileNames::SEGMENTS_GEN());
         
         try
         {
             genOutput->writeInt(FORMAT_LOCKLESS);
             genOutput->writeLong(generation);
             genOutput->writeLong(generation);
         }
         catch (LuceneException& e)
         {
             finally = e;
         }
         
         genOutput->close();
         finally.throwException();
     }
     catch (...)
     {
     }
 }
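Together with Example 2, this is a two-phase commit: write() leaves a prepared but unfinished segments_N file behind in pendingSegnOutput, and finishCommit() seals the checksum, syncs the file, and advertises the new generation through segments.gen. A hedged sketch of the calling sequence, assuming the Lucene-style prepareCommit/finishCommit pair on SegmentInfos:

    SegmentInfosPtr infos(newLucene<SegmentInfos>());
    // ... populate infos with segments ...
    infos->prepareCommit(directory); // phase 1: runs write() from Example 2
    // other index files are synced between the two phases
    infos->finishCommit(directory);  // phase 2: the function above
    // on failure after phase 1, rollbackCommit(directory) abandons the file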
Example 16
    void FindSegmentsFile::doRun(IndexCommitPtr commit)
    {
        if (commit)
        {
            if (directory != commit->getDirectory())
                boost::throw_exception(IOException(L"The specified commit does not match the specified Directory"));
            runBody(commit->getSegmentsFileName());
            return;
        }
        
        String segmentFileName;
        int64_t lastGen = -1;
        int64_t gen = 0;
        int32_t genLookaheadCount = 0;
        bool retry = false;
        LuceneException exc;
        SegmentInfosPtr segmentInfos(_segmentInfos);
        
        int32_t method = 0;
        
        // Loop until we succeed in calling runBody() without hitting an IOException.  An IOException most likely
        // means a commit was in process and has finished, in the time it took us to load the now-old infos files
        // (and segments files).  It's also possible it's a true error (corrupt index).  To distinguish these,
        // on each retry we must see "forward progress" on which generation we are trying to load.  If we don't, 
        // then the original error is real and we throw it.

        // We have three methods for determining the current generation.  We try the first two in parallel, and
        // fall back to the third when necessary.
        
        while (true)
        {
            if (method == 0)
            {
                // Method 1: list the directory and use the highest segments_N file.  This method works well as long
                // as there is no stale caching on the directory contents (NOTE: NFS clients often have such stale caching)
                HashSet<String> files(directory->listAll());
                int64_t genA = segmentInfos->getCurrentSegmentGeneration(files);

                segmentInfos->message(L"directory listing genA=" + StringUtils::toString(genA));

                // Method 2: open segments.gen and read its contents.  Then we take the larger of the two gens.  This way,
                // if either approach is hitting a stale cache (NFS) we have a better chance of getting the right generation.
                int64_t genB = -1;
                for (int32_t i = 0; i < SegmentInfos::defaultGenFileRetryCount; ++i)
                {
                    IndexInputPtr genInput;
                    try
                    {
                        genInput = directory->openInput(IndexFileNames::SEGMENTS_GEN());
                    }
                    catch (FileNotFoundException& e)
                    {
                        segmentInfos->message(L"Segments.gen open: FileNotFoundException " + e.getError());
                        break;
                    }
                    catch (IOException& e)
                    {
                        segmentInfos->message(L"Segments.gen open: IOException " + e.getError());
                    }
                    
                    if (genInput)
                    {
                        LuceneException finally;
                        bool fileConsistent = false;
                        try
                        {
                            int32_t version = genInput->readInt();
                            if (version == SegmentInfos::FORMAT_LOCKLESS)
                            {
                                int64_t gen0 = genInput->readLong();
                                int64_t gen1 = genInput->readLong();
                                segmentInfos->message(L"fallback check: " + StringUtils::toString(gen0) + L"; " + StringUtils::toString(gen1));
                                if (gen0 == gen1)
                                {
                                    // the file is consistent
                                    genB = gen0;
                                    fileConsistent = true;
                                }
                            }
                        }
                        catch (IOException&)
                        {
                            // will retry
                        }
                        catch (LuceneException& e)
                        {
                            finally = e;
                        }
                        genInput->close();
                        finally.throwException();
                        if (fileConsistent)
                            break;
                    }
                    
                    LuceneThread::threadSleep(SegmentInfos::defaultGenFileRetryPauseMsec);
                }
                
                segmentInfos->message(String(IndexFileNames::SEGMENTS_GEN()) + L" check: genB=" + StringUtils::toString(genB));

                // pick the larger of the two gen's
                gen = std::max(genA, genB);
                
                // neither approach found a generation
                if (gen == -1)
                    boost::throw_exception(FileNotFoundException(L"No segments* file found in directory"));
            }
            
            // Third method (fallback if first & second methods are not reliable): since both directory cache and
            // file contents cache seem to be stale, just advance the generation.
            if (method == 1 || (method == 0 && lastGen == gen && retry))
            {
                method = 1;
                
                if (genLookaheadCount < SegmentInfos::defaultGenLookaheadCount)
                {
                    ++gen;
                    ++genLookaheadCount;
                    segmentInfos->message(L"look ahead increment gen to " + StringUtils::toString(gen));
                }
            }
            
            if (lastGen == gen)
            {
                // This means we're about to try the same segments_N last tried.  This is allowed, exactly once, because 
                // writer could have been in the process of writing segments_N last time.
                
                if (retry)
                {
                    // OK, we've tried the same segments_N file twice in a row, so this must be a real error.
                    exc.throwException();
                }
                else
                    retry = true;
            }
            else if (method == 0)
            {
                // Segment file has advanced since our last loop, so reset retry
                retry = false;
            }
            
            lastGen = gen;
            
            segmentFileName = IndexFileNames::fileNameFromGeneration(IndexFileNames::SEGMENTS(), L"", gen);
            
            try
            {
                runBody(segmentFileName);
                segmentInfos->message(L"success on " + segmentFileName);
                return;
            }
            catch (LuceneException& err)
            {
                // Save the original root cause
                if (exc.isNull())
                    exc = err;
                
                segmentInfos->message(L"primary Exception on '" + segmentFileName + L"': " + err.getError() + L"'; will retry: retry=" + StringUtils::toString(retry) + L"; gen = " + StringUtils::toString(gen));
                
                if (!retry && gen > 1)
                {
                    // This is our first time trying this segments file (because retry is false), and, there is possibly a 
                    // segments_(N-1) (because gen > 1). So, check if the segments_(N-1) exists and try it if so.
                    String prevSegmentFileName(IndexFileNames::fileNameFromGeneration(IndexFileNames::SEGMENTS(), L"", gen - 1));
                    
                    if (directory->fileExists(prevSegmentFileName))
                    {
                        segmentInfos->message(L"fallback to prior segment file '" + prevSegmentFileName + L"'");
                        
                        try
                        {
                            runBody(prevSegmentFileName);
                            if (!exc.isNull())
                                segmentInfos->message(L"success on fallback " + prevSegmentFileName);
                            return;
                        }
                        catch (LuceneException& err2)
                        {
                            segmentInfos->message(L"secondary Exception on '" + prevSegmentFileName + L"': " + err2.getError() + L"'; will retry");
                        }
                    }
                }
            }
        }
    }
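The loop's retry policy deserves isolating: retrying is legitimate only while the observed generation keeps advancing, and the very same generation may be retried exactly once, because the writer might have been caught mid-write. A compact standalone rendering of that policy, with a faked directory listing:

    #include <iostream>
    #include <stdexcept>

    long currentGeneration(int attempt) { return attempt < 2 ? 4 : 5; } // fake listing

    int main() {
        long lastGen = -1;
        bool retry = false;
        for (int attempt = 0;; ++attempt) {
            long gen = currentGeneration(attempt);
            if (gen == lastGen) {
                if (retry) // same generation twice in a row: a real error
                    throw std::runtime_error("no forward progress");
                retry = true;  // allowed exactly once
            } else {
                retry = false; // the writer advanced; reset the budget
            }
            lastGen = gen;
            std::cout << "try segments_" << gen << "\n";
            if (attempt == 3)
                break; // pretend this attempt's runBody() succeeded
        }
    }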
Example 17
    void SegmentInfos::read(DirectoryPtr directory, const String& segmentFileName)
    {
        bool success = false;

        // clear any previous segments
        segmentInfos.clear();
        
        ChecksumIndexInputPtr input(newLucene<ChecksumIndexInput>(directory->openInput(segmentFileName)));
        
        generation = generationFromSegmentsFileName(segmentFileName);
        lastGeneration = generation;
        LuceneException finally;
        try
        {
            int32_t format = input->readInt();
            
            if (format < 0) // file contains explicit format info
            {
                if (format < CURRENT_FORMAT)
                    boost::throw_exception(CorruptIndexException(L"Unknown format version: " + StringUtils::toString(format)));
                version = input->readLong(); // read version
                counter = input->readInt(); // read counter
            }
            else
                counter = format;
            
            for (int32_t i = input->readInt(); i > 0; --i) // read segmentInfos
                segmentInfos.add(newLucene<SegmentInfo>(directory, format, input));
            
            // in old format the version number may be at the end of the file
            if (format >= 0)
            {
                if (input->getFilePointer() >= input->length())
                    version = MiscUtils::currentTimeMillis(); // old file format without version number
                else
                    version = input->readLong(); // read version
            }
            
            if (format <= FORMAT_USER_DATA)
            {
                if (format <= FORMAT_DIAGNOSTICS)
                    userData = input->readStringStringMap();
                else if (input->readByte() != 0)
                {
                    if (!singletonUserData)
                        singletonUserData = MapStringString::newInstance();
                    singletonUserData[String(L"userData")] = input->readString();
                    userData = singletonUserData;
                }
                else
                    userData.clear();
            }
            else
                userData.clear();
            
            if (format <= FORMAT_CHECKSUM)
            {
                int64_t checksumNow = input->getChecksum();
                int64_t checksumThen = input->readLong();
                if (checksumNow != checksumThen)
                    boost::throw_exception(CorruptIndexException(L"Checksum mismatch in segments file"));
            }
            
            success = true;
        }
        catch (LuceneException& e)
        {
            finally = e;
        }
        
        input->close();
            
        // clear any segment infos we had loaded so we have a clean slate on retry
        if (!success)
            segmentInfos.clear();
        
        finally.throwException();
    }
Example 18
    void DocInverterPerField::processFields(Collection<FieldablePtr> fields, int32_t count)
    {
        fieldState->reset(docState->doc->getBoost());
        
        int32_t maxFieldLength = docState->maxFieldLength;
        bool doInvert = consumer->start(fields, count);
        DocumentsWriterPtr docWriter(docState->_docWriter);
        DocInverterPerThreadPtr perThread(_perThread);
        
        for (int32_t i = 0; i < count; ++i)
        {
            FieldablePtr field = fields[i];
            if (field->isIndexed() && doInvert)
            {
                bool anyToken;
                
                if (fieldState->length > 0)
                    fieldState->position += docState->analyzer->getPositionIncrementGap(fieldInfo->name);
                
                if (!field->isTokenized())
                {
                    // un-tokenized field
                    String stringValue(field->stringValue());
                    int32_t valueLength = (int32_t)stringValue.length();
                    perThread->singleToken->reinit(stringValue, 0, valueLength);
                    fieldState->attributeSource = perThread->singleToken;
                    consumer->start(field);

                    bool success = false;
                    LuceneException finally;
                    try
                    {
                        consumer->add();
                        success = true;
                    }
                    catch (LuceneException& e)
                    {
                        finally = e;
                    }
                    if (!success)
                        docWriter->setAborting();
                    finally.throwException();
                    fieldState->offset += valueLength;
                    ++fieldState->length;
                    ++fieldState->position;
                    anyToken = (valueLength > 0);
                }
                else
                {
                    // tokenized field
                    TokenStreamPtr stream;
                    TokenStreamPtr streamValue(field->tokenStreamValue());
                    
                    if (streamValue)
                        stream = streamValue;
                    else
                    {
                        // the field does not have a TokenStream, so we have to obtain one from the analyzer
                        ReaderPtr reader; // find or make Reader
                        ReaderPtr readerValue(field->readerValue());
                        
                        if (readerValue)
                            reader = readerValue;
                        else
                        {
                            String stringValue(field->stringValue());
                            perThread->stringReader->init(stringValue);
                            reader = perThread->stringReader;
                        }
                        
                        // Tokenize field and add to postingTable
                        stream = docState->analyzer->reusableTokenStream(fieldInfo->name, reader);
                    }
                    
                    // reset the TokenStream to the first token
                    stream->reset();
                    
                    int32_t startLength = fieldState->length;
                    
                    LuceneException finally;
                    try
                    {
                        int32_t offsetEnd = fieldState->offset - 1;
                        
                        bool hasMoreTokens = stream->incrementToken();
                        
                        fieldState->attributeSource = stream;
                        
                        OffsetAttributePtr offsetAttribute(fieldState->attributeSource->addAttribute<OffsetAttribute>());
                        PositionIncrementAttributePtr posIncrAttribute(fieldState->attributeSource->addAttribute<PositionIncrementAttribute>());
                        
                        consumer->start(field);
                    
                        while (true)
                        {
                            // If we hit an exception in stream.next below (which is fairly common, eg if analyzer
                            // chokes on a given document), then it's non-aborting and (above) this one document
                            // will be marked as deleted, but still consume a docID
                            if (!hasMoreTokens)
                                break;
                            
                            int32_t posIncr = posIncrAttribute->getPositionIncrement();
                            fieldState->position += posIncr;
                            if (fieldState->position > 0)
                                --fieldState->position;
                            
                            if (posIncr == 0)
                                ++fieldState->numOverlap;
                            
                            bool success = false;
                            try
                            {
                                // If we hit an exception in here, we abort all buffered documents since the last
                                // flush, on the likelihood that the internal state of the consumer is now corrupt 
                                // and should not be flushed to a new segment
                                consumer->add();
                                success = true;
                            }
                            catch (LuceneException& e)
                            {
                                finally = e;
                            }
                            if (!success)
                                docWriter->setAborting();
                            finally.throwException();
                            ++fieldState->position;
                            offsetEnd = fieldState->offset + offsetAttribute->endOffset();
                            if (++fieldState->length >= maxFieldLength)
                            {
                                if (docState->infoStream)
                                    *docState->infoStream << L"maxFieldLength " << StringUtils::toString(maxFieldLength) << L" reached for field " << fieldInfo->name << L", ignoring following tokens\n";
                                break;
                            }
                            
                            hasMoreTokens = stream->incrementToken();
                        }
                        
                        // trigger streams to perform end-of-stream operations
                        stream->end();
                        
                        fieldState->offset += offsetAttribute->endOffset();
                        anyToken = (fieldState->length > startLength);
                    }
                    catch (LuceneException& e)
                    {
                        finally = e;
                    }
                    stream->close();
                    finally.throwException();
                }
                
                if (anyToken)
                    fieldState->offset += docState->analyzer->getOffsetGap(field);
                fieldState->boost *= field->getBoost();
            }
            
            // don't hang onto the field
            fields[i].reset();
        }
        
        consumer->finish();
        endConsumer->finish();
    }
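The position bookkeeping in the tokenized branch condenses to this: a token's position is the running sum of PositionIncrementAttribute values (so a first increment of 1 yields position 0), and a zero increment stacks a token on its predecessor, which is what numOverlap counts. In miniature, with made-up increments:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> increments = {1, 1, 0, 2}; // 0 = stacked synonym, 2 = a gap
        int position = -1, numOverlap = 0;
        for (int posIncr : increments) {
            position += posIncr;
            if (posIncr == 0)
                ++numOverlap;
            std::cout << "token at position " << position << "\n"; // 0, 1, 1, 3
        }
        std::cout << numOverlap << " overlapping token(s)\n";
    }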
Example 19
 TermInfosReader::TermInfosReader(DirectoryPtr dir, const String& seg, FieldInfosPtr fis, int32_t readBufferSize, int32_t indexDivisor)
 {
     bool success = false;
     
     if (indexDivisor < 1 && indexDivisor != -1)
         boost::throw_exception(IllegalArgumentException(L"indexDivisor must be -1 (don't load terms index) or greater than 0: got " + StringUtils::toString(indexDivisor)));
     
     LuceneException finally;
     try
     {
         directory = dir;
         segment = seg;
         fieldInfos = fis;
         
         origEnum = newLucene<SegmentTermEnum>(directory->openInput(segment + L"." + IndexFileNames::TERMS_EXTENSION(), readBufferSize), fieldInfos, false);
         _size = origEnum->size;
         
         if (indexDivisor != -1)
         {
             // Load terms index
             totalIndexInterval = origEnum->indexInterval * indexDivisor;
             SegmentTermEnumPtr indexEnum(newLucene<SegmentTermEnum>(directory->openInput(segment + L"." + IndexFileNames::TERMS_INDEX_EXTENSION(), readBufferSize), fieldInfos, true));
             
             try
             {
                 int32_t indexSize = 1 + ((int32_t)indexEnum->size - 1) / indexDivisor; // otherwise read index
                 
                 indexTerms = Collection<TermPtr>::newInstance(indexSize);
                 indexInfos = Collection<TermInfoPtr>::newInstance(indexSize);
                 indexPointers = Collection<int64_t>::newInstance(indexSize);
                 
                 for (int32_t i = 0; indexEnum->next(); ++i)
                 {
                     indexTerms[i] = indexEnum->term();
                     indexInfos[i] = indexEnum->termInfo();
                     indexPointers[i] = indexEnum->indexPointer;
                     
                     for (int32_t j = 1; j < indexDivisor; ++j)
                     {
                         if (!indexEnum->next())
                             break;
                     }
                 }
             }
             catch (LuceneException& e)
             {
                 finally = e;
             }
             indexEnum->close();
         }
         else
         {
             // Do not load terms index
             totalIndexInterval = -1;
         }
         success = true;
     }
     catch (LuceneException& e)
     {
         finally = e;
     }
     // With lock-less commits, it's entirely possible (and fine) to hit a FileNotFound exception above. 
     // In this case, we want to explicitly close any subset of things that were opened.
     if (!success)
         close();
     finally.throwException();
 }
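The indexDivisor branch above keeps only every indexDivisor-th index term, trading a smaller in-memory terms index for a longer linear scan after each seek. The subsampling arithmetic on its own, with toy data:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> allEntries = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
        int indexDivisor = 3;
        // same formula as above: 1 + (size - 1) / divisor entries survive
        int indexSize = 1 + ((int)allEntries.size() - 1) / indexDivisor;
        std::vector<int> sampled;
        sampled.reserve(indexSize);
        for (size_t i = 0; i < allEntries.size(); i += (size_t)indexDivisor)
            sampled.push_back(allEntries[i]); // keep one, skip divisor - 1
        std::cout << "kept " << sampled.size() << " of " << allEntries.size() << "\n";
    }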