Exemplos de lucene_utf8towcs em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: filter.cpp Projeto: bongo-project/bongo

static inline int
utf8towcs(wchar_t *result, const char *str, int len)
{
    int l;
    
    /* wmemset(result, L'\0', len); */

    l = lucene_utf8towcs(result, str, len);

    if (l < 0) {
        printf ("bad utf8 sent to lucene.  %d\n", l);
        l = 0;
    }
    if (l < len) {
        result[l] = 0;
    }
    result[len-1] = 0;
    return l;
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: cswordmoduleinfo.cpp Projeto: bibletime/historic-bibletime-svn

bool CSwordModuleInfo::searchIndexed(const QString& searchedText, sword::ListKey& scope) {
    char utfBuffer[BT_MAX_LUCENE_FIELD_LENGTH  + 1];
    wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1];

    // work around Swords thread insafety for Bibles and Commentaries
    boost::scoped_ptr < CSwordKey > key(CSwordKey::createInstance(this));
    sword::SWKey* s = dynamic_cast < sword::SWKey * >(key.get());
    QList<sword::VerseKey*> list;

    if (s) {
        m_module->SetKey(*s);
    }

    m_searchResult.ClearList();

    try {
        // do not use any stop words
        const TCHAR* stop_words[]  = { NULL };
        lucene::analysis::standard::StandardAnalyzer analyzer( stop_words );
        lucene::search::IndexSearcher searcher(getModuleStandardIndexLocation().toAscii().constData());
        lucene_utf8towcs(wcharBuffer, searchedText.toUtf8().constData(), BT_MAX_LUCENE_FIELD_LENGTH);
        boost::scoped_ptr<lucene::search::Query> q( lucene::queryParser::QueryParser::parse((const TCHAR*)wcharBuffer, (const TCHAR*)_T("content"), &analyzer) );

        boost::scoped_ptr<lucene::search::Hits> h( searcher.search(q.get(), lucene::search::Sort::INDEXORDER) );

        const bool useScope = (scope.Count() > 0);
//		const bool isVerseModule = (type() == CSwordModuleInfo::Bible) || (type() == CSwordModuleInfo::Commentary);

        lucene::document::Document* doc = 0;
        boost::scoped_ptr<sword::SWKey> swKey( module()->CreateKey() );


        for (int i = 0; i < h->length(); ++i) {
            doc = &h->doc(i);
            lucene_wcstoutf8(utfBuffer, (const wchar_t*)doc->get((const TCHAR*)_T("key")), BT_MAX_LUCENE_FIELD_LENGTH);

            swKey->setText(utfBuffer);

            // limit results based on scope
            //if (searchOptions & CSwordModuleSearch::useScope && scope.Count() > 0){
            if (useScope) {
                for (int j = 0; j < scope.Count(); j++) {
                    sword::VerseKey* vkey = dynamic_cast<sword::VerseKey*>(scope.getElement(j));
                    if (vkey->LowerBound().compare(*swKey) <= 0 && vkey->UpperBound().compare(*swKey) >= 0) {
                        m_searchResult.add(*swKey);
                    }
                }
            }
            else { // no scope, give me all buffers
                m_searchResult.add(*swKey);
            }
        }
    }
    catch (...) {
        qWarning("CLucene exception occurred");
        util::showWarning(0, QCoreApplication::tr("Search aborted"), QCoreApplication::tr("An internal error occurred while executing your search."));
        return false;
    }

    qDeleteAll(list);
    list.clear();

    return (m_searchResult.Count() > 0);
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: cswordmoduleinfo.cpp Projeto: bibletime/historic-bibletime-svn

void CSwordModuleInfo::buildIndex() {

    m_cancelIndexing = false;

    try {
        //Without this we don't get strongs, lemmas, etc
        backend()->setFilterOptions ( CBTConfig::getFilterOptionDefaults() );
        //make sure we reset all important filter options which influcence the plain filters.
        // turn on these options, they are needed for the EntryAttributes population
        backend()->setOption( CSwordModuleInfo::strongNumbers,  true );
        backend()->setOption( CSwordModuleInfo::morphTags,  true );
        backend()->setOption( CSwordModuleInfo::footnotes,  true );
        backend()->setOption( CSwordModuleInfo::headings,  true );
        // we don't want the following in the text, the do not carry searchable information
        backend()->setOption( CSwordModuleInfo::morphSegmentation,  false );
        backend()->setOption( CSwordModuleInfo::scriptureReferences,  false );
        backend()->setOption( CSwordModuleInfo::redLetterWords,  false );

        // do not use any stop words
        const TCHAR* stop_words[]  = { NULL };
        lucene::analysis::standard::StandardAnalyzer an( (const TCHAR**)stop_words );
        QString index = getModuleStandardIndexLocation();

        QDir dir("/");
        dir.mkpath( getGlobalBaseIndexLocation() );
        dir.mkpath( getModuleBaseIndexLocation() );
        dir.mkpath( getModuleStandardIndexLocation() );

        if (lucene::index::IndexReader::indexExists(index.toAscii().constData())) {
            if (lucene::index::IndexReader::isLocked(index.toAscii().constData()) ) {
                lucene::index::IndexReader::unlock(index.toAscii().constData());
            }
        }

        boost::scoped_ptr<lucene::index::IndexWriter> writer( new lucene::index::IndexWriter(index.toAscii().constData(), &an, true) ); //always create a new index
        writer->setMaxFieldLength(BT_MAX_LUCENE_FIELD_LENGTH);
        writer->setUseCompoundFile(true); //merge segments into a single file
        writer->setMinMergeDocs(1000);

        *m_module = sword::TOP;
        unsigned long verseLowIndex = m_module->Index();
        *m_module = sword::BOTTOM;
        unsigned long verseHighIndex = m_module->Index();

        //verseLowIndex is not 0 in all cases (i.e. NT-only modules)
        unsigned long verseIndex = verseLowIndex + 1;
        unsigned long verseSpan = verseHighIndex - verseLowIndex;

        //Index() is not implemented properly for lexicons, so we use a
        //workaround.
        if (type() == CSwordModuleInfo::Lexicon) {
            verseIndex = 0;
            verseLowIndex = 0;
            verseSpan = ((CSwordLexiconModuleInfo*)this)->entries()->size();
        }

        emit indexingProgress(0);

        sword::SWKey* key = m_module->getKey();
        //VerseKey for bibles
        sword::VerseKey* vk = dynamic_cast<sword::VerseKey*>(key);

        if (vk) {
            // we have to be sure to insert the english key into the index, otherwise we'd be in trouble if the language changes
            vk->setLocale("en_US");
            //If we have a verse based module, we want to include the pre-chapter etc. headings in the search
            vk->Headings(1);
        }

        //holds UTF-8 data and is faster than QString.
        QByteArray textBuffer;

        // we start with the first module entry, key is automatically updated
        // because key is a pointer to the modules key
        m_module->setSkipConsecutiveLinks(true);

        wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1];

        for (*m_module = sword::TOP; !(m_module->Error()) && !m_cancelIndexing; (*m_module)++) {

            // Also index Chapter 0 and Verse 0, because they might have information in the entry attributes
            // We used to just put their content into the textBuffer and continue to the next verse, but
            // with entry attributes this doesn't work any more.
            // Hits in the search dialog will show up as 1:1 (instead of 0)

            boost::scoped_ptr<lucene::document::Document> doc(new lucene::document::Document());

            //index the key
            lucene_utf8towcs(wcharBuffer, key->getText(), BT_MAX_LUCENE_FIELD_LENGTH);

            //doc->add(*lucene::document::Field::UnIndexed((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer));
            doc->add(*(new lucene::document::Field((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)));

            // index the main text
            //at this point we have to make sure we disabled the strongs and the other options
            //so the plain filters won't include the numbers somehow.
            lucene_utf8towcs(wcharBuffer, (const char*) textBuffer.append(m_module->StripText()), BT_MAX_LUCENE_FIELD_LENGTH);
            doc->add(*(new lucene::document::Field((const TCHAR*)_T("content"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            textBuffer.resize(0); //clean up

            // index attributes
            sword::AttributeList::iterator attListI;
            sword::AttributeValue::iterator attValueI;
            // Footnotes
            for (attListI = m_module->getEntryAttributes()["Footnote"].begin();
                    attListI != m_module->getEntryAttributes()["Footnote"].end();
                    attListI++) {
                lucene_utf8towcs(wcharBuffer, attListI->second["body"], BT_MAX_LUCENE_FIELD_LENGTH);
                //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("footnote"), wcharBuffer));
                doc->add(*(new lucene::document::Field((const TCHAR*)_T("footnote"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            } // for attListI

            // Headings
            for (attValueI = m_module->getEntryAttributes()["Heading"]["Preverse"].begin();
                    attValueI != m_module->getEntryAttributes()["Heading"]["Preverse"].end();
                    attValueI++) {
                lucene_utf8towcs(wcharBuffer, attValueI->second, BT_MAX_LUCENE_FIELD_LENGTH);
                //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("heading"), wcharBuffer));
                doc->add(*(new lucene::document::Field((const TCHAR*)_T("heading"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            } // for attValueI

            // Strongs/Morphs
            for (attListI = m_module->getEntryAttributes()["Word"].begin();
                    attListI != m_module->getEntryAttributes()["Word"].end();
                    attListI++) {
                // for each attribute
                if (attListI->second["LemmaClass"] == "strong") {
                    lucene_utf8towcs(wcharBuffer, attListI->second["Lemma"], BT_MAX_LUCENE_FIELD_LENGTH);
                    //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("strong"), wcharBuffer));
                    doc->add(*(new lucene::document::Field((const TCHAR*)_T("strong"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
                    //qWarning("Adding strong %s", attListI->second["Lemma"].c_str());
                }
                if (attListI->second.find("Morph") != attListI->second.end()) {
                    lucene_utf8towcs(wcharBuffer, attListI->second["Morph"], BT_MAX_LUCENE_FIELD_LENGTH);
                    //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("morph"), wcharBuffer));
                    doc->add(*(new lucene::document::Field((const TCHAR*)_T("morph"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
                }
            } // for attListI

            writer->addDocument(doc.get());
            //Index() is not implemented properly for lexicons, so we use a
            //workaround.
            if (type() == CSwordModuleInfo::Lexicon) {
                verseIndex++;
            }
            else {
                verseIndex = m_module->Index();
            }

            if (verseIndex % 200 == 0) {
                int indexingProgressValue;
                if (verseSpan == 0) { //prevent division by zero
                    //m_indexingProgress.setValue( QVariant(0) );
                    indexingProgressValue = 0;
                }
                else {
                    //m_indexingProgress.setValue( QVariant((int)((100*(verseIndex-verseLowIndex))/(verseHighIndex-verseLowIndex))) );
                    indexingProgressValue = (int)((100 * (verseIndex - verseLowIndex)) / (verseSpan));
                }
                //m_indexingProgress.activate();
                emit indexingProgress(indexingProgressValue);
            }
        }

        if (!m_cancelIndexing) {
            writer->optimize();
        }
        writer->close();

        if (m_cancelIndexing) {
            deleteIndex();
            m_cancelIndexing = false;
        }
        else {
            QSettings module_config(getModuleBaseIndexLocation() + QString("/bibletime-index.conf"), QSettings::IniFormat);
            if (hasVersion()) module_config.setValue("module-version", config(CSwordModuleInfo::ModuleVersion) );
            module_config.setValue("index-version", INDEX_VERSION);
            emit hasIndexChanged(true);
        }
    }
    catch (...) {
        qWarning("CLucene exception occurred while indexing");
        util::showWarning(0, QCoreApplication::tr("Indexing aborted"), QCoreApplication::tr("An internal error occurred while building the index."));
        deleteIndex();
        m_cancelIndexing = false;
    }
}