Ejemplo n.º 1
0
void SnappingUtils::prepareIndexProgress( int index )
{
  if ( index == mIndexLayerCount )
    emit indexingFinished();
  else
    emit indexingProgress( index );
}
bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile )
{
	if ( docs.isEmpty() )
		return false;
	
	docList = docs;

	if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) )
		entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) );
	
	QList< QUrl >::ConstIterator it = docList.begin();
	int steps = docList.count() / 100;
	
	if ( !steps )
		steps++;
	
	int prog = 0;
	
	for ( int i = 0; it != docList.end(); ++it, ++i )
	{
		if ( lastWindowClosed )
			return false;

		QUrl filename = *it;
		QStringList terms;
		
		if ( parseDocumentToStringlist( chmFile, filename, terms ) )
		{
			for ( QStringList::ConstIterator tit = terms.begin(); tit != terms.end(); ++tit )
				insertInDict( *tit, i );
		}
		
		if ( i%steps == 0 )
		{
			prog++;
			prog = qMin( prog, 99 );
			emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) );
		}
	}
	
	emit indexingProgress( 100, tr("Processing completed") );
	return true;
}
void CSwordModuleInfo::buildIndex() {

    m_cancelIndexing = false;

    try {
        //Without this we don't get strongs, lemmas, etc
        backend()->setFilterOptions ( CBTConfig::getFilterOptionDefaults() );
        //make sure we reset all important filter options which influcence the plain filters.
        // turn on these options, they are needed for the EntryAttributes population
        backend()->setOption( CSwordModuleInfo::strongNumbers,  true );
        backend()->setOption( CSwordModuleInfo::morphTags,  true );
        backend()->setOption( CSwordModuleInfo::footnotes,  true );
        backend()->setOption( CSwordModuleInfo::headings,  true );
        // we don't want the following in the text, the do not carry searchable information
        backend()->setOption( CSwordModuleInfo::morphSegmentation,  false );
        backend()->setOption( CSwordModuleInfo::scriptureReferences,  false );
        backend()->setOption( CSwordModuleInfo::redLetterWords,  false );

        // do not use any stop words
        const TCHAR* stop_words[]  = { NULL };
        lucene::analysis::standard::StandardAnalyzer an( (const TCHAR**)stop_words );
        QString index = getModuleStandardIndexLocation();

        QDir dir("/");
        dir.mkpath( getGlobalBaseIndexLocation() );
        dir.mkpath( getModuleBaseIndexLocation() );
        dir.mkpath( getModuleStandardIndexLocation() );

        if (lucene::index::IndexReader::indexExists(index.toAscii().constData())) {
            if (lucene::index::IndexReader::isLocked(index.toAscii().constData()) ) {
                lucene::index::IndexReader::unlock(index.toAscii().constData());
            }
        }

        boost::scoped_ptr<lucene::index::IndexWriter> writer( new lucene::index::IndexWriter(index.toAscii().constData(), &an, true) ); //always create a new index
        writer->setMaxFieldLength(BT_MAX_LUCENE_FIELD_LENGTH);
        writer->setUseCompoundFile(true); //merge segments into a single file
        writer->setMinMergeDocs(1000);

        *m_module = sword::TOP;
        unsigned long verseLowIndex = m_module->Index();
        *m_module = sword::BOTTOM;
        unsigned long verseHighIndex = m_module->Index();

        //verseLowIndex is not 0 in all cases (i.e. NT-only modules)
        unsigned long verseIndex = verseLowIndex + 1;
        unsigned long verseSpan = verseHighIndex - verseLowIndex;

        //Index() is not implemented properly for lexicons, so we use a
        //workaround.
        if (type() == CSwordModuleInfo::Lexicon) {
            verseIndex = 0;
            verseLowIndex = 0;
            verseSpan = ((CSwordLexiconModuleInfo*)this)->entries()->size();
        }

        emit indexingProgress(0);

        sword::SWKey* key = m_module->getKey();
        //VerseKey for bibles
        sword::VerseKey* vk = dynamic_cast<sword::VerseKey*>(key);

        if (vk) {
            // we have to be sure to insert the english key into the index, otherwise we'd be in trouble if the language changes
            vk->setLocale("en_US");
            //If we have a verse based module, we want to include the pre-chapter etc. headings in the search
            vk->Headings(1);
        }

        //holds UTF-8 data and is faster than QString.
        QByteArray textBuffer;

        // we start with the first module entry, key is automatically updated
        // because key is a pointer to the modules key
        m_module->setSkipConsecutiveLinks(true);

        wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1];

        for (*m_module = sword::TOP; !(m_module->Error()) && !m_cancelIndexing; (*m_module)++) {

            // Also index Chapter 0 and Verse 0, because they might have information in the entry attributes
            // We used to just put their content into the textBuffer and continue to the next verse, but
            // with entry attributes this doesn't work any more.
            // Hits in the search dialog will show up as 1:1 (instead of 0)

            boost::scoped_ptr<lucene::document::Document> doc(new lucene::document::Document());

            //index the key
            lucene_utf8towcs(wcharBuffer, key->getText(), BT_MAX_LUCENE_FIELD_LENGTH);

            //doc->add(*lucene::document::Field::UnIndexed((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer));
            doc->add(*(new lucene::document::Field((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)));

            // index the main text
            //at this point we have to make sure we disabled the strongs and the other options
            //so the plain filters won't include the numbers somehow.
            lucene_utf8towcs(wcharBuffer, (const char*) textBuffer.append(m_module->StripText()), BT_MAX_LUCENE_FIELD_LENGTH);
            doc->add(*(new lucene::document::Field((const TCHAR*)_T("content"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            textBuffer.resize(0); //clean up

            // index attributes
            sword::AttributeList::iterator attListI;
            sword::AttributeValue::iterator attValueI;
            // Footnotes
            for (attListI = m_module->getEntryAttributes()["Footnote"].begin();
                    attListI != m_module->getEntryAttributes()["Footnote"].end();
                    attListI++) {
                lucene_utf8towcs(wcharBuffer, attListI->second["body"], BT_MAX_LUCENE_FIELD_LENGTH);
                //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("footnote"), wcharBuffer));
                doc->add(*(new lucene::document::Field((const TCHAR*)_T("footnote"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            } // for attListI

            // Headings
            for (attValueI = m_module->getEntryAttributes()["Heading"]["Preverse"].begin();
                    attValueI != m_module->getEntryAttributes()["Heading"]["Preverse"].end();
                    attValueI++) {
                lucene_utf8towcs(wcharBuffer, attValueI->second, BT_MAX_LUCENE_FIELD_LENGTH);
                //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("heading"), wcharBuffer));
                doc->add(*(new lucene::document::Field((const TCHAR*)_T("heading"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
            } // for attValueI

            // Strongs/Morphs
            for (attListI = m_module->getEntryAttributes()["Word"].begin();
                    attListI != m_module->getEntryAttributes()["Word"].end();
                    attListI++) {
                // for each attribute
                if (attListI->second["LemmaClass"] == "strong") {
                    lucene_utf8towcs(wcharBuffer, attListI->second["Lemma"], BT_MAX_LUCENE_FIELD_LENGTH);
                    //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("strong"), wcharBuffer));
                    doc->add(*(new lucene::document::Field((const TCHAR*)_T("strong"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
                    //qWarning("Adding strong %s", attListI->second["Lemma"].c_str());
                }
                if (attListI->second.find("Morph") != attListI->second.end()) {
                    lucene_utf8towcs(wcharBuffer, attListI->second["Morph"], BT_MAX_LUCENE_FIELD_LENGTH);
                    //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("morph"), wcharBuffer));
                    doc->add(*(new lucene::document::Field((const TCHAR*)_T("morph"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED)));
                }
            } // for attListI

            writer->addDocument(doc.get());
            //Index() is not implemented properly for lexicons, so we use a
            //workaround.
            if (type() == CSwordModuleInfo::Lexicon) {
                verseIndex++;
            }
            else {
                verseIndex = m_module->Index();
            }

            if (verseIndex % 200 == 0) {
                int indexingProgressValue;
                if (verseSpan == 0) { //prevent division by zero
                    //m_indexingProgress.setValue( QVariant(0) );
                    indexingProgressValue = 0;
                }
                else {
                    //m_indexingProgress.setValue( QVariant((int)((100*(verseIndex-verseLowIndex))/(verseHighIndex-verseLowIndex))) );
                    indexingProgressValue = (int)((100 * (verseIndex - verseLowIndex)) / (verseSpan));
                }
                //m_indexingProgress.activate();
                emit indexingProgress(indexingProgressValue);
            }
        }

        if (!m_cancelIndexing) {
            writer->optimize();
        }
        writer->close();

        if (m_cancelIndexing) {
            deleteIndex();
            m_cancelIndexing = false;
        }
        else {
            QSettings module_config(getModuleBaseIndexLocation() + QString("/bibletime-index.conf"), QSettings::IniFormat);
            if (hasVersion()) module_config.setValue("module-version", config(CSwordModuleInfo::ModuleVersion) );
            module_config.setValue("index-version", INDEX_VERSION);
            emit hasIndexChanged(true);
        }
    }
    catch (...) {
        qWarning("CLucene exception occurred while indexing");
        util::showWarning(0, QCoreApplication::tr("Indexing aborted"), QCoreApplication::tr("An internal error occurred while building the index."));
        deleteIndex();
        m_cancelIndexing = false;
    }
}