/**
 * Translates a raw index counter into the matching signal.
 *
 * Emits indexingFinished() when \a index equals mIndexLayerCount;
 * for every other value emits indexingProgress( index ).
 *
 * \param index counter value to report (presumably the number of layers
 *              indexed so far -- confirm against the caller)
 */
void SnappingUtils::prepareIndexProgress( int index )
{
  if ( index != mIndexLayerCount )
  {
    emit indexingProgress( index );
    return;
  }

  emit indexingFinished();
}
bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile ) { if ( docs.isEmpty() ) return false; docList = docs; if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) ) entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) ); QList< QUrl >::ConstIterator it = docList.begin(); int steps = docList.count() / 100; if ( !steps ) steps++; int prog = 0; for ( int i = 0; it != docList.end(); ++it, ++i ) { if ( lastWindowClosed ) return false; QUrl filename = *it; QStringList terms; if ( parseDocumentToStringlist( chmFile, filename, terms ) ) { for ( QStringList::ConstIterator tit = terms.begin(); tit != terms.end(); ++tit ) insertInDict( *tit, i ); } if ( i%steps == 0 ) { prog++; prog = qMin( prog, 99 ); emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) ); } } emit indexingProgress( 100, tr("Processing completed") ); return true; }
void CSwordModuleInfo::buildIndex() { m_cancelIndexing = false; try { //Without this we don't get strongs, lemmas, etc backend()->setFilterOptions ( CBTConfig::getFilterOptionDefaults() ); //make sure we reset all important filter options which influcence the plain filters. // turn on these options, they are needed for the EntryAttributes population backend()->setOption( CSwordModuleInfo::strongNumbers, true ); backend()->setOption( CSwordModuleInfo::morphTags, true ); backend()->setOption( CSwordModuleInfo::footnotes, true ); backend()->setOption( CSwordModuleInfo::headings, true ); // we don't want the following in the text, the do not carry searchable information backend()->setOption( CSwordModuleInfo::morphSegmentation, false ); backend()->setOption( CSwordModuleInfo::scriptureReferences, false ); backend()->setOption( CSwordModuleInfo::redLetterWords, false ); // do not use any stop words const TCHAR* stop_words[] = { NULL }; lucene::analysis::standard::StandardAnalyzer an( (const TCHAR**)stop_words ); QString index = getModuleStandardIndexLocation(); QDir dir("/"); dir.mkpath( getGlobalBaseIndexLocation() ); dir.mkpath( getModuleBaseIndexLocation() ); dir.mkpath( getModuleStandardIndexLocation() ); if (lucene::index::IndexReader::indexExists(index.toAscii().constData())) { if (lucene::index::IndexReader::isLocked(index.toAscii().constData()) ) { lucene::index::IndexReader::unlock(index.toAscii().constData()); } } boost::scoped_ptr<lucene::index::IndexWriter> writer( new lucene::index::IndexWriter(index.toAscii().constData(), &an, true) ); //always create a new index writer->setMaxFieldLength(BT_MAX_LUCENE_FIELD_LENGTH); writer->setUseCompoundFile(true); //merge segments into a single file writer->setMinMergeDocs(1000); *m_module = sword::TOP; unsigned long verseLowIndex = m_module->Index(); *m_module = sword::BOTTOM; unsigned long verseHighIndex = m_module->Index(); //verseLowIndex is not 0 in all cases (i.e. 
NT-only modules) unsigned long verseIndex = verseLowIndex + 1; unsigned long verseSpan = verseHighIndex - verseLowIndex; //Index() is not implemented properly for lexicons, so we use a //workaround. if (type() == CSwordModuleInfo::Lexicon) { verseIndex = 0; verseLowIndex = 0; verseSpan = ((CSwordLexiconModuleInfo*)this)->entries()->size(); } emit indexingProgress(0); sword::SWKey* key = m_module->getKey(); //VerseKey for bibles sword::VerseKey* vk = dynamic_cast<sword::VerseKey*>(key); if (vk) { // we have to be sure to insert the english key into the index, otherwise we'd be in trouble if the language changes vk->setLocale("en_US"); //If we have a verse based module, we want to include the pre-chapter etc. headings in the search vk->Headings(1); } //holds UTF-8 data and is faster than QString. QByteArray textBuffer; // we start with the first module entry, key is automatically updated // because key is a pointer to the modules key m_module->setSkipConsecutiveLinks(true); wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1]; for (*m_module = sword::TOP; !(m_module->Error()) && !m_cancelIndexing; (*m_module)++) { // Also index Chapter 0 and Verse 0, because they might have information in the entry attributes // We used to just put their content into the textBuffer and continue to the next verse, but // with entry attributes this doesn't work any more. 
// Hits in the search dialog will show up as 1:1 (instead of 0) boost::scoped_ptr<lucene::document::Document> doc(new lucene::document::Document()); //index the key lucene_utf8towcs(wcharBuffer, key->getText(), BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnIndexed((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO))); // index the main text //at this point we have to make sure we disabled the strongs and the other options //so the plain filters won't include the numbers somehow. lucene_utf8towcs(wcharBuffer, (const char*) textBuffer.append(m_module->StripText()), BT_MAX_LUCENE_FIELD_LENGTH); doc->add(*(new lucene::document::Field((const TCHAR*)_T("content"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); textBuffer.resize(0); //clean up // index attributes sword::AttributeList::iterator attListI; sword::AttributeValue::iterator attValueI; // Footnotes for (attListI = m_module->getEntryAttributes()["Footnote"].begin(); attListI != m_module->getEntryAttributes()["Footnote"].end(); attListI++) { lucene_utf8towcs(wcharBuffer, attListI->second["body"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("footnote"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("footnote"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } // for attListI // Headings for (attValueI = m_module->getEntryAttributes()["Heading"]["Preverse"].begin(); attValueI != m_module->getEntryAttributes()["Heading"]["Preverse"].end(); attValueI++) { lucene_utf8towcs(wcharBuffer, attValueI->second, BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("heading"), wcharBuffer)); doc->add(*(new 
lucene::document::Field((const TCHAR*)_T("heading"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } // for attValueI // Strongs/Morphs for (attListI = m_module->getEntryAttributes()["Word"].begin(); attListI != m_module->getEntryAttributes()["Word"].end(); attListI++) { // for each attribute if (attListI->second["LemmaClass"] == "strong") { lucene_utf8towcs(wcharBuffer, attListI->second["Lemma"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("strong"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("strong"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); //qWarning("Adding strong %s", attListI->second["Lemma"].c_str()); } if (attListI->second.find("Morph") != attListI->second.end()) { lucene_utf8towcs(wcharBuffer, attListI->second["Morph"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("morph"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("morph"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } } // for attListI writer->addDocument(doc.get()); //Index() is not implemented properly for lexicons, so we use a //workaround. 
if (type() == CSwordModuleInfo::Lexicon) { verseIndex++; } else { verseIndex = m_module->Index(); } if (verseIndex % 200 == 0) { int indexingProgressValue; if (verseSpan == 0) { //prevent division by zero //m_indexingProgress.setValue( QVariant(0) ); indexingProgressValue = 0; } else { //m_indexingProgress.setValue( QVariant((int)((100*(verseIndex-verseLowIndex))/(verseHighIndex-verseLowIndex))) ); indexingProgressValue = (int)((100 * (verseIndex - verseLowIndex)) / (verseSpan)); } //m_indexingProgress.activate(); emit indexingProgress(indexingProgressValue); } } if (!m_cancelIndexing) { writer->optimize(); } writer->close(); if (m_cancelIndexing) { deleteIndex(); m_cancelIndexing = false; } else { QSettings module_config(getModuleBaseIndexLocation() + QString("/bibletime-index.conf"), QSettings::IniFormat); if (hasVersion()) module_config.setValue("module-version", config(CSwordModuleInfo::ModuleVersion) ); module_config.setValue("index-version", INDEX_VERSION); emit hasIndexChanged(true); } } catch (...) { qWarning("CLucene exception occurred while indexing"); util::showWarning(0, QCoreApplication::tr("Indexing aborted"), QCoreApplication::tr("An internal error occurred while building the index.")); deleteIndex(); m_cancelIndexing = false; } }