static inline int utf8towcs(wchar_t *result, const char *str, int len) { int l; /* wmemset(result, L'\0', len); */ l = lucene_utf8towcs(result, str, len); if (l < 0) { printf ("bad utf8 sent to lucene. %d\n", l); l = 0; } if (l < len) { result[l] = 0; } result[len-1] = 0; return l; }
bool CSwordModuleInfo::searchIndexed(const QString& searchedText, sword::ListKey& scope) { char utfBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1]; wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1]; // work around Swords thread insafety for Bibles and Commentaries boost::scoped_ptr < CSwordKey > key(CSwordKey::createInstance(this)); sword::SWKey* s = dynamic_cast < sword::SWKey * >(key.get()); QList<sword::VerseKey*> list; if (s) { m_module->SetKey(*s); } m_searchResult.ClearList(); try { // do not use any stop words const TCHAR* stop_words[] = { NULL }; lucene::analysis::standard::StandardAnalyzer analyzer( stop_words ); lucene::search::IndexSearcher searcher(getModuleStandardIndexLocation().toAscii().constData()); lucene_utf8towcs(wcharBuffer, searchedText.toUtf8().constData(), BT_MAX_LUCENE_FIELD_LENGTH); boost::scoped_ptr<lucene::search::Query> q( lucene::queryParser::QueryParser::parse((const TCHAR*)wcharBuffer, (const TCHAR*)_T("content"), &analyzer) ); boost::scoped_ptr<lucene::search::Hits> h( searcher.search(q.get(), lucene::search::Sort::INDEXORDER) ); const bool useScope = (scope.Count() > 0); // const bool isVerseModule = (type() == CSwordModuleInfo::Bible) || (type() == CSwordModuleInfo::Commentary); lucene::document::Document* doc = 0; boost::scoped_ptr<sword::SWKey> swKey( module()->CreateKey() ); for (int i = 0; i < h->length(); ++i) { doc = &h->doc(i); lucene_wcstoutf8(utfBuffer, (const wchar_t*)doc->get((const TCHAR*)_T("key")), BT_MAX_LUCENE_FIELD_LENGTH); swKey->setText(utfBuffer); // limit results based on scope //if (searchOptions & CSwordModuleSearch::useScope && scope.Count() > 0){ if (useScope) { for (int j = 0; j < scope.Count(); j++) { sword::VerseKey* vkey = dynamic_cast<sword::VerseKey*>(scope.getElement(j)); if (vkey->LowerBound().compare(*swKey) <= 0 && vkey->UpperBound().compare(*swKey) >= 0) { m_searchResult.add(*swKey); } } } else { // no scope, give me all buffers m_searchResult.add(*swKey); } } } catch (...) { qWarning("CLucene exception occurred"); util::showWarning(0, QCoreApplication::tr("Search aborted"), QCoreApplication::tr("An internal error occurred while executing your search.")); return false; } qDeleteAll(list); list.clear(); return (m_searchResult.Count() > 0); }
void CSwordModuleInfo::buildIndex() { m_cancelIndexing = false; try { //Without this we don't get strongs, lemmas, etc backend()->setFilterOptions ( CBTConfig::getFilterOptionDefaults() ); //make sure we reset all important filter options which influcence the plain filters. // turn on these options, they are needed for the EntryAttributes population backend()->setOption( CSwordModuleInfo::strongNumbers, true ); backend()->setOption( CSwordModuleInfo::morphTags, true ); backend()->setOption( CSwordModuleInfo::footnotes, true ); backend()->setOption( CSwordModuleInfo::headings, true ); // we don't want the following in the text, the do not carry searchable information backend()->setOption( CSwordModuleInfo::morphSegmentation, false ); backend()->setOption( CSwordModuleInfo::scriptureReferences, false ); backend()->setOption( CSwordModuleInfo::redLetterWords, false ); // do not use any stop words const TCHAR* stop_words[] = { NULL }; lucene::analysis::standard::StandardAnalyzer an( (const TCHAR**)stop_words ); QString index = getModuleStandardIndexLocation(); QDir dir("/"); dir.mkpath( getGlobalBaseIndexLocation() ); dir.mkpath( getModuleBaseIndexLocation() ); dir.mkpath( getModuleStandardIndexLocation() ); if (lucene::index::IndexReader::indexExists(index.toAscii().constData())) { if (lucene::index::IndexReader::isLocked(index.toAscii().constData()) ) { lucene::index::IndexReader::unlock(index.toAscii().constData()); } } boost::scoped_ptr<lucene::index::IndexWriter> writer( new lucene::index::IndexWriter(index.toAscii().constData(), &an, true) ); //always create a new index writer->setMaxFieldLength(BT_MAX_LUCENE_FIELD_LENGTH); writer->setUseCompoundFile(true); //merge segments into a single file writer->setMinMergeDocs(1000); *m_module = sword::TOP; unsigned long verseLowIndex = m_module->Index(); *m_module = sword::BOTTOM; unsigned long verseHighIndex = m_module->Index(); //verseLowIndex is not 0 in all cases (i.e. NT-only modules) unsigned long verseIndex = verseLowIndex + 1; unsigned long verseSpan = verseHighIndex - verseLowIndex; //Index() is not implemented properly for lexicons, so we use a //workaround. if (type() == CSwordModuleInfo::Lexicon) { verseIndex = 0; verseLowIndex = 0; verseSpan = ((CSwordLexiconModuleInfo*)this)->entries()->size(); } emit indexingProgress(0); sword::SWKey* key = m_module->getKey(); //VerseKey for bibles sword::VerseKey* vk = dynamic_cast<sword::VerseKey*>(key); if (vk) { // we have to be sure to insert the english key into the index, otherwise we'd be in trouble if the language changes vk->setLocale("en_US"); //If we have a verse based module, we want to include the pre-chapter etc. headings in the search vk->Headings(1); } //holds UTF-8 data and is faster than QString. QByteArray textBuffer; // we start with the first module entry, key is automatically updated // because key is a pointer to the modules key m_module->setSkipConsecutiveLinks(true); wchar_t wcharBuffer[BT_MAX_LUCENE_FIELD_LENGTH + 1]; for (*m_module = sword::TOP; !(m_module->Error()) && !m_cancelIndexing; (*m_module)++) { // Also index Chapter 0 and Verse 0, because they might have information in the entry attributes // We used to just put their content into the textBuffer and continue to the next verse, but // with entry attributes this doesn't work any more. // Hits in the search dialog will show up as 1:1 (instead of 0) boost::scoped_ptr<lucene::document::Document> doc(new lucene::document::Document()); //index the key lucene_utf8towcs(wcharBuffer, key->getText(), BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnIndexed((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("key"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO))); // index the main text //at this point we have to make sure we disabled the strongs and the other options //so the plain filters won't include the numbers somehow. lucene_utf8towcs(wcharBuffer, (const char*) textBuffer.append(m_module->StripText()), BT_MAX_LUCENE_FIELD_LENGTH); doc->add(*(new lucene::document::Field((const TCHAR*)_T("content"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); textBuffer.resize(0); //clean up // index attributes sword::AttributeList::iterator attListI; sword::AttributeValue::iterator attValueI; // Footnotes for (attListI = m_module->getEntryAttributes()["Footnote"].begin(); attListI != m_module->getEntryAttributes()["Footnote"].end(); attListI++) { lucene_utf8towcs(wcharBuffer, attListI->second["body"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("footnote"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("footnote"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } // for attListI // Headings for (attValueI = m_module->getEntryAttributes()["Heading"]["Preverse"].begin(); attValueI != m_module->getEntryAttributes()["Heading"]["Preverse"].end(); attValueI++) { lucene_utf8towcs(wcharBuffer, attValueI->second, BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("heading"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("heading"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } // for attValueI // Strongs/Morphs for (attListI = m_module->getEntryAttributes()["Word"].begin(); attListI != m_module->getEntryAttributes()["Word"].end(); attListI++) { // for each attribute if (attListI->second["LemmaClass"] == "strong") { lucene_utf8towcs(wcharBuffer, attListI->second["Lemma"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("strong"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("strong"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); //qWarning("Adding strong %s", attListI->second["Lemma"].c_str()); } if (attListI->second.find("Morph") != attListI->second.end()) { lucene_utf8towcs(wcharBuffer, attListI->second["Morph"], BT_MAX_LUCENE_FIELD_LENGTH); //doc->add(*lucene::document::Field::UnStored((const TCHAR*)_T("morph"), wcharBuffer)); doc->add(*(new lucene::document::Field((const TCHAR*)_T("morph"), (const TCHAR*)wcharBuffer, lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED))); } } // for attListI writer->addDocument(doc.get()); //Index() is not implemented properly for lexicons, so we use a //workaround. if (type() == CSwordModuleInfo::Lexicon) { verseIndex++; } else { verseIndex = m_module->Index(); } if (verseIndex % 200 == 0) { int indexingProgressValue; if (verseSpan == 0) { //prevent division by zero //m_indexingProgress.setValue( QVariant(0) ); indexingProgressValue = 0; } else { //m_indexingProgress.setValue( QVariant((int)((100*(verseIndex-verseLowIndex))/(verseHighIndex-verseLowIndex))) ); indexingProgressValue = (int)((100 * (verseIndex - verseLowIndex)) / (verseSpan)); } //m_indexingProgress.activate(); emit indexingProgress(indexingProgressValue); } } if (!m_cancelIndexing) { writer->optimize(); } writer->close(); if (m_cancelIndexing) { deleteIndex(); m_cancelIndexing = false; } else { QSettings module_config(getModuleBaseIndexLocation() + QString("/bibletime-index.conf"), QSettings::IniFormat); if (hasVersion()) module_config.setValue("module-version", config(CSwordModuleInfo::ModuleVersion) ); module_config.setValue("index-version", INDEX_VERSION); emit hasIndexChanged(true); } } catch (...) { qWarning("CLucene exception occurred while indexing"); util::showWarning(0, QCoreApplication::tr("Indexing aborted"), QCoreApplication::tr("An internal error occurred while building the index.")); deleteIndex(); m_cancelIndexing = false; } }