bool XapianDocument::removeTermStartsWith(const QByteArray &prefix) { bool modified = false; Xapian::TermIterator it = m_doc.termlist_begin(); it.skip_to(prefix.constData()); while (it != m_doc.termlist_end()) { const std::string t = *it; const QByteArray term = QByteArray::fromRawData(t.c_str(), t.size()); if (!term.startsWith(prefix)) { break; } // The term should not just be the prefix if (term.size() <= prefix.size()) { break; } // The term should not contain any more upper case letters if (isupper(term.at(prefix.size()))) { ++it; continue; } ++it; m_doc.remove_term(t); modified = true; } return modified; }
/// Suggests terms. unsigned int XapianIndex::getCloseTerms(const string &term, set<string> &suggestions) { XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return 0; } suggestions.clear(); try { Xapian::Database *pIndex = pDatabase->readLock(); if (pIndex != NULL) { Xapian::TermIterator termIter = pIndex->allterms_begin(); if (termIter != pIndex->allterms_end()) { string baseTerm(StringManip::toLowerCase(term)); unsigned int count = 0; // Get the next 10 terms termIter.skip_to(baseTerm); while ((termIter != pIndex->allterms_end()) && (count < 10)) { string suggestedTerm(*termIter); if (suggestedTerm.find(baseTerm) != 0) { // This term doesn't have the same root break; } suggestions.insert(*termIter); // Next ++count; ++termIter; } } } } catch (const Xapian::Error &error) { cerr << "Couldn't get terms: " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't get terms, unknown exception occured" << endl; } pDatabase->unlock(); return suggestions.size(); }
/// Returns a document's labels. bool XapianIndex::getDocumentLabels(unsigned int docId, set<string> &labels) const { bool gotLabels = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } labels.clear(); try { Xapian::Database *pIndex = pDatabase->readLock(); if (pIndex != NULL) { Xapian::TermIterator termIter = pIndex->termlist_begin(docId); if (termIter != pIndex->termlist_end(docId)) { for (termIter.skip_to("XLABEL:"); termIter != pIndex->termlist_end(docId); ++termIter) { if ((*termIter).length() < 7) { break; } // Is this a label ? if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0) { labels.insert((*termIter).substr(7)); } } gotLabels = true; } } } catch (const Xapian::Error &error) { cerr << "Couldn't get document's labels: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't get document's labels, unknown exception occured" << endl; } pDatabase->unlock(); return gotLabels; }
/**
 * Fetches the first term of the document that starts with @p term.
 *
 * @param term the prefix the returned term has to start with
 * @return the first matching term (prefix included) decoded as UTF-8, or an
 *         empty QString when the document has no such term or when Xapian
 *         raises an error
 */
QString XapianDocument::fetchTermStartsWith(const QByteArray &term)
{
    try {
        Xapian::TermIterator it = m_doc.termlist_begin();
        it.skip_to(term.constData());

        if (it == m_doc.termlist_end()) {
            return QString();
        }

        const std::string str = *it;

        // skip_to() stops at the first term that sorts at or after the
        // prefix, which is not necessarily a term that starts with it
        if (!QByteArray::fromRawData(str.c_str(), str.size()).startsWith(term)) {
            return QString();
        }

        return QString::fromUtf8(str.c_str(), str.length());
    } catch (const Xapian::Error &) {
        return QString();
    }
}
/// Removes the first position of a term from a document.
///
/// Advances termListIter to the given term and, if the term is there and has
/// at least one position, removes that first position from doc. Nothing
/// happens when the term is not found. Xapian errors raised by the removal
/// itself are swallowed, and only reported in DEBUG builds.
static void removeFirstPosting(Xapian::Document &doc,
    Xapian::TermIterator &termListIter, const string &term)
{
    termListIter.skip_to(term);

    // skip_to() runs off the end, or stops on a later term, when the term is
    // missing: there's no position list to look at in either case
    if ((termListIter == doc.termlist_end()) ||
        (*termListIter != term))
    {
        return;
    }

    Xapian::PositionIterator firstPosIter = termListIter.positionlist_begin();
    if (firstPosIter != termListIter.positionlist_end())
    {
        try
        {
            doc.remove_posting(term, *firstPosIter);
        }
        catch (const Xapian::Error &error)
        {
            // This posting may have been removed already
#ifdef DEBUG
            cout << "XapianIndex::removeFirstPosting: " << error.get_msg() << endl;
#endif
        }
    }
}
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query, const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps) { Timer timer; unsigned int maxResultsCount = queryProps.getMaximumResultsCount(); bool completedQuery = false; if (pIndex == NULL) { return false; } // Start an enquire session on the database Xapian::Enquire enquire(*pIndex); timer.start(); try { AbstractGenerator abstractGen(pIndex, 50); vector<string> seedTerms; // Give the query object to the enquire session enquire.set_query(query); // How should results be sorted ? if (queryProps.getSortOrder() == QueryProperties::RELEVANCE) { // By relevance, only enquire.set_sort_by_relevance_then_value(4); #ifdef DEBUG cout << "XapianEngine::queryDatabase: sorting by relevance first" << endl; #endif } else if (queryProps.getSortOrder() == QueryProperties::DATE) { // By date, and then by relevance enquire.set_sort_by_value_then_relevance(4); #ifdef DEBUG cout << "XapianEngine::queryDatabase: sorting by date and time first" << endl; #endif } // Get the top results of the query Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1); m_resultsCountEstimate = matches.get_matches_estimated(); if (matches.empty() == false) { #ifdef DEBUG cout << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount << " results found from position " << startDoc << endl; cout << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound() << "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound() << endl; #endif // Get the results for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter) { Xapian::docid docId = *mIter; Xapian::Document doc(mIter.get_document()); // What terms did this document match ? 
seedTerms.clear(); for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId); termIter != enquire.get_matching_terms_end(docId); ++termIter) { char firstChar = (*termIter)[0]; if (isupper(((int)firstChar)) == 0) { seedTerms.push_back(*termIter); #ifdef DEBUG cout << "XapianEngine::queryDatabase: matched term " << *termIter << endl; #endif } else if (firstChar == 'Z') { string stemmed((*termIter).substr(1)); string::size_type stemmedLen = stemmed.length(); // Which of this document's terms stem to this ? Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId); if (docTermIter != pIndex->termlist_end(docId)) { for (docTermIter.skip_to(stemmed); docTermIter != pIndex->termlist_end(docId); ++docTermIter) { // Is this a potential unstem ? if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0) { // No, no point looking at the next terms break; } #ifdef DEBUG cout << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl; #endif // FIXME: check this term stems to stemmed ! seedTerms.push_back(*docTermIter); } } } } DocumentInfo thisResult; thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms)); thisResult.setScore((float)mIter.get_percent()); #ifdef DEBUG cout << "XapianEngine::queryDatabase: found document ID " << docId << endl; #endif XapianDatabase::recordToProps(doc.get_data(), &thisResult); // XapianDatabase stored the language in English thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage())); string url(thisResult.getLocation()); if (url.empty() == true) { // Hmmm this shouldn't be empty... 
// Use this instead, even though the document isn't cached in the index thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId)); } // We don't know the index ID, just the document ID thisResult.setIsIndexed(0, docId); // Add this result m_resultsList.push_back(thisResult); } } completedQuery = true; } catch (const Xapian::Error &error) { cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl; } cout << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl; try { m_expandTerms.clear(); // Expand the query ? if (m_expandDocuments.empty() == false) { Xapian::RSet expandDocs; for (set<string>::const_iterator docIter = m_expandDocuments.begin(); docIter != m_expandDocuments.end(); ++docIter) { string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true)); // Only one document may have this term Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm); if (postingIter != pIndex->postlist_end(uniqueTerm)) { expandDocs.add_document(*postingIter); } } #ifdef DEBUG cout << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl; #endif // Get 10 non-prefixed terms string allowedPrefixes("RS"); TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer), FileStopper::get_stopper(Languages::toCode(stemLanguage)), allowedPrefixes, query); Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider); #ifdef DEBUG cout << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl; #endif for (Xapian::ESetIterator termIter = expandTerms.begin(); termIter != expandTerms.end(); ++termIter) { string expandTerm(*termIter); char firstChar = expandTerm[0]; // Is this prefixed ? 
if (allowedPrefixes.find(firstChar) != string::npos) { expandTerm.erase(0, 1); } m_expandTerms.insert(expandTerm); } } } catch (const Xapian::Error &error) { cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl; } // Be tolerant of errors as long as we got some results if ((completedQuery == true) || (m_resultsList.empty() == false)) { return true; } return false; }
/// Gets terms with the same root. unsigned int XapianIndex::getCloseTerms(const string &term, set<string> &suggestions) { XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return 0; } suggestions.clear(); try { Xapian::Database *pIndex = pDatabase->readLock(); if (pIndex != NULL) { Xapian::TermIterator termIter = pIndex->allterms_begin(); if (termIter != pIndex->allterms_end()) { string baseTerm(StringManip::toLowerCase(term)); unsigned int count = 0; bool isUpper = false; if (isupper((int)term[0]) != 0) { // R-prefix the term baseTerm = string("R") + term; isUpper = true; } // Get the next 10 terms for (termIter.skip_to(baseTerm); (termIter != pIndex->allterms_end()) && (count < 10); ++termIter) { string suggestedTerm(*termIter); if (suggestedTerm.find(baseTerm) != 0) { // This term doesn't have the same root if (isUpper == true) { // Try again without capital letters baseTerm = StringManip::toLowerCase(term); termIter = pIndex->allterms_begin(); if (termIter != pIndex->allterms_end()) { termIter.skip_to(baseTerm); isUpper = false; continue; } } break; } if (isUpper == true) { // Remove the R prefix suggestedTerm.erase(0, 1); } suggestions.insert(suggestedTerm); ++count; } } } } catch (const Xapian::Error &error) { cerr << "Couldn't get terms: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't get terms, unknown exception occured" << endl; } pDatabase->unlock(); return suggestions.size(); }
/// Sets a document's labels. bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels, bool resetLabels) { bool updatedLabels = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { Xapian::Document doc = pIndex->get_document(docId); // Reset existing labels ? if (resetLabels == true) { Xapian::TermIterator termIter = pIndex->termlist_begin(docId); if (termIter != pIndex->termlist_end(docId)) { for (termIter.skip_to("XLABEL:"); termIter != pIndex->termlist_end(docId); ++termIter) { // Is this a label ? if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0) { doc.remove_term(*termIter); } } } } // Set new labels for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { if (labelIter->empty() == false) { doc.add_term(limitTermLength(string("XLABEL:") + *labelIter)); } } pIndex->replace_document(docId, doc); updatedLabels = true; } } catch (const Xapian::Error &error) { cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't update document's labels, unknown exception occured" << endl; } pDatabase->unlock(); return updatedLabels; }