bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query) { bool completedQuery = false; if (pIndex == NULL) { return false; } // Start an enquire session on the database Xapian::Enquire enquire(*pIndex); try { AbstractGenerator abstractGen(pIndex, 50); vector<string> seedTerms; // Give the query object to the enquire session enquire.set_query(query); // Get the top results of the query Xapian::MSet matches = enquire.get_mset(0, m_maxResultsCount); if (matches.empty() == false) { // Get the results #ifdef DEBUG cout << "XapianEngine::queryDatabase: " << matches.get_matches_estimated() << "/" << m_maxResultsCount << " results found" << endl; #endif for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter) { Xapian::docid docId = *mIter; Xapian::Document doc(mIter.get_document()); // What terms did this document match ? seedTerms.clear(); for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId); termIter != enquire.get_matching_terms_end(docId); ++termIter) { seedTerms.push_back(*termIter); } DocumentInfo thisResult; thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms)); thisResult.setScore((float)mIter.get_percent()); #ifdef DEBUG cout << "XapianEngine::queryDatabase: found document ID " << docId << endl; #endif XapianDatabase::recordToProps(doc.get_data(), &thisResult); string url(thisResult.getLocation()); if (url.empty() == true) { // Hmmm this shouldn't be empty... // Use this instead, even though the document isn't cached in the index thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId)); } // We don't know the index ID, just the document ID thisResult.setIsIndexed(0, docId); // Add this result m_resultsList.push_back(thisResult); } } completedQuery = true; } catch (const Xapian::Error &error) { cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl; } try { m_expandTerms.clear(); // Expand the query ? 
if (m_relevantDocuments.empty() == false) { Xapian::RSet relevantDocs; unsigned int count = 0; for (set<unsigned int>::const_iterator docIter = m_relevantDocuments.begin(); docIter != m_relevantDocuments.end(); ++docIter) { relevantDocs.add_document(*docIter); } // Get 10 non-prefixed terms Xapian::ESet expandTerms = enquire.get_eset(20, relevantDocs); for (Xapian::ESetIterator termIter = expandTerms.begin(); (termIter != expandTerms.end()) && (count < 10); ++termIter) { if (isupper((int)((*termIter)[0])) == 0) { m_expandTerms.insert(*termIter); ++count; } } } } catch (const Xapian::Error &error) { cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl; } // Be tolerant of errors as long as we got some results if ((completedQuery == true) || (m_resultsList.empty() == false)) { return true; } return false; }
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query, const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps) { Timer timer; unsigned int maxResultsCount = queryProps.getMaximumResultsCount(); bool completedQuery = false; if (pIndex == NULL) { return false; } // Start an enquire session on the database Xapian::Enquire enquire(*pIndex); timer.start(); try { AbstractGenerator abstractGen(pIndex, 50); vector<string> seedTerms; // Give the query object to the enquire session enquire.set_query(query); // How should results be sorted ? if (queryProps.getSortOrder() == QueryProperties::RELEVANCE) { // By relevance, only enquire.set_sort_by_relevance_then_value(4); #ifdef DEBUG cout << "XapianEngine::queryDatabase: sorting by relevance first" << endl; #endif } else if (queryProps.getSortOrder() == QueryProperties::DATE) { // By date, and then by relevance enquire.set_sort_by_value_then_relevance(4); #ifdef DEBUG cout << "XapianEngine::queryDatabase: sorting by date and time first" << endl; #endif } // Get the top results of the query Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1); m_resultsCountEstimate = matches.get_matches_estimated(); if (matches.empty() == false) { #ifdef DEBUG cout << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount << " results found from position " << startDoc << endl; cout << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound() << "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound() << endl; #endif // Get the results for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter) { Xapian::docid docId = *mIter; Xapian::Document doc(mIter.get_document()); // What terms did this document match ? 
seedTerms.clear(); for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId); termIter != enquire.get_matching_terms_end(docId); ++termIter) { char firstChar = (*termIter)[0]; if (isupper(((int)firstChar)) == 0) { seedTerms.push_back(*termIter); #ifdef DEBUG cout << "XapianEngine::queryDatabase: matched term " << *termIter << endl; #endif } else if (firstChar == 'Z') { string stemmed((*termIter).substr(1)); string::size_type stemmedLen = stemmed.length(); // Which of this document's terms stem to this ? Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId); if (docTermIter != pIndex->termlist_end(docId)) { for (docTermIter.skip_to(stemmed); docTermIter != pIndex->termlist_end(docId); ++docTermIter) { // Is this a potential unstem ? if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0) { // No, no point looking at the next terms break; } #ifdef DEBUG cout << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl; #endif // FIXME: check this term stems to stemmed ! seedTerms.push_back(*docTermIter); } } } } DocumentInfo thisResult; thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms)); thisResult.setScore((float)mIter.get_percent()); #ifdef DEBUG cout << "XapianEngine::queryDatabase: found document ID " << docId << endl; #endif XapianDatabase::recordToProps(doc.get_data(), &thisResult); // XapianDatabase stored the language in English thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage())); string url(thisResult.getLocation()); if (url.empty() == true) { // Hmmm this shouldn't be empty... 
// Use this instead, even though the document isn't cached in the index thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId)); } // We don't know the index ID, just the document ID thisResult.setIsIndexed(0, docId); // Add this result m_resultsList.push_back(thisResult); } } completedQuery = true; } catch (const Xapian::Error &error) { cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl; } cout << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl; try { m_expandTerms.clear(); // Expand the query ? if (m_expandDocuments.empty() == false) { Xapian::RSet expandDocs; for (set<string>::const_iterator docIter = m_expandDocuments.begin(); docIter != m_expandDocuments.end(); ++docIter) { string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true)); // Only one document may have this term Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm); if (postingIter != pIndex->postlist_end(uniqueTerm)) { expandDocs.add_document(*postingIter); } } #ifdef DEBUG cout << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl; #endif // Get 10 non-prefixed terms string allowedPrefixes("RS"); TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer), FileStopper::get_stopper(Languages::toCode(stemLanguage)), allowedPrefixes, query); Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider); #ifdef DEBUG cout << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl; #endif for (Xapian::ESetIterator termIter = expandTerms.begin(); termIter != expandTerms.end(); ++termIter) { string expandTerm(*termIter); char firstChar = expandTerm[0]; // Is this prefixed ? 
if (allowedPrefixes.find(firstChar) != string::npos) { expandTerm.erase(0, 1); } m_expandTerms.insert(expandTerm); } } } catch (const Xapian::Error &error) { cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl; } // Be tolerant of errors as long as we got some results if ((completedQuery == true) || (m_resultsList.empty() == false)) { return true; } return false; }
// Runs a query against the database named m_databaseName.
//
// The database is obtained from XapianDatabaseFactory, reopened and
// read-locked for the duration of the search. Every match's record is
// parsed for the caption=, url=, type=, language= and sample= fields and
// a Result is appended to m_resultsList; if there is no sample, an abstract
// is generated from the query's terms. If m_relevantDocuments is not empty,
// m_expandTerms is then refilled with up to 10 non-prefixed expansion terms.
//
// query: the query handed to the enquire session.
//
// Returns true only if the search and the expansion both ran without a
// Xapian error; false if the database or its index couldn't be obtained,
// or if Xapian threw. Xapian errors are reported on cerr and never
// propagate. Results appended before an error stay in m_resultsList.
bool XapianEngine::queryDatabase(Xapian::Query &query)
{
	bool bStatus = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true);
	if (pDatabase == NULL)
	{
		return false;
	}

	// Get the latest revision...
	pDatabase->reopen();
	Xapian::Database *pIndex = pDatabase->readLock();
	if (pIndex != NULL)
	{
		try
		{
			// Start an enquire session on the database
			Xapian::Enquire enquire(*pIndex);

			// Give the query object to the enquire session
			enquire.set_query(query);

			// Get the top results of the query
			Xapian::MSet matches = enquire.get_mset(0, m_maxResultsCount);
			if (!matches.empty())
			{
				multimap<Xapian::weight, string> queryTerms;
				vector<string> seedTerms;
				Xapian::weight maxWeight = matches.get_max_attained();

				// Sort query terms by weight
				// Keying on (maxWeight - weight) makes the multimap's ascending
				// order list the heaviest terms first
				for (Xapian::TermIterator termIter = query.get_terms_begin();
					termIter != query.get_terms_end(); ++termIter)
				{
					string termName(*termIter);
					Xapian::weight termWeight = maxWeight - matches.get_termweight(termName);

					queryTerms.insert(pair<Xapian::weight, string>(termWeight, termName));
#ifdef DEBUG
					cout << "XapianEngine::queryDatabase: term " << termName << " has weight " << matches.get_termweight(termName) << "/" << maxWeight << endl;
#endif
				}

				for (multimap<Xapian::weight, string>::iterator weightIter = queryTerms.begin();
					weightIter != queryTerms.end(); ++weightIter)
				{
					seedTerms.push_back(weightIter->second);
				}

				// Get the results
#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: " << matches.get_matches_estimated() << "/" << m_maxResultsCount << " results found" << endl;
#endif
				for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
				{
					Xapian::docid docId = *mIter;
					Xapian::Document doc(mIter.get_document());
					string record = doc.get_data();

					// Get the title
					string title = StringManip::extractField(record, "caption=", "\n");
#ifdef DEBUG
					cout << "XapianEngine::queryDatabase: found omindex title " << title << endl;
#endif

					// Get the URL
					string url = StringManip::extractField(record, "url=", "\n");
					if (url.empty())
					{
						// Hmmm this shouldn't be empty...
						// Use this instead, even though the document isn't cached in the index
						url = XapianDatabase::buildUrl(m_databaseName, *mIter);
					}
					else
					{
#ifdef DEBUG
						cout << "XapianEngine::queryDatabase: found omindex URL " << url << endl;
#endif
						url = Url::canonicalizeUrl(url);
					}

					// Get the type
					// NOTE(review): extracted but not passed on to the Result below
					string type = StringManip::extractField(record, "type=", "\n");
					// ...and the language, if available
					string language = StringManip::extractField(record, "language=", "\n");

					// Finally, get a summary
					string summary = StringManip::extractField(record, "sample=", "\n");
					if (summary.empty())
					{
						AbstractGenerator abstractGen(pIndex, 50);

						// Generate an abstract based on the query's terms
						summary = abstractGen.generateAbstract(seedTerms, docId);
					}

					// Add this result
					Result thisResult(url, title, summary, language, (float)mIter.get_percent());
					m_resultsList.push_back(thisResult);
				}
			}

			m_expandTerms.clear();

			// Expand the query ?
			if (!m_relevantDocuments.empty())
			{
				Xapian::RSet relevantDocs;
				unsigned int count = 0;

				for (set<unsigned int>::const_iterator docIter = m_relevantDocuments.begin();
					docIter != m_relevantDocuments.end(); ++docIter)
				{
					relevantDocs.add_document(*docIter);
				}

				// Get 10 non-prefixed terms
				// Ask for 20 candidates since those starting with an upper-case letter are skipped
				Xapian::ESet expandTerms = enquire.get_eset(20, relevantDocs);
				for (Xapian::ESetIterator termIter = expandTerms.begin();
					(termIter != expandTerms.end()) && (count < 10); ++termIter)
				{
					const string expandTerm(*termIter);

					// Go through unsigned char: terms may hold bytes >= 0x80 and
					// passing a negative value to isupper() is undefined behaviour
					if (isupper(static_cast<unsigned char>(expandTerm[0])) == 0)
					{
						m_expandTerms.insert(expandTerm);
						++count;
					}
				}
			}

			bStatus = true;
		}
		catch (const Xapian::Error &error)
		{
			cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl;
		}
	}
	// Called whether or not readLock() returned an index
	pDatabase->unlock();

	return bStatus;
}