Exemplo n.º 1
0
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query)
{
	bool completedQuery = false;

	if (pIndex == NULL)
	{
		return false;
	}

	// Start an enquire session on the database
	Xapian::Enquire enquire(*pIndex);

	try
	{
		AbstractGenerator abstractGen(pIndex, 50);
		vector<string> seedTerms;

		// Give the query object to the enquire session
		enquire.set_query(query);

		// Get the top results of the query
		Xapian::MSet matches = enquire.get_mset(0, m_maxResultsCount);
		if (matches.empty() == false)
		{
			// Get the results
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: " << matches.get_matches_estimated()
				<< "/" << m_maxResultsCount << " results found" << endl;
#endif
			for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
			{
				Xapian::docid docId = *mIter;
				Xapian::Document doc(mIter.get_document());

				// What terms did this document match ?
				seedTerms.clear();
				for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
					termIter != enquire.get_matching_terms_end(docId); ++termIter)
				{
					seedTerms.push_back(*termIter);
				}

				DocumentInfo thisResult;
				thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
				thisResult.setScore((float)mIter.get_percent());

#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: found document ID " << docId << endl;
#endif
				XapianDatabase::recordToProps(doc.get_data(), &thisResult);

				string url(thisResult.getLocation());
				if (url.empty() == true)
				{
					// Hmmm this shouldn't be empty...
					// Use this instead, even though the document isn't cached in the index
					thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
				}

				// We don't know the index ID, just the document ID
				thisResult.setIsIndexed(0, docId);

				// Add this result
				m_resultsList.push_back(thisResult);
			}
		}

		completedQuery = true;
	}
	catch (const Xapian::Error &error)
	{
		cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl;
	}

	try
	{
		m_expandTerms.clear();

		// Expand the query ?
		if (m_relevantDocuments.empty() == false)
		{
			Xapian::RSet relevantDocs;
			unsigned int count = 0;

			for (set<unsigned int>::const_iterator docIter = m_relevantDocuments.begin();
				docIter != m_relevantDocuments.end(); ++docIter)
			{
				relevantDocs.add_document(*docIter);
			}

			// Get 10 non-prefixed terms
			Xapian::ESet expandTerms = enquire.get_eset(20, relevantDocs);
			for (Xapian::ESetIterator termIter = expandTerms.begin();
				(termIter != expandTerms.end()) && (count < 10); ++termIter)
			{
				if (isupper((int)((*termIter)[0])) == 0)
				{
					m_expandTerms.insert(*termIter);
					++count;
				}
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl;
	}

	// Be tolerant of errors as long as we got some results
	if ((completedQuery == true) ||
		(m_resultsList.empty() == false))
	{
		return true;
	}

	return false;
}
Exemplo n.º 2
0
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query,
	const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps)
{
	Timer timer;
	unsigned int maxResultsCount = queryProps.getMaximumResultsCount();
	bool completedQuery = false;

	if (pIndex == NULL)
	{
		return false;
	}

	// Start an enquire session on the database
	Xapian::Enquire enquire(*pIndex);

	timer.start();
	try
	{
		AbstractGenerator abstractGen(pIndex, 50);
		vector<string> seedTerms;

		// Give the query object to the enquire session
		enquire.set_query(query);
		// How should results be sorted ?
		if (queryProps.getSortOrder() == QueryProperties::RELEVANCE)
		{
			// By relevance, only
			enquire.set_sort_by_relevance_then_value(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by relevance first" << endl;
#endif
		}
		else if (queryProps.getSortOrder() == QueryProperties::DATE)
		{
			// By date, and then by relevance
			enquire.set_sort_by_value_then_relevance(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by date and time first" << endl;
#endif
		}

		// Get the top results of the query
		Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1);
		m_resultsCountEstimate = matches.get_matches_estimated();
		if (matches.empty() == false)
		{
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount
				<< " results found from position " << startDoc << endl;
			cout << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound()
				<< "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound() << endl;
#endif

			// Get the results
			for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
			{
				Xapian::docid docId = *mIter;
				Xapian::Document doc(mIter.get_document());

				// What terms did this document match ?
				seedTerms.clear();
				for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
					termIter != enquire.get_matching_terms_end(docId); ++termIter)
				{
					char firstChar = (*termIter)[0];

					if (isupper(((int)firstChar)) == 0)
					{
						seedTerms.push_back(*termIter);
#ifdef DEBUG
						cout << "XapianEngine::queryDatabase: matched term " << *termIter << endl;
#endif
					}
					else if (firstChar == 'Z')
					{
						string stemmed((*termIter).substr(1));
						string::size_type stemmedLen = stemmed.length();

						// Which of this document's terms stem to this ?
						Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId);
						if (docTermIter != pIndex->termlist_end(docId))
						{
							for (docTermIter.skip_to(stemmed);
								docTermIter != pIndex->termlist_end(docId); ++docTermIter)
							{
								// Is this a potential unstem ?
								if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0)
								{
									// No, no point looking at the next terms
									break;
								}
#ifdef DEBUG
								cout << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl;
#endif

								// FIXME: check this term stems to stemmed !
								seedTerms.push_back(*docTermIter); 
							}
						}
					}
				}

				DocumentInfo thisResult;
				thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
				thisResult.setScore((float)mIter.get_percent());

#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: found document ID " << docId << endl;
#endif
				XapianDatabase::recordToProps(doc.get_data(), &thisResult);
				// XapianDatabase stored the language in English
				thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage()));

				string url(thisResult.getLocation());
				if (url.empty() == true)
				{
					// Hmmm this shouldn't be empty...
					// Use this instead, even though the document isn't cached in the index
					thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
				}

				// We don't know the index ID, just the document ID
				thisResult.setIsIndexed(0, docId);

				// Add this result
				m_resultsList.push_back(thisResult);
			}
		}

		completedQuery = true;
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	cout << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl;

	try
	{
		m_expandTerms.clear();

		// Expand the query ?
		if (m_expandDocuments.empty() == false)
		{
			Xapian::RSet expandDocs;

			for (set<string>::const_iterator docIter = m_expandDocuments.begin();
				docIter != m_expandDocuments.end(); ++docIter)
			{
				string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true));

				// Only one document may have this term
				Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm);
				if (postingIter != pIndex->postlist_end(uniqueTerm))
				{
					expandDocs.add_document(*postingIter);
				}
			}
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl;
#endif

			// Get 10 non-prefixed terms
			string allowedPrefixes("RS");
			TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer),
				FileStopper::get_stopper(Languages::toCode(stemLanguage)),
				allowedPrefixes, query);
			Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl;
#endif
			for (Xapian::ESetIterator termIter = expandTerms.begin();
				termIter != expandTerms.end(); ++termIter)
			{
				string expandTerm(*termIter);
				char firstChar = expandTerm[0];

				// Is this prefixed ?
				if (allowedPrefixes.find(firstChar) != string::npos)
				{
					expandTerm.erase(0, 1);
				}

				m_expandTerms.insert(expandTerm);
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}

	// Be tolerant of errors as long as we got some results
	if ((completedQuery == true) ||
		(m_resultsList.empty() == false))
	{
		return true;
	}

	return false;
}
Exemplo n.º 3
0
bool XapianEngine::queryDatabase(Xapian::Query &query)
{
	bool bStatus = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true);
	if (pDatabase == NULL)
	{
		return false;
	}

	// Get the latest revision...
	pDatabase->reopen();
	Xapian::Database *pIndex = pDatabase->readLock();
	if (pIndex != NULL)
	{
		try
		{
			// Start an enquire session on the database
			Xapian::Enquire enquire(*pIndex);

			// Give the query object to the enquire session
			enquire.set_query(query);

			// Get the top results of the query
			Xapian::MSet matches = enquire.get_mset(0, m_maxResultsCount);
			if (matches.empty() == false)
			{
				multimap<Xapian::weight, string> queryTerms;
				vector<string> seedTerms;
				Xapian::weight maxWeight = matches.get_max_attained();

				// Sort query terms by weight
				for (Xapian::TermIterator termIter = query.get_terms_begin();
					termIter != query.get_terms_end(); ++termIter)
				{
					string termName(*termIter);
					Xapian::weight termWeight = maxWeight - matches.get_termweight(termName);

					queryTerms.insert(pair<Xapian::weight, string>(termWeight, termName));
#ifdef DEBUG
					cout << "XapianEngine::queryDatabase: term " << termName
						<< " has weight " << matches.get_termweight(termName) << "/" << maxWeight << endl;
#endif
				}

				for (multimap<Xapian::weight, string>::iterator weightIter = queryTerms.begin();
					weightIter != queryTerms.end(); ++weightIter)
				{
					seedTerms.push_back(weightIter->second);
				}

				// Get the results
#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: " << matches.get_matches_estimated() << "/" << m_maxResultsCount << " results found" << endl;
#endif
				for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
				{
					Xapian::docid docId = *mIter;
					Xapian::Document doc(mIter.get_document());
					string record = doc.get_data();

					// Get the title
					string title = StringManip::extractField(record, "caption=", "\n");
#ifdef DEBUG
					cout << "XapianEngine::queryDatabase: found omindex title " << title << endl;
#endif
					// Get the URL
					string url = StringManip::extractField(record, "url=", "\n");
					if (url.empty() == true)
					{
						// Hmmm this shouldn't be empty...
						// Use this instead, even though the document isn't cached in the index
						url = XapianDatabase::buildUrl(m_databaseName, *mIter);
					}
					else
					{
#ifdef DEBUG
						cout << "XapianEngine::queryDatabase: found omindex URL " << url << endl;
#endif
						url = Url::canonicalizeUrl(url);
					}

					// Get the type
					string type = StringManip::extractField(record, "type=", "\n");
					// ...and the language, if available
					string language = StringManip::extractField(record, "language=", "\n");

					// Finally, get a summary
					string summary = StringManip::extractField(record, "sample=", "\n");
					if (summary.empty() == true)
					{
						AbstractGenerator abstractGen(pIndex, 50);

						// Generate an abstract based on the query's terms
						summary = abstractGen.generateAbstract(seedTerms, docId);
					}

					// Add this result
					Result thisResult(url, title, summary, language, (float)mIter.get_percent());
					m_resultsList.push_back(thisResult);
				}
			}

			m_expandTerms.clear();

			// Expand the query ?
			if (m_relevantDocuments.empty() == false)
			{
				Xapian::RSet relevantDocs;
				unsigned int count = 0;

				for (set<unsigned int>::const_iterator docIter = m_relevantDocuments.begin();
					docIter != m_relevantDocuments.end(); ++docIter)
				{
					relevantDocs.add_document(*docIter);
				}

				// Get 10 non-prefixed terms
				Xapian::ESet expandTerms = enquire.get_eset(20, relevantDocs);
				for (Xapian::ESetIterator termIter = expandTerms.begin();
					(termIter != expandTerms.end()) && (count < 10); ++termIter)
				{
					if (isupper((int)((*termIter)[0])) == 0)
					{
						m_expandTerms.insert(*termIter);
						++count;
					}
				}
			}

			bStatus = true;
		}
		catch (const Xapian::Error &error)
		{
			cerr << "XapianEngine::queryDatabase: " << error.get_type() << ": " << error.get_msg() << endl;
		}
	}
	pDatabase->unlock();

	return bStatus;
}