Beispiel #1
0
bool XapianDocument::removeTermStartsWith(const QByteArray &prefix)
{
    bool modified = false;

    Xapian::TermIterator it = m_doc.termlist_begin();
    it.skip_to(prefix.constData());
    while (it != m_doc.termlist_end()) {
        const std::string t = *it;
        const QByteArray term = QByteArray::fromRawData(t.c_str(), t.size());
        if (!term.startsWith(prefix)) {
            break;
        }

        // The term should not just be the prefix
        if (term.size() <= prefix.size()) {
            break;
        }

        // The term should not contain any more upper case letters
        if (isupper(term.at(prefix.size()))) {
            ++it;
            continue;
        }

        ++it;
        m_doc.remove_term(t);
        modified = true;
    }

    return modified;
}
/// Suggests terms.
unsigned int XapianIndex::getCloseTerms(const string &term, set<string> &suggestions)
{
	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return 0;
	}

	suggestions.clear();
	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::TermIterator termIter = pIndex->allterms_begin();

			if (termIter != pIndex->allterms_end())
			{
				string baseTerm(StringManip::toLowerCase(term));
				unsigned int count = 0;

				// Get the next 10 terms
				termIter.skip_to(baseTerm);
				while ((termIter != pIndex->allterms_end()) &&
					(count < 10))
				{
					string suggestedTerm(*termIter);

					if (suggestedTerm.find(baseTerm) != 0)
					{
						// This term doesn't have the same root
						break;
					}
					suggestions.insert(*termIter);

					// Next
					++count;
					++termIter;
				}
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get terms: " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get terms, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return suggestions.size();
}
/// Returns a document's labels.
bool XapianIndex::getDocumentLabels(unsigned int docId, set<string> &labels) const
{
	bool gotLabels = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	labels.clear();
	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
			if (termIter != pIndex->termlist_end(docId))
			{
				for (termIter.skip_to("XLABEL:");
					termIter != pIndex->termlist_end(docId); ++termIter)
				{
					if ((*termIter).length() < 7)
					{
						break;
					}

					// Is this a label ?
					if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0)
					{
						labels.insert((*termIter).substr(7));
					}
				}
				gotLabels = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get document's labels, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return gotLabels;
}
Beispiel #4
0
QString XapianDocument::fetchTermStartsWith(const QByteArray &term)
{
    try {
        Xapian::TermIterator it = m_doc.termlist_begin();
        it.skip_to(term.constData());

        if (it == m_doc.termlist_end()) {
            return QString();
        }
        std::string str = *it;
        return QString::fromUtf8(str.c_str(), str.length());
    } catch (const Xapian::Error &) {
        return QString();
    }
}
static void removeFirstPosting(Xapian::Document &doc,
                               Xapian::TermIterator &termListIter, const string &term)
{
    termListIter.skip_to(term);

    Xapian::PositionIterator firstPosIter = termListIter.positionlist_begin();
    if (firstPosIter != termListIter.positionlist_end())
    {
        try
        {
            doc.remove_posting(term, *firstPosIter);
        }
        catch (const Xapian::Error &error)
        {
            // This posting may have been removed already
#ifdef DEBUG
            cout << "XapianIndex::removeFirstPosting: " << error.get_msg() << endl;
#endif
        }
    }
}
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query,
	const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps)
{
	Timer timer;
	unsigned int maxResultsCount = queryProps.getMaximumResultsCount();
	bool completedQuery = false;

	if (pIndex == NULL)
	{
		return false;
	}

	// Start an enquire session on the database
	Xapian::Enquire enquire(*pIndex);

	timer.start();
	try
	{
		AbstractGenerator abstractGen(pIndex, 50);
		vector<string> seedTerms;

		// Give the query object to the enquire session
		enquire.set_query(query);
		// How should results be sorted ?
		if (queryProps.getSortOrder() == QueryProperties::RELEVANCE)
		{
			// By relevance, only
			enquire.set_sort_by_relevance_then_value(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by relevance first" << endl;
#endif
		}
		else if (queryProps.getSortOrder() == QueryProperties::DATE)
		{
			// By date, and then by relevance
			enquire.set_sort_by_value_then_relevance(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by date and time first" << endl;
#endif
		}

		// Get the top results of the query
		Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1);
		m_resultsCountEstimate = matches.get_matches_estimated();
		if (matches.empty() == false)
		{
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount
				<< " results found from position " << startDoc << endl;
			cout << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound()
				<< "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound() << endl;
#endif

			// Get the results
			for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
			{
				Xapian::docid docId = *mIter;
				Xapian::Document doc(mIter.get_document());

				// What terms did this document match ?
				seedTerms.clear();
				for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
					termIter != enquire.get_matching_terms_end(docId); ++termIter)
				{
					char firstChar = (*termIter)[0];

					if (isupper(((int)firstChar)) == 0)
					{
						seedTerms.push_back(*termIter);
#ifdef DEBUG
						cout << "XapianEngine::queryDatabase: matched term " << *termIter << endl;
#endif
					}
					else if (firstChar == 'Z')
					{
						string stemmed((*termIter).substr(1));
						string::size_type stemmedLen = stemmed.length();

						// Which of this document's terms stem to this ?
						Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId);
						if (docTermIter != pIndex->termlist_end(docId))
						{
							for (docTermIter.skip_to(stemmed);
								docTermIter != pIndex->termlist_end(docId); ++docTermIter)
							{
								// Is this a potential unstem ?
								if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0)
								{
									// No, no point looking at the next terms
									break;
								}
#ifdef DEBUG
								cout << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl;
#endif

								// FIXME: check this term stems to stemmed !
								seedTerms.push_back(*docTermIter); 
							}
						}
					}
				}

				DocumentInfo thisResult;
				thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
				thisResult.setScore((float)mIter.get_percent());

#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: found document ID " << docId << endl;
#endif
				XapianDatabase::recordToProps(doc.get_data(), &thisResult);
				// XapianDatabase stored the language in English
				thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage()));

				string url(thisResult.getLocation());
				if (url.empty() == true)
				{
					// Hmmm this shouldn't be empty...
					// Use this instead, even though the document isn't cached in the index
					thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
				}

				// We don't know the index ID, just the document ID
				thisResult.setIsIndexed(0, docId);

				// Add this result
				m_resultsList.push_back(thisResult);
			}
		}

		completedQuery = true;
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	cout << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl;

	try
	{
		m_expandTerms.clear();

		// Expand the query ?
		if (m_expandDocuments.empty() == false)
		{
			Xapian::RSet expandDocs;

			for (set<string>::const_iterator docIter = m_expandDocuments.begin();
				docIter != m_expandDocuments.end(); ++docIter)
			{
				string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true));

				// Only one document may have this term
				Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm);
				if (postingIter != pIndex->postlist_end(uniqueTerm))
				{
					expandDocs.add_document(*postingIter);
				}
			}
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl;
#endif

			// Get 10 non-prefixed terms
			string allowedPrefixes("RS");
			TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer),
				FileStopper::get_stopper(Languages::toCode(stemLanguage)),
				allowedPrefixes, query);
			Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl;
#endif
			for (Xapian::ESetIterator termIter = expandTerms.begin();
				termIter != expandTerms.end(); ++termIter)
			{
				string expandTerm(*termIter);
				char firstChar = expandTerm[0];

				// Is this prefixed ?
				if (allowedPrefixes.find(firstChar) != string::npos)
				{
					expandTerm.erase(0, 1);
				}

				m_expandTerms.insert(expandTerm);
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}

	// Be tolerant of errors as long as we got some results
	if ((completedQuery == true) ||
		(m_resultsList.empty() == false))
	{
		return true;
	}

	return false;
}
/// Gets terms with the same root.
unsigned int XapianIndex::getCloseTerms(const string &term, set<string> &suggestions)
{
	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return 0;
	}

	suggestions.clear();
	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::TermIterator termIter = pIndex->allterms_begin();

			if (termIter != pIndex->allterms_end())
			{
				string baseTerm(StringManip::toLowerCase(term));
				unsigned int count = 0;
				bool isUpper = false;

				if (isupper((int)term[0]) != 0)
				{
					// R-prefix the term
					baseTerm = string("R") + term;
					isUpper = true;
				}

				// Get the next 10 terms
				for (termIter.skip_to(baseTerm);
					(termIter != pIndex->allterms_end()) && (count < 10); ++termIter)
				{
					string suggestedTerm(*termIter);

					if (suggestedTerm.find(baseTerm) != 0)
					{
						// This term doesn't have the same root
						if (isUpper == true)
						{
							// Try again without capital letters
							baseTerm = StringManip::toLowerCase(term);
							termIter = pIndex->allterms_begin();
							if (termIter != pIndex->allterms_end())
							{
								termIter.skip_to(baseTerm);
								isUpper = false;
								continue;
							}
						}

						break;
					}

					if (isUpper == true)
					{
						// Remove the R prefix
						suggestedTerm.erase(0, 1);
					}

					suggestions.insert(suggestedTerm);
					++count;
				}
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get terms: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get terms, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return suggestions.size();
}
/// Sets a document's labels.
bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels,
	bool resetLabels)
{
	bool updatedLabels = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Reset existing labels ?
			if (resetLabels == true)
			{
				Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
				if (termIter != pIndex->termlist_end(docId))
				{
					for (termIter.skip_to("XLABEL:");
						termIter != pIndex->termlist_end(docId); ++termIter)
					{
						// Is this a label ?
						if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0)
						{
							doc.remove_term(*termIter);
						}
					}
				}
			}

			// Set new labels
			for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
				++labelIter)
			{
				if (labelIter->empty() == false)
				{
					doc.add_term(limitTermLength(string("XLABEL:") + *labelIter));
				}
			}

			pIndex->replace_document(docId, doc);
			updatedLabels = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document's labels, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updatedLabels;
}