Пример #1
0
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	CJKVTokenizer tokenizer;
	string freeQuery(queryProps.getFreeQuery());
	unsigned int tokensCount = 1;
	bool diacriticSensitive = queryProps.getDiacriticSensitive();

	// Modifying the query is necessary if it's CJKV or diacritics are off
	if ((tokenizer.has_cjkv(freeQuery) == true) ||
		(diacriticSensitive == false))
	{
		QueryModifier handler(freeQuery,
			diacriticSensitive,
			tokenizer.get_ngram_size());

		tokenizer.tokenize(freeQuery, handler, true);

		tokensCount = handler.get_tokens_count();

		// We can disable stemming and spelling correction for pure CJKV queries
		string cjkvQuery(handler.get_modified_query(minimal));
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
	}
	else
	{
		string::size_type spacePos = freeQuery.find(' ');
		while (spacePos != string::npos)
		{
			++tokensCount;

			if (spacePos + 1 >= freeQuery.length())
			{
				break;
			}

			// Next
			spacePos = freeQuery.find(' ', spacePos + 1);
		}
	}
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << tokensCount << " tokens" << endl;
#endif

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
		parser.set_stemmer(m_stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

		// Don't bother loading the stopwords list if there's only one token
		if (tokensCount > 1)
		{
			FileStopper *pStopper = FileStopper::get_stopper(Languages::toCode(stemLanguage));
			if ((pStopper != NULL) &&
				(pStopper->get_stopwords_count() > 0))
			{
				parser.set_stopper(pStopper);
			}
		}
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}
	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}
#if XAPIAN_NUM_VERSION >= 1000004
	// Search across text body and title
	parser.add_prefix("", "");
	parser.add_prefix("", "S");
#endif
	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("inurl", "XFILE:");
	parser.add_prefix("path", "XPATH:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes 
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;
		bool parsedQuery = false;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			parsedQuery = pParser->parse(freeQuery, builder);

			delete pParser;
		}

		if (parsedQuery == true)
		{
			return builder.get_query();
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			break;
		}

		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
				escapedValue);
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}
Пример #2
0
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	Xapian::Stem stemmer;
	CJKVTokenizer tokenizer;
	string freeQuery(StringManip::replaceSubString(queryProps.getFreeQuery(), "\n", " "));
	unsigned int minDay, minMonth, minYear = 0;
	unsigned int maxDay, maxMonth, maxYear = 0;

	if (tokenizer.has_cjkv_only(freeQuery) == true)
	{
		vector<string> tokens;
		string cjkvQuery;

		tokenizer.tokenize(freeQuery, tokens);

		// Get the terms
		for (vector<string>::const_iterator tokenIter = tokens.begin();
			tokenIter != tokens.end(); ++tokenIter)
		{
			cjkvQuery += *tokenIter;
			cjkvQuery += " ";
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
		// We can disable stemming and spelling correction
		minimal = true;
	}

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << stemLanguage << " stemming" << endl;
#endif
		try
		{
			stemmer = Xapian::Stem(StringManip::toLowerCase(stemLanguage));
		}
		catch (const Xapian::Error &error)
		{
			cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
		}
		parser.set_stemmer(stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}
	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}
	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes 
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			bool parsedQuery = pParser->parse(freeQuery, builder);

			delete pParser;

			if (parsedQuery == true)
			{
				return builder.get_query();
			}
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			break;
		}

		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
				escapedValue);
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE|
		Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{

#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}