Beispiel #1
0
void DataSystem::query(const std::string q)
{
	try {
		// Start an enquire session.
		Xapian::Enquire enquire(db);

		Xapian::Query query = qp.parse_query(q);
		printf("Parsed query is: %s", query.get_description().c_str());

		// Find the top 10 results for the query.
		enquire.set_query(query);
		Xapian::MSet matches = enquire.get_mset(0, 10);

		// Display the results.
		printf("%d results found.\n", matches.get_matches_estimated());
		printf("Matches 1-$d:\n", matches.size());

		for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
	{
			printf("%d: %d\% docid= [%s]\n\n",
				   i.get_rank() + 1,
				   i.get_percent(),
				   i.get_document().get_data().c_str());
		}
	} catch (const Xapian::Error &e)
	{
		printf("DataSystem, query error: %s", e.get_description());
	}
}
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	CJKVTokenizer tokenizer;
	string freeQuery(queryProps.getFreeQuery());
	unsigned int tokensCount = 1;
	bool diacriticSensitive = queryProps.getDiacriticSensitive();

	// Modifying the query is necessary if it's CJKV or diacritics are off
	if ((tokenizer.has_cjkv(freeQuery) == true) ||
		(diacriticSensitive == false))
	{
		QueryModifier handler(freeQuery,
			diacriticSensitive,
			tokenizer.get_ngram_size());

		tokenizer.tokenize(freeQuery, handler, true);

		tokensCount = handler.get_tokens_count();

		// We can disable stemming and spelling correction for pure CJKV queries
		string cjkvQuery(handler.get_modified_query(minimal));
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
	}
	else
	{
		string::size_type spacePos = freeQuery.find(' ');
		while (spacePos != string::npos)
		{
			++tokensCount;

			if (spacePos + 1 >= freeQuery.length())
			{
				break;
			}

			// Next
			spacePos = freeQuery.find(' ', spacePos + 1);
		}
	}
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << tokensCount << " tokens" << endl;
#endif

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
		parser.set_stemmer(m_stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

		// Don't bother loading the stopwords list if there's only one token
		if (tokensCount > 1)
		{
			FileStopper *pStopper = FileStopper::get_stopper(Languages::toCode(stemLanguage));
			if ((pStopper != NULL) &&
				(pStopper->get_stopwords_count() > 0))
			{
				parser.set_stopper(pStopper);
			}
		}
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}
	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}
#if XAPIAN_NUM_VERSION >= 1000004
	// Search across text body and title
	parser.add_prefix("", "");
	parser.add_prefix("", "S");
#endif
	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("inurl", "XFILE:");
	parser.add_prefix("path", "XPATH:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes 
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;
		bool parsedQuery = false;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			parsedQuery = pParser->parse(freeQuery, builder);

			delete pParser;
		}

		if (parsedQuery == true)
		{
			return builder.get_query();
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			break;
		}

		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
				escapedValue);
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}
/// Runs a query; true if success.
bool XapianEngine::runQuery(QueryProperties& queryProps)
{
	// Clear the results list
	m_resultsList.clear();

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true);
	if (pDatabase == NULL)
	{
		return false;
	}

	// Get the latest revision...
	pDatabase->reopen();
	Xapian::Database *pIndex = pDatabase->readLock();
	try
	{
		string stemLanguage;
		unsigned int searchStep = 1;
		bool followOperators = true;

		// Searches are run in this order :
		// 1. follow operators and don't stem terms
		// 2. if no results, follow operators and stem terms
		// 3. if no results, don't follow operators and don't stem terms
		// 4. if no results, don't follow operators and stem terms
		// Steps 2 and 4 depend on a language being defined for the query
		Xapian::Query fullQuery = parseQuery(pIndex, queryProps, "", followOperators);
		while (fullQuery.empty() == false)
		{
#ifdef DEBUG
			cout << "XapianEngine::runQuery: " << fullQuery.get_description() << endl;
#endif
			// Query the database
			if (queryDatabase(pIndex, fullQuery) == false)
			{
				break;
			}

			if (m_resultsList.empty() == true)
			{
				// The search did succeed but didn't return anything
				// Try the next step
				switch (++searchStep)
				{
					case 2:
						followOperators = true;
						stemLanguage = queryProps.getLanguage();
						if (stemLanguage.empty() == false)
						{
							break;
						}
						++searchStep;
					case 3:
						followOperators = false;
						stemLanguage.clear();
						break;
					case 4:
						followOperators = false;
						stemLanguage = queryProps.getLanguage();
						if (stemLanguage.empty() == false)
						{
							break;
						}
						++searchStep;
					default:
						pDatabase->unlock();
						return true;
				}

#ifdef DEBUG
				cout << "XapianEngine::runQuery: trying step " << searchStep << endl;
#endif
				fullQuery = parseQuery(pIndex, queryProps,
					Languages::toEnglish(stemLanguage), followOperators);
				continue;
			}

			pDatabase->unlock();
			return true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "XapianEngine::runQuery: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	pDatabase->unlock();

	return false;
}
Beispiel #4
0
int main(int argc, char **argv)
{
    if(argc < 2) {
        usage(argv);
        return 1;
    }

    try {
        char *action = argv[1];
        char *db_path = argv[2];

        if(!strcmp(action, "index")) {
            Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN);

            Xapian::TermGenerator indexer;
            Xapian::Stem stemmer("english");
            indexer.set_stemmer(stemmer);

            std::string doc_txt;
            while(true) {
                if(std::cin.eof()) break;

                std::string line;
                getline(std::cin, line);
                doc_txt += line;
            }

            if(!doc_txt.empty()) {
                Xapian::Document doc;
                doc.set_data(doc_txt);

                indexer.set_document(doc);
                indexer.index_text(doc_txt);

                db.add_document(doc);

                std::cout << "Indexed: " << indexer.get_description() << std::endl;
            }

            db.commit();
        } else if(!strcmp(action, "search")) {
            if(argc < 4) {
                std::cerr << "You must supply a query string" << std::endl;
                return 1;
            }

            Xapian::Database db(db_path);
            Xapian::Enquire enquire(db);

            std::string query_str = argv[3];
            argv+= 4;
            while(*argv) {
                query_str += ' ';
                query_str += *argv++;
            }

            Xapian::QueryParser qp;
            Xapian::Stem stemmer("english");
            qp.set_stemmer(stemmer);
            qp.set_database(db);
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

            Xapian::Query query = qp.parse_query(query_str);
            std::cout << "Parsed query is: " << query.get_description() <<
                         std::endl;

            enquire.set_query(query);
            Xapian::MSet matches = enquire.get_mset(0, 10);

            std::cout << matches.get_matches_estimated() << " results found.\n";
            std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl;

            for (Xapian::MSetIterator i = matches.begin();
                    i != matches.end(); ++i) {
                std::cout << i.get_rank() + 1 << ": " << i.get_percent() <<
                        "% docid=" << *i << " [" <<
                        i.get_document().get_data()<< "]" << std::endl <<
                        std::endl;
            }
        } else {
            std::cerr << "Invalid action " << action << std::endl;
            usage(argv);
            return 1;
        }

    } catch (const Xapian::Error &error) {
        std::cout << "Exception: " << error.get_msg() << std::endl;
    }
}
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	Xapian::Stem stemmer;
	CJKVTokenizer tokenizer;
	string freeQuery(StringManip::replaceSubString(queryProps.getFreeQuery(), "\n", " "));
	unsigned int minDay, minMonth, minYear = 0;
	unsigned int maxDay, maxMonth, maxYear = 0;

	if (tokenizer.has_cjkv_only(freeQuery) == true)
	{
		vector<string> tokens;
		string cjkvQuery;

		tokenizer.tokenize(freeQuery, tokens);

		// Get the terms
		for (vector<string>::const_iterator tokenIter = tokens.begin();
			tokenIter != tokens.end(); ++tokenIter)
		{
			cjkvQuery += *tokenIter;
			cjkvQuery += " ";
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
		// We can disable stemming and spelling correction
		minimal = true;
	}

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << stemLanguage << " stemming" << endl;
#endif
		try
		{
			stemmer = Xapian::Stem(StringManip::toLowerCase(stemLanguage));
		}
		catch (const Xapian::Error &error)
		{
			cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
		}
		parser.set_stemmer(stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}
	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}
	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes 
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			bool parsedQuery = pParser->parse(freeQuery, builder);

			delete pParser;

			if (parsedQuery == true)
			{
				return builder.get_query();
			}
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			break;
		}

		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
				escapedValue);
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE|
		Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{

#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}