void DataSystem::query(const std::string q) { try { // Start an enquire session. Xapian::Enquire enquire(db); Xapian::Query query = qp.parse_query(q); printf("Parsed query is: %s", query.get_description().c_str()); // Find the top 10 results for the query. enquire.set_query(query); Xapian::MSet matches = enquire.get_mset(0, 10); // Display the results. printf("%d results found.\n", matches.get_matches_estimated()); printf("Matches 1-$d:\n", matches.size()); for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { printf("%d: %d\% docid= [%s]\n\n", i.get_rank() + 1, i.get_percent(), i.get_document().get_data().c_str()); } } catch (const Xapian::Error &e) { printf("DataSystem, query error: %s", e.get_description()); } }
/**
 * Builds a Xapian query from the given query properties.
 *
 * @param pIndex             database handed to the query parser; may be NULL,
 *                           in which case the parser gets no database.
 * @param queryProps         source of the free query text, the query type and
 *                           the diacritics sensitivity setting.
 * @param stemLanguage       when non-empty and minimal is false, stemming is
 *                           enabled (STEM_SOME with m_stemmer) and, for queries
 *                           with more than one token, a stopwords list is looked
 *                           up for this language.
 * @param defaultOperator    DEFAULT_OP_AND selects OP_AND, anything else OP_OR.
 * @param limitQuery         when non-empty, the query text becomes
 *                           "<limitQuery> AND ( <free query> )".
 * @param correctedFreeQuery output; only assigned when minimal is false and
 *                           ENABLE_XAPIAN_SPELLING_CORRECTION is greater than 0.
 * @param minimal            when true, stemming, wildcards and spelling
 *                           correction are all left off.
 * @return the parsed query. For query types other than XAPIAN_QP, the query
 *         built by the Xesam parser, or an empty Xapian::Query if no parser
 *         is available for the type or parsing failed.
 */
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	CJKVTokenizer tokenizer;
	string freeQuery(queryProps.getFreeQuery());
	unsigned int tokensCount = 1;
	bool diacriticSensitive = queryProps.getDiacriticSensitive();

	// Modifying the query is necessary if it's CJKV or diacritics are off
	if ((tokenizer.has_cjkv(freeQuery) == true) ||
		(diacriticSensitive == false))
	{
		QueryModifier handler(freeQuery,
			diacriticSensitive,
			tokenizer.get_ngram_size());

		tokenizer.tokenize(freeQuery, handler, true);

		tokensCount = handler.get_tokens_count();

		// We can disable stemming and spelling correction for pure CJKV queries
		// NOTE(review): presumably get_modified_query() may update minimal - confirm
		string cjkvQuery(handler.get_modified_query(minimal));
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
	}
	else
	{
		// Count one more token for every space found in the query
		string::size_type spacePos = freeQuery.find(' ');
		while (spacePos != string::npos)
		{
			++tokensCount;

			if (spacePos + 1 >= freeQuery.length())
			{
				break;
			}

			// Next
			spacePos = freeQuery.find(' ', spacePos + 1);
		}
	}
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << tokensCount << " tokens" << endl;
#endif

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
		parser.set_stemmer(m_stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

		// Don't bother loading the stopwords list if there's only one token
		if (tokensCount > 1)
		{
			FileStopper *pStopper = FileStopper::get_stopper(Languages::toCode(stemLanguage));
			if ((pStopper != NULL) &&
				(pStopper->get_stopwords_count() > 0))
			{
				parser.set_stopper(pStopper);
			}
		}
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}

	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}

#if XAPIAN_NUM_VERSION >= 1000004
	// Search across text body and title
	parser.add_prefix("", "");
	parser.add_prefix("", "S");
#endif
	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("inurl", "XFILE:");
	parser.add_prefix("path", "XPATH:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// The range processors are registered by address, so they have to
	// stay in scope until the query has been parsed

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;
		bool parsedQuery = false;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			try
			{
				parsedQuery = pParser->parse(freeQuery, builder);
			}
			catch (...)
			{
				// Don't leak the parser if parsing throws
				delete pParser;
				throw;
			}

			delete pParser;
		}

		if (parsedQuery == true)
		{
			return builder.get_query();
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			// Unterminated quote, leave the rest of the query alone
			break;
		}

		// The text between the two quotes
		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			// Replace everything from the opening quote to the closing quote
			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart, escapedValue);
			// Shift the end position by the change in value length
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}
/// Runs a query; true if success. bool XapianEngine::runQuery(QueryProperties& queryProps) { // Clear the results list m_resultsList.clear(); XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true); if (pDatabase == NULL) { return false; } // Get the latest revision... pDatabase->reopen(); Xapian::Database *pIndex = pDatabase->readLock(); try { string stemLanguage; unsigned int searchStep = 1; bool followOperators = true; // Searches are run in this order : // 1. follow operators and don't stem terms // 2. if no results, follow operators and stem terms // 3. if no results, don't follow operators and don't stem terms // 4. if no results, don't follow operators and stem terms // Steps 2 and 4 depend on a language being defined for the query Xapian::Query fullQuery = parseQuery(pIndex, queryProps, "", followOperators); while (fullQuery.empty() == false) { #ifdef DEBUG cout << "XapianEngine::runQuery: " << fullQuery.get_description() << endl; #endif // Query the database if (queryDatabase(pIndex, fullQuery) == false) { break; } if (m_resultsList.empty() == true) { // The search did succeed but didn't return anything // Try the next step switch (++searchStep) { case 2: followOperators = true; stemLanguage = queryProps.getLanguage(); if (stemLanguage.empty() == false) { break; } ++searchStep; case 3: followOperators = false; stemLanguage.clear(); break; case 4: followOperators = false; stemLanguage = queryProps.getLanguage(); if (stemLanguage.empty() == false) { break; } ++searchStep; default: pDatabase->unlock(); return true; } #ifdef DEBUG cout << "XapianEngine::runQuery: trying step " << searchStep << endl; #endif fullQuery = parseQuery(pIndex, queryProps, Languages::toEnglish(stemLanguage), followOperators); continue; } pDatabase->unlock(); return true; } } catch (const Xapian::Error &error) { cerr << "XapianEngine::runQuery: " << error.get_type() << ": " << error.get_msg() << endl; } pDatabase->unlock(); return false; }
int main(int argc, char **argv) { if(argc < 2) { usage(argv); return 1; } try { char *action = argv[1]; char *db_path = argv[2]; if(!strcmp(action, "index")) { Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN); Xapian::TermGenerator indexer; Xapian::Stem stemmer("english"); indexer.set_stemmer(stemmer); std::string doc_txt; while(true) { if(std::cin.eof()) break; std::string line; getline(std::cin, line); doc_txt += line; } if(!doc_txt.empty()) { Xapian::Document doc; doc.set_data(doc_txt); indexer.set_document(doc); indexer.index_text(doc_txt); db.add_document(doc); std::cout << "Indexed: " << indexer.get_description() << std::endl; } db.commit(); } else if(!strcmp(action, "search")) { if(argc < 4) { std::cerr << "You must supply a query string" << std::endl; return 1; } Xapian::Database db(db_path); Xapian::Enquire enquire(db); std::string query_str = argv[3]; argv+= 4; while(*argv) { query_str += ' '; query_str += *argv++; } Xapian::QueryParser qp; Xapian::Stem stemmer("english"); qp.set_stemmer(stemmer); qp.set_database(db); qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); Xapian::Query query = qp.parse_query(query_str); std::cout << "Parsed query is: " << query.get_description() << std::endl; enquire.set_query(query); Xapian::MSet matches = enquire.get_mset(0, 10); std::cout << matches.get_matches_estimated() << " results found.\n"; std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl; for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { std::cout << i.get_rank() + 1 << ": " << i.get_percent() << "% docid=" << *i << " [" << i.get_document().get_data()<< "]" << std::endl << std::endl; } } else { std::cerr << "Invalid action " << action << std::endl; usage(argv); return 1; } } catch (const Xapian::Error &error) { std::cout << "Exception: " << error.get_msg() << std::endl; } }
/**
 * Builds a Xapian query from the given query properties.
 *
 * Newlines in the free query are replaced with spaces before anything else.
 * A query for which the tokenizer's has_cjkv_only() returns true is replaced
 * by its tokens joined with spaces, and is then handled as if minimal were
 * true.
 *
 * @param pIndex             database handed to the query parser; may be NULL,
 *                           in which case the parser gets no database.
 * @param queryProps         source of the free query text and the query type.
 * @param stemLanguage       when non-empty and minimal is false, a stemmer for
 *                           the lower-cased language name is created and
 *                           stemming is enabled (STEM_SOME). Failure to create
 *                           the stemmer is logged to cerr and not propagated.
 * @param defaultOperator    DEFAULT_OP_AND selects OP_AND, anything else OP_OR.
 * @param limitQuery         when non-empty, the query text becomes
 *                           "<limitQuery> AND ( <free query> )".
 * @param correctedFreeQuery output; only assigned when minimal is false and
 *                           ENABLE_XAPIAN_SPELLING_CORRECTION is greater than 0.
 * @param minimal            when true, stemming, wildcards and spelling
 *                           correction are all left off.
 * @return the parsed query. For query types other than XAPIAN_QP, the query
 *         built by the Xesam parser, or an empty Xapian::Query if no parser
 *         is available for the type or parsing failed.
 */
Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
	const string &stemLanguage, DefaultOperator defaultOperator,
	const string &limitQuery, string &correctedFreeQuery, bool minimal)
{
	Xapian::QueryParser parser;
	Xapian::Stem stemmer;
	CJKVTokenizer tokenizer;
	string freeQuery(StringManip::replaceSubString(queryProps.getFreeQuery(), "\n", " "));

	if (tokenizer.has_cjkv_only(freeQuery) == true)
	{
		vector<string> tokens;
		string cjkvQuery;

		tokenizer.tokenize(freeQuery, tokens);

		// Get the terms
		for (vector<string>::const_iterator tokenIter = tokens.begin();
			tokenIter != tokens.end(); ++tokenIter)
		{
			cjkvQuery += *tokenIter;
			cjkvQuery += " ";
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
#endif

		// Do as if the user had given this as input
		freeQuery = cjkvQuery;
		// We can disable stemming and spelling correction
		minimal = true;
	}

	if (pIndex != NULL)
	{
		// The database is required for wildcards and spelling
		parser.set_database(*pIndex);
	}

	// Set things up
	if ((minimal == false) &&
		(stemLanguage.empty() == false))
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << stemLanguage << " stemming" << endl;
#endif
		try
		{
			stemmer = Xapian::Stem(StringManip::toLowerCase(stemLanguage));
		}
		catch (const Xapian::Error &error)
		{
			// Carry on with the default-constructed stemmer
			cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
		}

		parser.set_stemmer(stemmer);
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
	}
	else
	{
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: no stemming" << endl;
#endif
		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
	}

	// What's the default operator ?
	if (defaultOperator == DEFAULT_OP_AND)
	{
		parser.set_default_op(Xapian::Query::OP_AND);
	}
	else
	{
		parser.set_default_op(Xapian::Query::OP_OR);
	}

	// X prefixes should always include a colon
	parser.add_boolean_prefix("site", "H");
	parser.add_boolean_prefix("file", "P");
	parser.add_boolean_prefix("ext", "XEXT:");
	parser.add_prefix("title", "S");
	parser.add_boolean_prefix("url", "U");
	parser.add_boolean_prefix("dir", "XDIR:");
	parser.add_boolean_prefix("lang", "L");
	parser.add_boolean_prefix("type", "T");
	parser.add_boolean_prefix("class", "XCLASS:");
	parser.add_boolean_prefix("label", "XLABEL:");
	parser.add_boolean_prefix("tokens", "XTOK:");

	// Any limit on what documents should be searched ?
	if (limitQuery.empty() == false)
	{
		string limitedQuery(limitQuery);

		limitedQuery += " AND ( ";
		limitedQuery += freeQuery;
		limitedQuery += " )";
		freeQuery = limitedQuery;
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: " << freeQuery << endl;
#endif
	}

	// The range processors are registered by address, so they have to
	// stay in scope until the query has been parsed

	// Date range
	Xapian::DateValueRangeProcessor dateProcessor(0);
	parser.add_valuerangeprocessor(&dateProcessor);

	// Size with a "b" suffix, ie 1024..10240b
#if XAPIAN_NUM_VERSION >= 1001000
	Xapian::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#elif XAPIAN_NUM_VERSION >= 1000002
	// Xapian 1.02 is the bare minimum
	Xapian::v102::NumberValueRangeProcessor sizeProcessor(2, "b", false);
	parser.add_valuerangeprocessor(&sizeProcessor);
#endif

	// Time range
	TimeValueRangeProcessor timeProcessor(3);
	parser.add_valuerangeprocessor(&timeProcessor);

	// What type of query is this ?
	QueryProperties::QueryType type = queryProps.getType();
	if (type != QueryProperties::XAPIAN_QP)
	{
		map<string, string> fieldMapping;

		// Bare minimum mapping between Xesam fields and our prefixes
		fieldMapping["dc:title"] = "S";

		XapianQueryBuilder builder(parser, fieldMapping);
		XesamParser *pParser = NULL;

		// Get a Xesam parser
		if (type == QueryProperties::XESAM_QL)
		{
			pParser = new XesamQLParser();
		}
#ifdef HAVE_BOOST_SPIRIT_CORE_HPP
		else if (type == QueryProperties::XESAM_UL)
		{
			pParser = new XesamULParser();
		}
#endif

		if (pParser != NULL)
		{
			bool parsedQuery = false;

			try
			{
				parsedQuery = pParser->parse(freeQuery, builder);
			}
			catch (...)
			{
				// Don't leak the parser if parsing throws
				delete pParser;
				throw;
			}

			delete pParser;

			if (parsedQuery == true)
			{
				return builder.get_query();
			}
		}

		return Xapian::Query();
	}

	// Do some pre-processing : look for filters with quoted values
	string::size_type escapedFilterEnd = 0;
	string::size_type escapedFilterStart = freeQuery.find(":\"");
	while ((escapedFilterStart != string::npos) &&
		(escapedFilterStart < freeQuery.length() - 2))
	{
		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
		if (escapedFilterEnd == string::npos)
		{
			// Unterminated quote, leave the rest of the query alone
			break;
		}

		// The text between the two quotes
		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
		if (filterValue.empty() == false)
		{
			string escapedValue(Url::escapeUrl(filterValue));
			bool escapeValue = false, hashValue = false;

			// The value should be escaped and length-limited as done at indexing time
			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);

			if (escapeValue == false)
			{
				// No escaping
				escapedValue = filterValue;
			}
			if (hashValue == true)
			{
				// Partially hash if necessary
				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
			}
			else
			{
				escapedValue = XapianDatabase::limitTermLength(escapedValue);
			}

			// Replace everything from the opening quote to the closing quote
			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart, escapedValue);
			// Shift the end position by the change in value length
			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
		}
		else
		{
			// No value !
			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
			escapedFilterEnd -= 2;
		}
#ifdef DEBUG
		cout << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
#endif

		// Next
		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
	}

	// Parse the query string with all necessary options
	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE|
		Xapian::QueryParser::FLAG_PURE_NOT;
	if (minimal == false)
	{
		flags |= Xapian::QueryParser::FLAG_WILDCARD;
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
#endif
	}
	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
#ifdef DEBUG
	cout << "XapianEngine::parseQuery: " << parsedQuery.get_description() << endl;
#endif

	if (minimal == false)
	{
#if ENABLE_XAPIAN_SPELLING_CORRECTION>0
		// Any correction ?
		correctedFreeQuery = parser.get_corrected_query_string();
#ifdef DEBUG
		if (correctedFreeQuery.empty() == false)
		{
			cout << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
		}
#endif
#endif
	}

	return parsedQuery;
}