int main(int argc, char **argv) { // Simplest possible options parsing: we just require three or more // parameters. if(argc < 4) { cout << "usage: " << argv[0] << " <path to database> <document data> <document terms>" << endl; exit(1); } // Catch any Xapian::Error exceptions thrown try { // Make the database Xapian::WritableDatabase database(argv[1], Xapian::DB_CREATE_OR_OPEN); // Make the document Xapian::Document newdocument; // Put the data in the document newdocument.set_data(string(argv[2])); // Put the terms into the document for (int i = 3; i < argc; ++i) { newdocument.add_posting(argv[i], i - 2); } // Add the document to the database database.add_document(newdocument); } catch(const Xapian::Error &error) { cout << "Exception: " << error.get_msg() << endl; } }
void XapianIndex::addTermsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } // Get the terms while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { // R-prefix the raw term doc.add_posting(string("R") + term, termPos); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(limitTermLength(prefix + term), termPos++); } else if (mode == STORE_STEM) { string stemmedTerm = pStemmer->stem_word(term); doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++); } else if (mode == STORE_BOTH) { string stemmedTerm = pStemmer->stem_word(term); // Add both doc.add_posting(limitTermLength(prefix + term), termPos); // ...at the same position doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++); } } #ifdef DEBUG cout << "XapianIndex::addTermsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }
/// Returns a document's properties. bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const { bool foundDocument = false; if (docId == 0) { return false; } XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::Database *pIndex = pDatabase->readLock(); if (pIndex != NULL) { Xapian::Document doc = pIndex->get_document(docId); // Get the current document data string record = doc.get_data(); if (record.empty() == false) { string language = Languages::toLocale(StringManip::extractField(record, "language=", "")); docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"), StringManip::extractField(record, "url=", "\n"), StringManip::extractField(record, "type=", "\n"), language); docInfo.setTimestamp(StringManip::extractField(record, "timestamp=", "\n")); #ifdef DEBUG cout << "XapianIndex::getDocumentInfo: language is " << docInfo.getLanguage() << endl; #endif foundDocument = true; } } } catch (const Xapian::Error &error) { cerr << "Couldn't get document properties: " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't get document properties, unknown exception occured" << endl; } pDatabase->unlock(); return foundDocument; }
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc, const string &language) const { time_t timeT = TimeConverter::fromTimestamp(info.getTimestamp()); // Add this value to allow sorting by date doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT)); DocumentInfo docCopy(info); docCopy.setLanguage(language); doc.set_data(XapianDatabase::propsToRecord(&docCopy)); }
/// Renames a label. bool XapianIndex::renameLabel(const string &name, const string &newName) { bool renamedLabel = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { string term("XLABEL:"); // Get documents that have this label term += name; for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term); postingIter != pIndex->postlist_end(term); ++postingIter) { Xapian::docid docId = *postingIter; // Get the document Xapian::Document doc = pIndex->get_document(docId); // Remove the term doc.remove_term(term); // ...add the new one doc.add_term(limitTermLength(string("XLABEL:") + newName)); // ...and update the document pIndex->replace_document(docId, doc); } renamedLabel = true; } } catch (const Xapian::Error &error) { cerr << "Couldn't delete label: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't delete label, unknown exception occured" << endl; } pDatabase->unlock(); return renamedLabel; }
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc, const string &language) const { string title(info.getTitle()); string timestamp(info.getTimestamp()); char timeStr[64]; time_t timeT = TimeConverter::fromTimestamp(timestamp); // Set the document data omindex-style string record = "url="; record += info.getLocation(); // The sample will be generated at query time record += "\nsample="; record += "\ncaption="; if (badField(title) == true) { // Modify the title if necessary string::size_type pos = title.find("="); while (pos != string::npos) { title[pos] = ' '; pos = title.find("=", pos + 1); } #ifdef DEBUG cout << "XapianIndex::setDocumentData: modified title" << endl; #endif } record += title; record += "\ntype="; record += info.getType(); // Append a timestamp, in a format compatible with Omega record += "\nmodtime="; snprintf(timeStr, 64, "%ld", timeT); record += timeStr; // ...and the language record += "\nlanguage="; record += StringManip::toLowerCase(language); #ifdef DEBUG cout << "XapianIndex::setDocumentData: document data is " << record << endl; #endif doc.set_data(record); // Add this value to allow sorting by date doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT)); }
void HistoryLogger::saveMessage(const Message* message) { if (message->flags() & MESSAGE_FLAG_ALARM) return; Xapian::Document doc; quint32 flags = message->flags(); std::string plainText(message->plainText().toUtf8()); std::string confUser(message->getConfUser().constData()); std::string data; if (flags & MESSAGE_FLAG_RTF) data = message->rtfText().constData(); else data = plainText; std::cout << "HistoryLogger::saveMessage data = " << data << std::endl; doc.set_data(data); Xapian::TermGenerator termGen; termGen.set_stemmer(Xapian::Stem("ru")); termGen.set_document(doc); termGen.index_text(plainText); doc.add_value(0, message->dateTime().toString("yyyyMMdd").toStdString()); doc.add_value(1, message->dateTime().toString("hhmmss").toStdString()); doc.add_value(2, QString::number(flags, 16).toStdString()); doc.add_value(3, message->type() == Message::Outgoing? "o" : "i"); doc.add_value(4, confUser); database->add_document(doc); database->flush(); }
/// Sets the document data, omindex-style, and the date value.
/// The record holds url, sample (left empty), caption, type, timestamp and
/// language fields, one per line. Any '=' in the title is replaced with a
/// space when badField() flags it. Value slot 0 receives the timestamp
/// converted to a decimal number.
void XapianIndex::setDocumentData(Xapian::Document &doc, const DocumentInfo &info,
	const string &language) const
{
	string title(info.getTitle());
	string timestamp(info.getTimestamp());
	char timeStr[64];

	// Set the document data omindex-style
	string record = "url=";
	record += info.getLocation();
	// The sample will be generated at query time
	record += "\nsample=";
	record += "\ncaption=";
	if (badField(title) == true)
	{
		// Modify the title if necessary
		string::size_type pos = title.find('=');
		while (pos != string::npos)
		{
			title[pos] = ' ';
			pos = title.find('=', pos + 1);
		}
#ifdef DEBUG
		cout << "XapianIndex::setDocumentData: modified title" << endl;
#endif
	}
	record += title;
	record += "\ntype=";
	record += info.getType();
	// Append a timestamp
	record += "\ntimestamp=";
	record += timestamp;
	// ...and the language
	record += "\nlanguage=";
	record += language;
#ifdef DEBUG
	cout << "XapianIndex::setDocumentData: document data is " << record << endl;
#endif
	doc.set_data(record);

	// Add this value to allow sorting by date
	// Passing a time_t to "%d" is undefined where time_t is wider than int:
	// cast explicitly so that the argument matches the format specifier
	snprintf(timeStr, sizeof(timeStr), "%lld",
		(long long)TimeConverter::fromTimestamp(timestamp));
	doc.add_value(0, timeStr);
}
int main(int argc, char **argv) { // Simplest possible options parsing: we just require two or more // parameters. if (argc < 3) { cout << "usage: " << argv[0] << " <path to database> <search terms>" << endl; exit(1); } // Catch any Xapian::Error exceptions thrown try { // Make the database Xapian::Database db(argv[1]); // Start an enquire session Xapian::Enquire enquire(db); // Set percent and/or weight cutoffs enquire.set_cutoff(90,0.2); // Set weighting schema BM25Weight bm1(1.0,0.0,1.0,0.5,0.3); enquire.set_weighting_scheme(bm1); // Build the query object Xapian::Query query(Xapian::Query::OP_AND, argv + 2, argv + argc); cout << "Performing query" << query.get_description() << "'" << endl; // Set Stopper string stop[8]={"的","了","呵","吧","就","你","我","他"}; SimpleStopper *ss=new SimpleStopper; for(int i=0;i<8;i++){ ss->add(stop[i]); } QueryParser qparser; qparser.set_stopper(ss); qparser.set_database(db); // Give the query object to the enquire session enquire.set_query(query); // Get the top 10 results of the query Xapian::MSet matches = enquire.get_mset(0, 10); //最多返回10个文档 // Display the results cout << matches.size() << " results found" << endl; for (Xapian::MSetIterator i = matches.begin();i != matches.end(); ++i) { Xapian::Document doc = i.get_document(); cout << "Document ID " << *i << "\nPercent " <<i.get_percent() << "%\n" << doc.get_data() << "\n" << endl; } db.close(); } catch(const Xapian::Error &error) { cout << "Exception: " << error.get_msg() << endl; } }
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, const string &language, StemmingMode mode) const { Xapian::TermIterator termListIter = doc.termlist_begin(); Xapian::Stem *pStemmer = NULL; string term; // Do we know what language to use for stemming ? if (language.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(language)); } // Get the terms and remove the first posting for each while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { // R-prefix the raw term removeFirstPosting(doc, termListIter, string("R") + term); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { removeFirstPosting(doc, termListIter, limitTermLength(prefix + term)); } else if (mode == STORE_STEM) { removeFirstPosting(doc, termListIter, limitTermLength(prefix + pStemmer->stem_word(term))); } else if (mode == STORE_BOTH) { string stemmedTerm = pStemmer->stem_word(term); removeFirstPosting(doc, termListIter, limitTermLength(prefix + term)); if (stemmedTerm != term) { removeFirstPosting(doc, termListIter, limitTermLength(prefix + stemmedTerm)); } } } if (pStemmer != NULL) { delete pStemmer; } }
/// Returns the thumbnail for the given id.
///
/// The cache is looked up first. On a miss, an id starting with 'Q' is
/// resolved to a file path through the Xapian database, any other id is used
/// as the file path itself. The thumbnail is the EXIF thumbnail if there is
/// one, a scaled-down version of the file otherwise, and is stored in the cache.
///
/// @param id document id or file path.
/// @param size if not null, receives the size of the returned image.
/// @param requestedSize ignored.
/// @return the thumbnail, or a null image if id is empty or unknown.
QImage ThumbnailProvider::requestImage(const QString &id, QSize *size, const QSize &requestedSize)
{
    Q_UNUSED(requestedSize);

    QImage image;

    // id.at(0) below is out of range for an empty id
    if (!id.isEmpty() && !m_thumb32->findImage(id, &image)) {
        QString filePath;
        bool knownId = true;

        if (id.at(0) == QLatin1Char('Q')) {
            Xapian::Document doc = m_xapianDB->findDocument(id);
            if (doc.get_docid() == 0) {
                knownId = false;
            } else {
                filePath = QString::fromStdString(doc.get_value(Database::FilePath));
            }
        } else {
            filePath = id;
        }

        if (knownId) {
            // Load thumbnail
            KExiv2Iface::KExiv2 preview(filePath);
            image = preview.getExifThumbnail(true);
            if (image.isNull()) {
                // No EXIF thumbnail: scale the image itself
                // TODO smooth or fast?
                image = QImage(filePath).scaled(160, 120, Qt::KeepAspectRatio);
                kWarning() << "Could not find preview image for" << filePath << image.isNull();
            }

            // Store the thumbnail into the cache file
            if (m_thumb32->insertImage(id, image)) {
                kWarning() << "Added preview for" << image.byteCount() << filePath << id;
            } else {
                kWarning() << "FAILED to add preview for" << filePath << id;
            }
        }
    }

    // Image providers are expected to report the size of what they return
    if (size != 0) {
        *size = image.size();
    }

    return image;
}
/// Returns the text for the given query result: the document's data decoded
/// as UTF-8, or "No Subject" if it is empty or can't be read.
QString EmailSearchStore::text(int queryId)
{
    Xapian::Document doc = docForQuery(queryId);

    // The lock is only taken once the document has been fetched
    QMutexLocker lock(&m_mutex);

    std::string rawSubject;
    try {
        rawSubject = doc.get_data();
    } catch (const Xapian::Error &) {
        // Nothing to do, move along: the subject stays empty
    }

    const QString subject = QString::fromUtf8(rawSubject.c_str(), rawSubject.length());
    return subject.isEmpty() ? QStringLiteral("No Subject") : subject;
}
bool XapianIndex::prepareDocument(const DocumentInfo &info, Xapian::Document &doc, Xapian::termcount &termPos) const { string title(info.getTitle()); string location(info.getLocation()); Url urlObj(location); // Add a magic term :-) doc.add_term(MAGIC_TERM); // Index the title with and without prefix S if (title.empty() == false) { Document titleDoc; titleDoc.setData(title.c_str(), title.length()); Tokenizer titleTokens(&titleDoc); addTermsToDocument(titleTokens, doc, "S", termPos, STORE_UNSTEM); titleTokens.rewind(); addTermsToDocument(titleTokens, doc, "", termPos, m_stemMode); } // Index the full URL with prefix U doc.add_term(limitTermLength(string("U") + location, true)); // ...the host name and included domains with prefix H string hostName(StringManip::toLowerCase(urlObj.getHost())); if (hostName.empty() == false) { doc.add_term(limitTermLength(string("H") + hostName, true)); string::size_type dotPos = hostName.find('.'); while (dotPos != string::npos) { doc.add_term(limitTermLength(string("H") + hostName.substr(dotPos + 1), true)); // Next dotPos = hostName.find('.', dotPos + 1); } } // ...and the file name with prefix P string fileName(urlObj.getFile()); if (fileName.empty() == false) { doc.add_term(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true)); } // Finally, add the language code with prefix L doc.add_term(string("L") + Languages::toCode(m_stemLanguage)); setDocumentData(doc, info, m_stemLanguage); return true; }
/// Removes the first posting of the given term from the document.
///
/// termListIter is a cursor over the document's term list that callers share
/// between calls. It is rewound when it is exhausted or already past the
/// term, as terms aren't necessarily requested in sorted order. Nothing is
/// removed if the document doesn't have the term or the term has no positions.
static void removeFirstPosting(Xapian::Document &doc,
	Xapian::TermIterator &termListIter, const string &term)
{
	const Xapian::TermIterator termListEnd = doc.termlist_end();

	// skip_to() only moves forward: start over if the term may be behind us
	if ((termListIter == termListEnd) ||
		(*termListIter > term))
	{
		termListIter = doc.termlist_begin();
	}
	if (termListIter == termListEnd)
	{
		// The document has no terms
		return;
	}

	termListIter.skip_to(term);
	// Position lists must only be read from a valid iterator, and only the
	// requested term's positions are of interest
	if ((termListIter == termListEnd) ||
		(*termListIter != term))
	{
		return;
	}

	Xapian::PositionIterator firstPosIter = termListIter.positionlist_begin();
	if (firstPosIter != termListIter.positionlist_end())
	{
		try
		{
			doc.remove_posting(term, *firstPosIter);
		}
		catch (const Xapian::Error &error)
		{
			// This posting may have been removed already
#ifdef DEBUG
			cout << "XapianIndex::removeFirstPosting: " << error.get_msg() << endl;
#endif
		}
	}
}
/// Indexes the CSV file at datapath into the Xapian database at dbpath.
///
/// The first line must be a header with id_NUMBER, TITLE and DESCRIPTION at
/// the hard-coded offsets. Each following line becomes one document, stored
/// under the unique term "Q" + identifier so that re-running the indexer
/// replaces documents instead of duplicating them. The process exits with
/// status 1 if the file can't be opened or the header doesn't match.
Indexer::Indexer(const string &datapath, const string &dbpath)
{
    // Hardcode field offsets for simplicity.
    const size_t FIELD_ID_NUMBER = 0;
    const size_t FIELD_TITLE = 2;
    const size_t FIELD_DESCRIPTION = 8;

    // Create or open the database we're going to be writing to.
    Xapian::WritableDatabase db(dbpath, Xapian::DB_CREATE_OR_OPEN);

    // Set up a TermGenerator that we'll use in indexing.
    Xapian::TermGenerator termgenerator;
    termgenerator.set_stemmer(Xapian::Stem("en"));

    ifstream csv(datapath.c_str());
    if (!csv) {
        // Report the actual problem rather than failing later on the header check
        cerr << "Couldn't open CSV file " << datapath << endl;
        exit(1);
    }

    vector<string> fields;
    csv_parse_line(csv, fields);

    // Check the CSV header line matches our hard-code offsets.
    if (fields.at(FIELD_ID_NUMBER) != "id_NUMBER" ||
        fields.at(FIELD_TITLE) != "TITLE" ||
        fields.at(FIELD_DESCRIPTION) != "DESCRIPTION") {
        // The CSV format doesn't match what we expect.
        cerr << "CSV format has changed!" << endl;
        exit(1);
    }

    while (csv_parse_line(csv, fields)) {
        // 'fields' is a vector mapping from field number to value.
        // We look up fields with the 'at' method so we get an exception
        // if that field isn't set.
        //
        // We're just going to use DESCRIPTION, TITLE and id_NUMBER.
        const string & description = fields.at(FIELD_DESCRIPTION);
        const string & title = fields.at(FIELD_TITLE);
        const string & identifier = fields.at(FIELD_ID_NUMBER);

        // We make a document and tell the term generator to use this.
        Xapian::Document doc;
        termgenerator.set_document(doc);

        // Index each field with a suitable prefix.
        termgenerator.index_text(title, 1, "S");
        termgenerator.index_text(description, 1, "XD");

        // Index fields without prefixes for general search.
        termgenerator.index_text(title);
        termgenerator.increase_termpos();
        termgenerator.index_text(description);

        // Store all the fields for display purposes.
        doc.set_data(identifier + "\n" + title + "\n" + description);

        // We use the identifier to ensure each object ends up in the
        // database only once no matter how many times we run the
        // indexer.
        string idterm = "Q" + identifier;
        doc.add_boolean_term(idterm);
        db.replace_document(idterm, doc);
    }
}
void QueryHandler(const QueryMessage &message, const Theron::Address from) { search::QueryInfo qi=*(message.query); std::string resKey(message.resKey); delete message.query; std::string segString; char *output=new char[qi.query.length()*9]; char *input=new char[qi.query.length()*3]; memset(output,0,qi.query.length()*9); memset(input,0,qi.query.length()*3); try { UErrorCode error = U_ZERO_ERROR; ucnv_convert("GBK","UTF-8",input, qi.query.length()*3, qi.query.c_str(), qi.query.length(), &error ); bool ret = result->ParagraphProcessing(input, output); if (ret) { int oLen=strlen(output); char *utf8out=new char[oLen*3]; memset(utf8out,0,oLen*3); ucnv_convert("UTF-8","GBK",utf8out, oLen*3, output, oLen, &error ); segString=std::string(utf8out); delete [] utf8out; } } catch (...) { } delete [] output; delete [] input; std::list<std::string> segList; if(segString.length()>0) { std::vector<std::string> resv; boost::algorithm::split( resv, segString, boost::algorithm::is_any_of(" ") ); for(std::vector<std::string>::iterator it=resv.begin();it!=resv.end();++it) { std::vector<std::string> tmpv; boost::algorithm::split( tmpv, *it, boost::algorithm::is_any_of("/") ); if(tmpv.size()>1&&tmpv[1]!="w") segList.push_back(std::string("K")+tmpv[0]); } } search::DocList *dList=new search::DocList(); if(segList.size()>0) { Xapian::Query query(Xapian::Query::OP_AND,segList.begin(), segList.end()); while(1) { try { db.reopen(); Xapian::Enquire enquire(db); enquire.set_query(query); Xapian::MSet matches = enquire.get_mset(0, 100); for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { Xapian::Document doc = i.get_document(); search::IndexInfo info; info.uid=doc.get_value(1); info.attMap.insert(std::make_pair(std::string("title"),doc.get_value(2))); info.content=doc.get_data(); dList->docList.push_back(info); } std::cout<<"doc size:"<<dList->docList.size()<<std::endl; break; }catch(Xapian::DatabaseModifiedError exception) { std::cout<<"try agian"<<std::endl; }catch(...) 
{ break; } } } Send(QueryResponceMessage(dList,resKey.c_str()), from); }
/// Indexes the given data. bool XapianIndex::indexDocument(Tokenizer &tokens, const std::set<std::string> &labels, unsigned int &docId) { unsigned int dataLength = 0; bool indexed = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { // Get the document const Document *pDocument = tokens.getDocument(); if (pDocument == NULL) { #ifdef DEBUG cout << "XapianIndex::indexDocument: no document" << endl; #endif return false; } // Cache the document's properties DocumentInfo docInfo(pDocument->getTitle(), pDocument->getLocation(), pDocument->getType(), pDocument->getLanguage()); docInfo.setTimestamp(pDocument->getTimestamp()); docInfo.setLocation(Url::canonicalizeUrl(docInfo.getLocation())); const char *pData = pDocument->getData(dataLength); if (pData != NULL) { m_stemLanguage = scanDocument(pData, dataLength, docInfo); } Xapian::Document doc; Xapian::termcount termPos = 0; #ifdef DEBUG cout << "XapianIndex::indexDocument: adding terms" << endl; #endif // Add the tokenizer's terms to the Xapian document addPostingsToDocument(tokens, doc, "", termPos, m_stemMode); // Add labels for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { doc.add_term(limitTermLength(string("XLABEL:") + *labelIter)); } if (addCommonTerms(docInfo, doc, termPos) == true) { setDocumentData(docInfo, doc, m_stemLanguage); Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { // Add this document to the Xapian index docId = pIndex->add_document(doc); indexed = true; } } } catch (const Xapian::Error &error) { cerr << "Couldn't index document: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't index document, unknown exception occured" << endl; } pDatabase->unlock(); return indexed; }
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string upperCasePrefix("R"); string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } // Terms starting with a capital letter are R-prefixed, unless a prefix is already defined if (prefix.empty() == false) { upperCasePrefix = prefix; } // Get the terms while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { doc.add_posting(upperCasePrefix + XapianDatabase::limitTermLength(term), termPos); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); } else if (mode == STORE_STEM) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos); } else if (mode == STORE_BOTH) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif // Add both at the same position doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); if (stemmedTerm != term) { // No point adding the same term twice doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos); } } ++termPos; } #ifdef DEBUG cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }
void XapianIndex::addCommonTerms(const DocumentInfo &info, Xapian::Document &doc, Xapian::termcount &termPos) const { string title(info.getTitle()); string location(info.getLocation()); Url urlObj(location); // Add a magic term :-) doc.add_term(MAGIC_TERM); // Index the title with and without prefix S if (title.empty() == false) { Document titleDoc; titleDoc.setData(title.c_str(), title.length()); Tokenizer titleTokens(&titleDoc); addPostingsToDocument(titleTokens, doc, "S", termPos, STORE_UNSTEM); titleTokens.rewind(); addPostingsToDocument(titleTokens, doc, "", termPos, m_stemMode); } // Index the full URL with prefix U doc.add_term(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true)); // ...the base file with XFILE: string::size_type qmPos = location.find("?"); if ((urlObj.isLocal() == true) && (qmPos != string::npos)) { doc.add_term(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true)); } // ...the host name and included domains with prefix H string hostName(StringManip::toLowerCase(urlObj.getHost())); if (hostName.empty() == false) { doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName, true)); string::size_type dotPos = hostName.find('.'); while (dotPos != string::npos) { doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true)); // Next dotPos = hostName.find('.', dotPos + 1); } } // ...the location (as is) and all directories with prefix XDIR: string tree(urlObj.getLocation()); if (tree.empty() == false) { doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true)); if (tree[0] == '/') { doc.add_term("XDIR:/"); } string::size_type slashPos = tree.find('/', 1); while (slashPos != string::npos) { doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true)); // Next slashPos = tree.find('/', slashPos + 1); } } // ...and the file name with prefix P string 
fileName(urlObj.getFile()); if (fileName.empty() == false) { string extension; doc.add_term(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true)); // Does it have an extension ? string::size_type extPos = fileName.rfind('.'); if ((extPos != string::npos) && (extPos + 1 < fileName.length())) { extension = StringManip::toLowerCase(fileName.substr(extPos + 1)); } doc.add_term(string("XEXT:") + XapianDatabase::limitTermLength(extension)); } // Add the date terms D, M and Y time_t timeT = TimeConverter::fromTimestamp(info.getTimestamp()); struct tm *tm = localtime(&timeT); string yyyymmdd = TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); if (yyyymmdd.length() == 8) { doc.add_term(string("D") + yyyymmdd); doc.add_term(string("M") + yyyymmdd.substr(0, 6)); doc.add_term(string("Y") + yyyymmdd.substr(0, 4)); } // Finally, add the language code with prefix L doc.add_term(string("L") + Languages::toCode(m_stemLanguage)); // ...and the MIME type with prefix T doc.add_term(string("T") + info.getType()); }
/// Sets a document's labels. bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels, bool resetLabels) { bool updatedLabels = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { Xapian::Document doc = pIndex->get_document(docId); // Reset existing labels ? if (resetLabels == true) { Xapian::TermIterator termIter = pIndex->termlist_begin(docId); if (termIter != pIndex->termlist_end(docId)) { for (termIter.skip_to("XLABEL:"); termIter != pIndex->termlist_end(docId); ++termIter) { // Is this a label ? if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0) { doc.remove_term(*termIter); } } } } // Set new labels for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { if (labelIter->empty() == false) { doc.add_term(limitTermLength(string("XLABEL:") + *labelIter)); } } pIndex->replace_document(docId, doc); updatedLabels = true; } } catch (const Xapian::Error &error) { cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't update document's labels, unknown exception occured" << endl; } pDatabase->unlock(); return updatedLabels; }
/// Updates the given document; true if success. bool XapianIndex::updateDocument(unsigned int docId, Tokenizer &tokens) { unsigned int dataLength = 0; bool updated = false; const Document *pDocument = tokens.getDocument(); if (pDocument == NULL) { return false; } XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } const char *pData = pDocument->getData(dataLength); if (pData == NULL) { return false; } // Cache the document's properties DocumentInfo docInfo(pDocument->getTitle(), pDocument->getLocation(), pDocument->getType(), pDocument->getLanguage()); docInfo.setTimestamp(pDocument->getTimestamp()); docInfo.setLocation(Url::canonicalizeUrl(docInfo.getLocation())); // Don't scan the document if a language is specified m_stemLanguage = Languages::toEnglish(pDocument->getLanguage()); if (m_stemLanguage.empty() == true) { m_stemLanguage = scanDocument(pData, dataLength, docInfo); } try { set<string> labels; Xapian::Document doc; Xapian::termcount termPos = 0; // Add the tokenizer's terms to the document addPostingsToDocument(tokens, doc, "", termPos, m_stemMode); // Get the document's labels if (getDocumentLabels(docId, labels) == true) { // Add labels for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { doc.add_term(limitTermLength(string("XLABEL:") + *labelIter)); } } if (addCommonTerms(docInfo, doc, termPos) == true) { setDocumentData(docInfo, doc, m_stemLanguage); Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { // Update the document in the database pIndex->replace_document(docId, doc); updated = true; } } } catch (const Xapian::Error &error) { cerr << "Couldn't update document: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't update document, unknown exception occured" << endl; } pDatabase->unlock(); return updated; }
void query(const std::vector<std::string>& arr_queries, const std::vector<std::string>& arr_selection = {}) { Xapian::Database databases_ir; try { Xapian::Database database_ir_object_values(path_database+"object_values/"); databases_ir.add_database(database_ir_object_values); } catch (const Xapian::Error &e) { // Database not ready } try { Xapian::Database database_ir_object_descriptions(path_database+"object_descriptions/"); databases_ir.add_database(database_ir_object_descriptions); } catch (const Xapian::Error &e) { // Database not ready } try { Xapian::Database database_ir_object_sub_descriptions(path_database+"object_sub_descriptions/"); databases_ir.add_database(database_ir_object_sub_descriptions); } catch (const Xapian::Error &e) { // Database not ready } // Filter on Type IDs Xapian::Query query_ir_identifiers; if (!arr_selection.empty()) { std::vector<Xapian::Query> arr_query_identifiers; for (const auto& str_identifier_field : arr_selection) { arr_query_identifiers.push_back(Xapian::Query("T"+str_identifier_field)); } query_ir_identifiers = Xapian::Query(Xapian::Query::OP_OR, arr_query_identifiers.begin(), arr_query_identifiers.end()); } Xapian::QueryParser queryparser; queryparser.set_database(databases_ir); // Needed to enable specific query flags queryparser.set_stemmer(Xapian::Stem("en")); queryparser.set_stemming_strategy(queryparser.STEM_SOME); queryparser.add_boolean_prefix("identifier", "T"); //queryparser.add_prefix("value", "SV"); unsigned int count_queries = 0; for (const auto& str_query : arr_queries) { const auto query_id = count_queries; count_queries++; Xapian::Query query_ir = queryparser.parse_query(str_query, Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_WILDCARD); if (!arr_selection.empty()) { // Update main query query_ir = Xapian::Query(Xapian::Query::OP_FILTER, query_ir, query_ir_identifiers); } // Run query Xapian::Enquire enquire(databases_ir); enquire.set_query(query_ir); Xapian::MSet arr_msets = 
enquire.get_mset(num_offset, num_limit); for (Xapian::MSetIterator iterate_arr_mset = arr_msets.begin(); iterate_arr_mset != arr_msets.end(); iterate_arr_mset++) { //Xapian::docid did = *iterate_arr_mset; const int unsigned& nr_rank = iterate_arr_mset.get_rank(); const int unsigned& nr_weight = iterate_arr_mset.get_weight(); const Xapian::Document doc = iterate_arr_mset.get_document(); const std::string& str_identifier = doc.get_value(0); if (map_query_results.find(str_identifier) == map_query_results.end()) { std::vector<unsigned int> arr_matches; arr_matches.push_back(query_id); const std::string& str_value = (include_value ? doc.get_data() : ""); map_query_results[str_identifier] = std::make_tuple(nr_rank, nr_weight, arr_matches, str_value); } else { type_arr_query_result& arr_query_result = map_query_results[str_identifier]; std::get<0>(arr_query_result) += nr_rank; std::get<1>(arr_query_result) += nr_weight; std::get<2>(arr_query_result).push_back(query_id); } } } }
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, const string &language, StemmingMode mode) const { Xapian::TermIterator termListIter = doc.termlist_begin(); Xapian::Stem *pStemmer = NULL; string upperCasePrefix("R"); string term; // Do we know what language to use for stemming ? if (language.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(language)); } // Terms starting with a capital letter are R-prefixed, unless a prefix is already defined if (prefix.empty() == false) { upperCasePrefix = prefix; } // Get the terms and remove the first posting for each while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { removeFirstPosting(doc, termListIter, upperCasePrefix + term); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); } else if (mode == STORE_STEM) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm)); } else if (mode == STORE_BOTH) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); if (stemmedTerm != term) { removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm)); } } } if (pStemmer != NULL) { delete pStemmer; } }
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string stemPrefix("Z"); string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { try { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } catch (const Xapian::Error &error) { cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl; } } // Stems are Z-prefixed, unless a prefix is already defined if (prefix.empty() == false) { stemPrefix = prefix; } // Get the terms while (tokens.nextToken(term) == true) { bool addStem = false; if (term.empty() == true) { continue; } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); } else if (mode == STORE_STEM) { addStem = true; } else if (mode == STORE_BOTH) { // Add both doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); addStem = true; } // Don't stem if the term starts with a digit if ((addStem == true) && (isdigit((int)term[0]) == 0)) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.add_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm)); } ++termPos; } #ifdef DEBUG cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, const string &language, StemmingMode mode) const { Xapian::TermIterator termListIter = doc.termlist_begin(); Xapian::Stem *pStemmer = NULL; string stemPrefix("Z"); string term; // Do we know what language to use for stemming ? if (language.empty() == false) { try { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } catch (const Xapian::Error &error) { cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl; } } // Stems are Z-prefixed, unless a prefix is already defined if (prefix.empty() == false) { stemPrefix = prefix; } // Get the terms and remove the first posting for each while (tokens.nextToken(term) == true) { bool removeStem = false; if (term.empty() == true) { continue; } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); } else if (mode == STORE_STEM) { removeStem = true; } else if (mode == STORE_BOTH) { // Remove both removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); removeStem = true; } // Since stems don't have positional information, we can't simply remove them // since any may appear more than once in the original document // We can only remove those that have some prefix set // Don't stem if the term starts with a digit if ((removeStem == true) && (prefix.empty() == false) && (isdigit((int)term[0]) == 0)) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.remove_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm)); } } if (pStemmer != NULL) { delete pStemmer; } }
int main(int argc, char **argv) { if(argc < 2) { usage(argv); return 1; } try { char *action = argv[1]; char *db_path = argv[2]; if(!strcmp(action, "index")) { Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN); Xapian::TermGenerator indexer; Xapian::Stem stemmer("english"); indexer.set_stemmer(stemmer); std::string doc_txt; while(true) { if(std::cin.eof()) break; std::string line; getline(std::cin, line); doc_txt += line; } if(!doc_txt.empty()) { Xapian::Document doc; doc.set_data(doc_txt); indexer.set_document(doc); indexer.index_text(doc_txt); db.add_document(doc); std::cout << "Indexed: " << indexer.get_description() << std::endl; } db.commit(); } else if(!strcmp(action, "search")) { if(argc < 4) { std::cerr << "You must supply a query string" << std::endl; return 1; } Xapian::Database db(db_path); Xapian::Enquire enquire(db); std::string query_str = argv[3]; argv+= 4; while(*argv) { query_str += ' '; query_str += *argv++; } Xapian::QueryParser qp; Xapian::Stem stemmer("english"); qp.set_stemmer(stemmer); qp.set_database(db); qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); Xapian::Query query = qp.parse_query(query_str); std::cout << "Parsed query is: " << query.get_description() << std::endl; enquire.set_query(query); Xapian::MSet matches = enquire.get_mset(0, 10); std::cout << matches.get_matches_estimated() << " results found.\n"; std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl; for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { std::cout << i.get_rank() + 1 << ": " << i.get_percent() << "% docid=" << *i << " [" << i.get_document().get_data()<< "]" << std::endl << std::endl; } } else { std::cerr << "Invalid action " << action << std::endl; usage(argv); return 1; } } catch (const Xapian::Error &error) { std::cout << "Exception: " << error.get_msg() << std::endl; } }
/// Removes the terms derived from a document's properties.
/**
 * The properties (caption, url, type, language, timestamp or modtime, size)
 * are read back from the document's data record. The first posting of each
 * title term is removed, then the location, host, directory, file name,
 * extension, date, language and MIME type terms are collected and removed.
 * The magic term is removed in all cases, even when the record is empty.
 * Xapian errors raised while removing a collected term are ignored, and only
 * reported in DEBUG builds.
 *
 * @param doc document to remove the terms from
 */
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	DocumentInfo docInfo;
	set<string> commonTerms;
	string record(doc.get_data());

	// First, remove the magic term
	commonTerms.insert(MAGIC_TERM);

	if (record.empty() == true)
	{
		// Nothing else we can do, but the magic term still has to go:
		// the removal loop at the end of this method won't be reached
		try
		{
			doc.remove_term(MAGIC_TERM);
		}
		catch (const Xapian::Error &error)
		{
#ifdef DEBUG
			cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
		}

		return;
	}

	string language(StringManip::extractField(record, "language=", "\n"));
	string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

	docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
		StringManip::extractField(record, "url=", "\n"),
		StringManip::extractField(record, "type=", "\n"),
		Languages::toLocale(language));
	// We used to use timestamp prior to 0.60
	if (timestamp.empty() == true)
	{
		string modTime(StringManip::extractField(record, "modtime=", "\n"));
		if (modTime.empty() == false)
		{
			time_t timeT = (time_t )atol(modTime.c_str());
			timestamp = TimeConverter::toTimestamp(timeT);
		}
	}
	docInfo.setTimestamp(timestamp);
	string bytesSize(StringManip::extractField(record, "size=", ""));
	if (bytesSize.empty() == false)
	{
		docInfo.setSize((off_t )atol(bytesSize.c_str()));
	}
	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	string title(docInfo.getTitle());
	if (title.empty() == false)
	{
		Document titleDoc;

		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		// Once for the S-prefixed title terms, once for the unprefixed ones
		removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
		titleTokens.rewind();
		removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
	}

	// Location
	string location(docInfo.getLocation());
	commonTerms.insert(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
	// Base file
	string::size_type qmPos = location.find("?");
	if ((urlObj.isLocal() == true) &&
		(qmPos != string::npos))
	{
		commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true));
	}
	// Host name
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName, true));
		// One term for each parent domain too
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...location
	string tree(urlObj.getLocation());
	if (tree.empty() == false)
	{
		commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
		if (tree[0] == '/')
		{
			commonTerms.insert("XDIR:/");
		}
		// One term for each parent directory too
		string::size_type slashPos = tree.find('/', 1);
		while (slashPos != string::npos)
		{
			commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));

			// Next
			slashPos = tree.find('/', slashPos + 1);
		}
	}
	// ...and file name
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		string extension;

		commonTerms.insert(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));

		// Does it have an extension ?
		string::size_type extPos = fileName.rfind('.');
		if ((extPos != string::npos) &&
			(extPos + 1 < fileName.length()))
		{
			extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
		}
		commonTerms.insert(string("XEXT:") + XapianDatabase::limitTermLength(extension));
	}
	// Date terms
	time_t timeT = TimeConverter::fromTimestamp(docInfo.getTimestamp());
	struct tm *tm = localtime(&timeT);
	// localtime() returns NULL when the time can't be represented
	if (tm != NULL)
	{
		string yyyymmdd = TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
		if (yyyymmdd.length() == 8)
		{
			commonTerms.insert(string("D") + yyyymmdd);
			commonTerms.insert(string("M") + yyyymmdd.substr(0, 6));
			commonTerms.insert(string("Y") + yyyymmdd.substr(0, 4));
		}
	}
	// Language code
	commonTerms.insert(string("L") + Languages::toCode(language));
	// MIME type
	commonTerms.insert(string("T") + docInfo.getType());

	// Remove the collected terms one by one, so that a failure
	// on one of them doesn't prevent the others from being removed
	for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
	{
		try
		{
			doc.remove_term(*termIter);
		}
		catch (const Xapian::Error &error)
		{
#ifdef DEBUG
			cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
		}
	}
}
/// Returns a document's properties. bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const { bool foundDocument = false; if (docId == 0) { return false; } XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::Database *pIndex = pDatabase->readLock(); if (pIndex != NULL) { Xapian::Document doc = pIndex->get_document(docId); // Get the current document data string record = doc.get_data(); if (record.empty() == false) { string language(Languages::toLocale(StringManip::extractField(record, "language=", ""))); // We used to use timestamp prior to 0.60 string timestamp(StringManip::extractField(record, "timestamp=", "\n")); docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"), StringManip::extractField(record, "url=", "\n"), StringManip::extractField(record, "type=", "\n"), language); if (timestamp.empty() == true) { // This is the format used by Omega string modTime(StringManip::extractField(record, "modtime=", "\n")); if (modTime.empty() == false) { time_t timeT = (time_t )atol(modTime.c_str()); timestamp = TimeConverter::toTimestamp(timeT); } } docInfo.setTimestamp(timestamp); foundDocument = true; } } } catch (const Xapian::Error &error) { cerr << "Couldn't get document properties: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't get document properties, unknown exception occured" << endl; } pDatabase->unlock(); return foundDocument; }
/// Removes the terms derived from a document's properties.
/**
 * The magic term is removed first. The properties (caption, url, type,
 * language, timestamp or modtime) are then read back from the document's
 * data record. The first posting of each title term is removed, followed by
 * the location, host, directory, file name, language and MIME type terms.
 * Nothing is caught here, so an exception thrown by a removal leaves the
 * remaining terms in place.
 *
 * @param doc document to remove the terms from
 */
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	const string dataRecord(doc.get_data());

	// The magic term goes first, whether or not there's a record
	doc.remove_term(MAGIC_TERM);

	// Without a record, the other terms can't be worked out
	if (dataRecord.empty())
	{
		return;
	}

	const string language(StringManip::extractField(dataRecord, "language=", ""));
	string timeStamp(StringManip::extractField(dataRecord, "timestamp=", "\n"));

	DocumentInfo docInfo;
	docInfo = DocumentInfo(StringManip::extractField(dataRecord, "caption=", "\n"),
		StringManip::extractField(dataRecord, "url=", "\n"),
		StringManip::extractField(dataRecord, "type=", "\n"),
		Languages::toLocale(language));

	// Records written prior to 0.60 have a modtime field instead of a timestamp
	if (timeStamp.empty())
	{
		const string modificationTime(StringManip::extractField(dataRecord, "modtime=", "\n"));
		if (!modificationTime.empty())
		{
			timeStamp = TimeConverter::toTimestamp((time_t )atol(modificationTime.c_str()));
		}
	}
	docInfo.setTimestamp(timeStamp);

	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	const string caption(docInfo.getTitle());
	if (!caption.empty())
	{
		Document captionDoc;

		captionDoc.setData(caption.c_str(), caption.length());
		Tokenizer captionTokens(&captionDoc);
		// Once for the S-prefixed title terms, once for the unprefixed ones
		removeFirstPostingsFromDocument(captionTokens, doc, "S", language, STORE_UNSTEM);
		captionTokens.rewind();
		removeFirstPostingsFromDocument(captionTokens, doc, "", language, m_stemMode);
	}

	// Location
	doc.remove_term(limitTermLength(string("U") + docInfo.getLocation(), true));

	// Host name, then each of its parent domains
	const string host(StringManip::toLowerCase(urlObj.getHost()));
	if (!host.empty())
	{
		doc.remove_term(limitTermLength(string("H") + host, true));

		for (string::size_type dotPos = host.find('.'); dotPos != string::npos;
			dotPos = host.find('.', dotPos + 1))
		{
			doc.remove_term(limitTermLength(string("H") + host.substr(dotPos + 1), true));
		}
	}

	// Directory, then each of its parent directories
	const string directory(urlObj.getLocation());
	if (!directory.empty())
	{
		doc.remove_term(limitTermLength(string("XDIR:") + directory, true));

		for (string::size_type slashPos = directory.find('/', 1); slashPos != string::npos;
			slashPos = directory.find('/', slashPos + 1))
		{
			doc.remove_term(limitTermLength(string("XDIR:") + directory.substr(0, slashPos), true));
		}
	}

	// File name
	const string file(urlObj.getFile());
	if (!file.empty())
	{
		doc.remove_term(limitTermLength(string("P") + StringManip::toLowerCase(file), true));
	}

	// Language code
	doc.remove_term(string("L") + Languages::toCode(language));
	// MIME type
	doc.remove_term(string("T") + docInfo.getType());
}
/** Main routine */ int main(int argc,char **argv) { // process inputs that were passed to us via QUERY_STRING std::cout << "Content-Type:application/javascript;charset=utf-8\r\n\n"; std::string callback; try { // get input parameters const char *queryEnv = getenv("QUERY_STRING"); std::string queryString; if (queryEnv) { queryString = queryEnv; } else if (argc>=2) { queryString = argv[1]; } else { std::cout << "No input!\n"; exit(1); } // parse query string std::vector<std::string> parts = split(queryString,'&'); std::string searchFor,callback; int num=1,page=0; for (std::vector<std::string>::const_iterator it=parts.begin();it!=parts.end();++it) { std::vector<std::string> kv = split(*it,'='); if (kv.size()==2) { std::string val = uriDecode(kv[1]); if (kv[0]=="q") searchFor = val; else if (kv[0]=="n") num = fromString<int>(val); else if (kv[0]=="p") page = fromString<int>(val); else if (kv[0]=="cb") callback = val; } } std::string indexDir = "doxysearch.db"; if (queryString=="test") // user test { bool dbOk = dirExists(indexDir); if (dbOk) { std::cout << "Test successful."; } else { std::cout << "Test failed: cannot find search index " << indexDir; } exit(0); } // create query Xapian::Database db(indexDir); Xapian::Enquire enquire(db); Xapian::Query query; std::vector<std::string> words = split(searchFor,' '); for (std::vector<std::string>::const_iterator it=words.begin();it!=words.end();++it) { query = Xapian::Query(Xapian::Query::OP_OR,query,Xapian::Query(*it)); } enquire.set_query(query); // get results Xapian::MSet matches = enquire.get_mset(page*num,num); unsigned int hits = matches.get_matches_estimated(); unsigned int offset = page*num; unsigned int pages = num>0 ? 
(hits+num-1)/num : 0; if (offset>hits) offset=hits; if (offset+num>hits) num=hits-offset; // write results as JSONP std::cout << callback.c_str() << "("; std::cout << "{" << std::endl << " \"hits\":" << hits << "," << std::endl << " \"first\":" << offset << "," << std::endl << " \"count\":" << num << "," << std::endl << " \"page\":" << page << "," << std::endl << " \"pages\":" << pages << "," << std::endl << " \"query\": \"" << escapeString(searchFor) << "\"," << std::endl << " \"items\":[" << std::endl; // foreach search result unsigned int o = offset; for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i,++o) { std::vector<Fragment> hl; Xapian::Document doc = i.get_document(); highlighter(doc.get_value(FIELD_DOC),words,hl); std::cout << " {\"type\": \"" << doc.get_value(FIELD_TYPE) << "\"," << std::endl << " \"name\": \"" << doc.get_value(FIELD_NAME) << doc.get_value(FIELD_ARGS) << "\"," << std::endl << " \"tag\": \"" << doc.get_value(FIELD_TAG) << "\"," << std::endl << " \"url\": \"" << doc.get_value(FIELD_URL) << "\"," << std::endl; std::cout << " \"fragments\":[" << std::endl; int c=0; bool first=true; for (std::vector<Fragment>::const_iterator it = hl.begin();it!=hl.end() && c<3;++it,++c) { if (!first) std::cout << "," << std::endl; std::cout << " \"" << escapeString((*it).text) << "\""; first=false; } if (!first) std::cout << std::endl; std::cout << " ]" << std::endl; std::cout << " }"; if (o<offset+num-1) std::cout << ","; std::cout << std::endl; } std::cout << " ]" << std::endl << "})" << std::endl; } catch (const Xapian::Error &e) // Xapian exception { showError(callback,e.get_description()); } catch (...) // Any other exception { showError(callback,"Unknown Exception!"); exit(1); } return 0; }