/// Renames a label. bool XapianIndex::renameLabel(const string &name, const string &newName) { bool renamedLabel = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { string term("XLABEL:"); // Get documents that have this label term += name; for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term); postingIter != pIndex->postlist_end(term); ++postingIter) { Xapian::docid docId = *postingIter; // Get the document Xapian::Document doc = pIndex->get_document(docId); // Remove the term doc.remove_term(term); // ...add the new one doc.add_term(limitTermLength(string("XLABEL:") + newName)); // ...and update the document pIndex->replace_document(docId, doc); } renamedLabel = true; } } catch (const Xapian::Error &error) { cerr << "Couldn't delete label: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't delete label, unknown exception occured" << endl; } pDatabase->unlock(); return renamedLabel; }
/// Removes from a document the terms derived from its stored record.
/**
  * The record held in the document's data is parsed for its caption, url,
  * type, language and timestamp fields. Title postings are removed first,
  * then the location, base file, host, directory, file name, extension,
  * date, language and MIME type terms. Removing a term the document doesn't
  * have is not an error here: the Xapian exception is caught and dropped.
  * @param doc the document to strip; modified in place
  */
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	DocumentInfo docInfo;
	set<string> commonTerms;
	string record(doc.get_data());

	if (record.empty() == true)
	{
		// Nothing else can be worked out, but the magic term still has to go
		try
		{
			doc.remove_term(MAGIC_TERM);
		}
		catch (const Xapian::Error &error)
		{
#ifdef DEBUG
			cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
		}
		return;
	}

	// First, remove the magic term
	commonTerms.insert(MAGIC_TERM);

	string language(StringManip::extractField(record, "language=", "\n"));
	string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

	docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
		StringManip::extractField(record, "url=", "\n"),
		StringManip::extractField(record, "type=", "\n"),
		Languages::toLocale(language));
	// We used to use timestamp prior to 0.60
	if (timestamp.empty() == true)
	{
		string modTime(StringManip::extractField(record, "modtime=", "\n"));
		if (modTime.empty() == false)
		{
			time_t timeT = static_cast<time_t>(atol(modTime.c_str()));
			timestamp = TimeConverter::toTimestamp(timeT);
		}
	}
	docInfo.setTimestamp(timestamp);
	string bytesSize(StringManip::extractField(record, "size=", ""));
	if (bytesSize.empty() == false)
	{
		docInfo.setSize(static_cast<off_t>(atol(bytesSize.c_str())));
	}
	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	string title(docInfo.getTitle());
	if (title.empty() == false)
	{
		Document titleDoc;

		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		// S-prefixed title terms first, then the same tokens without a prefix
		removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
		titleTokens.rewind();
		removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
	}

	// Location
	string location(docInfo.getLocation());
	commonTerms.insert(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
	// Base file
	string::size_type qmPos = location.find('?');
	if ((urlObj.isLocal() == true) &&
		(qmPos != string::npos))
	{
		commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true));
	}
	// Host name
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName, true));
		// One term per trailing part of the host name, split on dots
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...location
	string tree(urlObj.getLocation());
	if (tree.empty() == false)
	{
		commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
		if (tree[0] == '/')
		{
			commonTerms.insert("XDIR:/");
		}
		// One term per leading part of the path, split on slashes
		string::size_type slashPos = tree.find('/', 1);
		while (slashPos != string::npos)
		{
			commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));

			// Next
			slashPos = tree.find('/', slashPos + 1);
		}
	}
	// ...and file name
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		string extension;

		commonTerms.insert(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));

		// Does it have an extension ?
		string::size_type extPos = fileName.rfind('.');
		if ((extPos != string::npos) &&
			(extPos + 1 < fileName.length()))
		{
			extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
		}
		// The extension term is queued even when the extension is empty
		commonTerms.insert(string("XEXT:") + XapianDatabase::limitTermLength(extension));
	}
	// Date terms
	time_t timeT = TimeConverter::fromTimestamp(docInfo.getTimestamp());
	struct tm *pBrokenDown = localtime(&timeT);
	// localtime() returns NULL when the time can't be represented
	if (pBrokenDown != NULL)
	{
		string yyyymmdd = TimeConverter::toYYYYMMDDString(pBrokenDown->tm_year + 1900,
			pBrokenDown->tm_mon + 1, pBrokenDown->tm_mday);
		if (yyyymmdd.length() == 8)
		{
			commonTerms.insert(string("D") + yyyymmdd);
			commonTerms.insert(string("M") + yyyymmdd.substr(0, 6));
			commonTerms.insert(string("Y") + yyyymmdd.substr(0, 4));
		}
	}
	// Language code
	commonTerms.insert(string("L") + Languages::toCode(language));
	// MIME type
	commonTerms.insert(string("T") + docInfo.getType());

	// Best effort: a term the document doesn't carry must not stop the others
	for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
	{
		try
		{
			doc.remove_term(*termIter);
		}
		catch (const Xapian::Error &error)
		{
#ifdef DEBUG
			cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
		}
	}
}
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, const string &language, StemmingMode mode) const { Xapian::TermIterator termListIter = doc.termlist_begin(); Xapian::Stem *pStemmer = NULL; string stemPrefix("Z"); string term; // Do we know what language to use for stemming ? if (language.empty() == false) { try { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } catch (const Xapian::Error &error) { cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl; } } // Stems are Z-prefixed, unless a prefix is already defined if (prefix.empty() == false) { stemPrefix = prefix; } // Get the terms and remove the first posting for each while (tokens.nextToken(term) == true) { bool removeStem = false; if (term.empty() == true) { continue; } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); } else if (mode == STORE_STEM) { removeStem = true; } else if (mode == STORE_BOTH) { // Remove both removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term)); removeStem = true; } // Since stems don't have positional information, we can't simply remove them // since any may appear more than once in the original document // We can only remove those that have some prefix set // Don't stem if the term starts with a digit if ((removeStem == true) && (prefix.empty() == false) && (isdigit((int)term[0]) == 0)) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.remove_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm)); } } if (pStemmer != NULL) { delete pStemmer; } }
/// Removes from a document the terms derived from its stored record.
/**
  * The magic term is removed first. If the document's data is not empty,
  * it is parsed for its caption, url, type, language and timestamp fields;
  * title postings are removed, then the location, host, directory, file
  * name, language and MIME type terms. Removing a term the document doesn't
  * have is not an error here: the Xapian exception is caught and dropped,
  * so one missing term doesn't prevent the removal of the others.
  * @param doc the document to strip; modified in place
  */
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	DocumentInfo docInfo;
	set<string> commonTerms;
	string record(doc.get_data());

	// First, remove the magic term
	try
	{
		doc.remove_term(MAGIC_TERM);
	}
	catch (const Xapian::Error &error)
	{
#ifdef DEBUG
		cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
	}

	if (record.empty() == true)
	{
		// Nothing else we can do
		return;
	}

	string language(StringManip::extractField(record, "language=", ""));
	string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

	docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
		StringManip::extractField(record, "url=", "\n"),
		StringManip::extractField(record, "type=", "\n"),
		Languages::toLocale(language));
	// We used to use timestamp prior to 0.60
	if (timestamp.empty() == true)
	{
		string modTime(StringManip::extractField(record, "modtime=", "\n"));
		if (modTime.empty() == false)
		{
			time_t timeT = static_cast<time_t>(atol(modTime.c_str()));
			timestamp = TimeConverter::toTimestamp(timeT);
		}
	}
	docInfo.setTimestamp(timestamp);
	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	string title(docInfo.getTitle());
	if (title.empty() == false)
	{
		Document titleDoc;

		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		// S-prefixed title terms first, then the same tokens without a prefix
		removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
		titleTokens.rewind();
		removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
	}

	// Location
	commonTerms.insert(limitTermLength(string("U") + docInfo.getLocation(), true));
	// Host name
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		commonTerms.insert(limitTermLength(string("H") + hostName, true));
		// One term per trailing part of the host name, split on dots
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			commonTerms.insert(limitTermLength(string("H") + hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...location
	string tree(urlObj.getLocation());
	if (tree.empty() == false)
	{
		commonTerms.insert(limitTermLength(string("XDIR:") + tree, true));
		// One term per leading part of the path, split on slashes
		string::size_type slashPos = tree.find('/', 1);
		while (slashPos != string::npos)
		{
			commonTerms.insert(limitTermLength(string("XDIR:") + tree.substr(0, slashPos), true));

			// Next
			slashPos = tree.find('/', slashPos + 1);
		}
	}
	// ...and file name
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		commonTerms.insert(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true));
	}
	// Language code
	commonTerms.insert(string("L") + Languages::toCode(language));
	// MIME type
	commonTerms.insert(string("T") + docInfo.getType());

	// Best effort: a term the document doesn't carry must not stop the others
	for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
	{
		try
		{
			doc.remove_term(*termIter);
		}
		catch (const Xapian::Error &error)
		{
#ifdef DEBUG
			cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
		}
	}
}
/// Sets a document's labels. bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels, bool resetLabels) { bool updatedLabels = false; XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false); if (pDatabase == NULL) { cerr << "Bad index " << m_databaseName << endl; return false; } try { Xapian::WritableDatabase *pIndex = pDatabase->writeLock(); if (pIndex != NULL) { Xapian::Document doc = pIndex->get_document(docId); // Reset existing labels ? if (resetLabels == true) { Xapian::TermIterator termIter = pIndex->termlist_begin(docId); if (termIter != pIndex->termlist_end(docId)) { for (termIter.skip_to("XLABEL:"); termIter != pIndex->termlist_end(docId); ++termIter) { // Is this a label ? if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0) { doc.remove_term(*termIter); } } } } // Set new labels for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { if (labelIter->empty() == false) { doc.add_term(limitTermLength(string("XLABEL:") + *labelIter)); } } pIndex->replace_document(docId, doc); updatedLabels = true; } } catch (const Xapian::Error &error) { cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl; } catch (...) { cerr << "Couldn't update document's labels, unknown exception occured" << endl; } pDatabase->unlock(); return updatedLabels; }