void XapianIndex::addTermsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } // Get the terms while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { // R-prefix the raw term doc.add_posting(string("R") + term, termPos); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(limitTermLength(prefix + term), termPos++); } else if (mode == STORE_STEM) { string stemmedTerm = pStemmer->stem_word(term); doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++); } else if (mode == STORE_BOTH) { string stemmedTerm = pStemmer->stem_word(term); // Add both doc.add_posting(limitTermLength(prefix + term), termPos); // ...at the same position doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++); } } #ifdef DEBUG cout << "XapianIndex::addTermsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }
int main(int argc, char **argv) { // Simplest possible options parsing: we just require three or more // parameters. if(argc < 4) { cout << "usage: " << argv[0] << " <path to database> <document data> <document terms>" << endl; exit(1); } // Catch any Xapian::Error exceptions thrown try { // Make the database Xapian::WritableDatabase database(argv[1], Xapian::DB_CREATE_OR_OPEN); // Make the document Xapian::Document newdocument; // Put the data in the document newdocument.set_data(string(argv[2])); // Put the terms into the document for (int i = 3; i < argc; ++i) { newdocument.add_posting(argv[i], i - 2); } // Add the document to the database database.add_document(newdocument); } catch(const Xapian::Error &error) { cout << "Exception: " << error.get_msg() << endl; } }
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string stemPrefix("Z"); string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { try { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } catch (const Xapian::Error &error) { cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl; } } // Stems are Z-prefixed, unless a prefix is already defined if (prefix.empty() == false) { stemPrefix = prefix; } // Get the terms while (tokens.nextToken(term) == true) { bool addStem = false; if (term.empty() == true) { continue; } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); } else if (mode == STORE_STEM) { addStem = true; } else if (mode == STORE_BOTH) { // Add both doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); addStem = true; } // Don't stem if the term starts with a digit if ((addStem == true) && (isdigit((int)term[0]) == 0)) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.add_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm)); } ++termPos; } #ifdef DEBUG cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc, const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const { Xapian::Stem *pStemmer = NULL; string upperCasePrefix("R"); string term; // Do we know what language to use for stemming ? if (m_stemLanguage.empty() == false) { pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage)); } // Terms starting with a capital letter are R-prefixed, unless a prefix is already defined if (prefix.empty() == false) { upperCasePrefix = prefix; } // Get the terms while (tokens.nextToken(term) == true) { if (term.empty() == true) { continue; } // Does it start with a capital letter ? if (isupper((int)term[0]) != 0) { doc.add_posting(upperCasePrefix + XapianDatabase::limitTermLength(term), termPos); } // Lower case the term term = StringManip::toLowerCase(term); // Stem the term ? if ((mode == STORE_UNSTEM) || (pStemmer == NULL)) { doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); } else if (mode == STORE_STEM) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos); } else if (mode == STORE_BOTH) { #if XAPIAN_MAJOR_VERSION==0 string stemmedTerm(pStemmer->stem_word(term)); #else string stemmedTerm((*pStemmer)(term)); #endif // Add both at the same position doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos); if (stemmedTerm != term) { // No point adding the same term twice doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos); } } ++termPos; } #ifdef DEBUG cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl; #endif if (pStemmer != NULL) { delete pStemmer; } }