Пример #1
0
// Adds the tokens produced by a tokenizer to a document, as postings.
//
// Each non-empty token is lower-cased and posted at the current position,
// either as is, stemmed, or both, depending on mode. Tokens that start with
// an upper-case letter are also posted in their original case with an "R"
// prefix, at the same position.
//
// tokens  - consumed until nextToken() returns false
// doc     - document that receives the postings
// prefix  - prepended to every lower-cased or stemmed term
// termPos - position of the next posting; incremented once for every token
//           that gets posted, so that the caller can carry on from there
// mode    - STORE_UNSTEM, STORE_STEM or STORE_BOTH; terms are stored
//           unstemmed whenever m_stemLanguage is empty
void XapianIndex::addTermsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	// Owns the stemmer so that it is released on every exit path,
	// including when the stemmer or add_posting() throws
	struct StemmerOwner
	{
		Xapian::Stem *m_pStemmer;

		StemmerOwner() : m_pStemmer(NULL) {}
		~StemmerOwner() { delete m_pStemmer; }
	} stemmerOwner;
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		stemmerOwner.m_pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}
	Xapian::Stem *pStemmer = stemmerOwner.m_pStemmer;

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		// isupper() is only defined for unsigned char values and EOF, so go
		// through unsigned char to be safe with non-ASCII bytes
		if (isupper(static_cast<unsigned char>(term[0])) != 0)
		{
			// R-prefix the raw term, length-limited like all other terms
			doc.add_posting(limitTermLength(string("R") + term), termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(limitTermLength(prefix + term), termPos++);
		}
		else if (mode == STORE_STEM)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++);
		}
		else if (mode == STORE_BOTH)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			// Add both at the same position
			doc.add_posting(limitTermLength(prefix + term), termPos);
			if (stemmedTerm != term)
			{
				// No point adding the same term twice, it would only
				// bump its frequency in the document
				doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos);
			}
			++termPos;
		}
	}
#ifdef DEBUG
	cout << "XapianIndex::addTermsToDocument: added " << termPos << " terms" << endl;
#endif
}
Пример #2
0
int main(int argc, char **argv)
{
    // Simplest possible options parsing: we just require three or more
    // parameters.
    if(argc < 4) {
        cout << "usage: " << argv[0] <<
	    " <path to database> <document data> <document terms>" << endl;
        exit(1);
    }

    // Catch any Xapian::Error exceptions thrown
    try {
        // Make the database
	Xapian::WritableDatabase database(argv[1], Xapian::DB_CREATE_OR_OPEN);

        // Make the document
	Xapian::Document newdocument;

        // Put the data in the document
        newdocument.set_data(string(argv[2]));

        // Put the terms into the document
        for (int i = 3; i < argc; ++i) {
            newdocument.add_posting(argv[i], i - 2);
        }

        // Add the document to the database
        database.add_document(newdocument);
    } catch(const Xapian::Error &error) {
        cout << "Exception: "  << error.get_msg() << endl;
    }
}
Пример #3
0
// Adds the tokens produced by a tokenizer to a document.
//
// Each non-empty token is lower-cased. Depending on mode, it is posted as is
// at the current position, and/or its stem is added as a position-less term.
// Stems are prefixed with "Z", or with prefix if one is given.
//
// tokens  - consumed until nextToken() returns false
// doc     - document that receives the postings and terms
// prefix  - prepended to unstemmed terms; also replaces "Z" for stems when
//           not empty
// termPos - position of the next posting; incremented once for every
//           non-empty token
// mode    - STORE_UNSTEM, STORE_STEM or STORE_BOTH; terms are stored
//           unstemmed whenever no stemmer could be set up
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
                                        const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
    // Owns the stemmer so that it is released on every exit path,
    // including when the stemmer or the document throws
    struct StemmerOwner
    {
        Xapian::Stem *m_pStemmer;

        StemmerOwner() : m_pStemmer(NULL) {}
        ~StemmerOwner() { delete m_pStemmer; }
    } stemmerOwner;
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (m_stemLanguage.empty() == false)
    {
        try
        {
            stemmerOwner.m_pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
        }
        catch (const Xapian::Error &error)
        {
            // Carry on without a stemmer
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }
    Xapian::Stem *pStemmer = stemmerOwner.m_pStemmer;

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms
    while (tokens.nextToken(term) == true)
    {
        bool addStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
        }
        else if (mode == STORE_STEM)
        {
            addStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Add both
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
            addStem = true;
        }

        // Don't stem if the term starts with a digit
        // isdigit() is only defined for unsigned char values and EOF, so go
        // through unsigned char to be safe with non-ASCII bytes
        // NOTE(review): with STORE_STEM, a term that starts with a digit is
        // not added to the document at all - confirm this is intended
        if ((addStem == true) &&
                (isdigit(static_cast<unsigned char>(term[0])) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            // Stems carry no positional information
            doc.add_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
        }

        ++termPos;
    }
#ifdef DEBUG
    cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif
}
Пример #4
0
// Adds the tokens produced by a tokenizer to a document, as postings.
//
// Each non-empty token is lower-cased and posted at the current position,
// either as is, stemmed, or both, depending on mode. Tokens that start with
// an upper-case letter are also posted in their original case at the same
// position, prefixed with "R", or with prefix if one is given.
//
// tokens  - consumed until nextToken() returns false
// doc     - document that receives the postings
// prefix  - prepended to every term; also replaces "R" for capitalized
//           terms when not empty
// termPos - position of the next posting; incremented once for every
//           non-empty token
// mode    - STORE_UNSTEM, STORE_STEM or STORE_BOTH; terms are stored
//           unstemmed whenever m_stemLanguage is empty
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	// Owns the stemmer so that it is released on every exit path,
	// including when the stemmer or add_posting() throws
	struct StemmerOwner
	{
		Xapian::Stem *m_pStemmer;

		StemmerOwner() : m_pStemmer(NULL) {}
		~StemmerOwner() { delete m_pStemmer; }
	} stemmerOwner;
	string upperCasePrefix("R");
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		stemmerOwner.m_pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}
	Xapian::Stem *pStemmer = stemmerOwner.m_pStemmer;

	// Terms starting with a capital letter are R-prefixed, unless a prefix is already defined
	if (prefix.empty() == false)
	{
		upperCasePrefix = prefix;
	}

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		// isupper() is only defined for unsigned char values and EOF, so go
		// through unsigned char to be safe with non-ASCII bytes
		if (isupper(static_cast<unsigned char>(term[0])) != 0)
		{
			doc.add_posting(upperCasePrefix + XapianDatabase::limitTermLength(term), termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
		}
		else if (mode == STORE_STEM)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
		}
		else if (mode == STORE_BOTH)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			// Add both at the same position
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
			if (stemmedTerm != term)
			{
				// No point adding the same term twice
				doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
			}
		}

		++termPos;
	}
#ifdef DEBUG
	cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif
}