Ejemplo n.º 1
0
/// Renames a label.
bool XapianIndex::renameLabel(const string &name, const string &newName)
{
	bool renamedLabel = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			string term("XLABEL:");

			// Get documents that have this label
			term += name;
			for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
				postingIter != pIndex->postlist_end(term); ++postingIter)
			{
				Xapian::docid docId = *postingIter;

				// Get the document
				Xapian::Document doc = pIndex->get_document(docId);
				// Remove the term
				doc.remove_term(term);
				// ...add the new one
				doc.add_term(limitTermLength(string("XLABEL:") + newName));
				// ...and update the document
				pIndex->replace_document(docId, doc);
			}

			renamedLabel = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't delete label: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't delete label, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return renamedLabel;
}
Ejemplo n.º 2
0
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
    DocumentInfo docInfo;
    set<string> commonTerms;
    string record(doc.get_data());

    // First, remove the magic term
    commonTerms.insert(MAGIC_TERM);

    if (record.empty() == true)
    {
        // Nothing else we can do
        return;
    }

    string language(StringManip::extractField(record, "language=", "\n"));
    string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

    docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
                           StringManip::extractField(record, "url=", "\n"),
                           StringManip::extractField(record, "type=", "\n"),
                           Languages::toLocale(language));
    // We used to use timestamp prior to 0.60
    if (timestamp.empty() == true)
    {
        string modTime(StringManip::extractField(record, "modtime=", "\n"));
        if (modTime.empty() == false)
        {
            time_t timeT = (time_t )atol(modTime.c_str());
            timestamp = TimeConverter::toTimestamp(timeT);
        }
    }
    docInfo.setTimestamp(timestamp);
    string bytesSize(StringManip::extractField(record, "size=", ""));
    if (bytesSize.empty() == false)
    {
        docInfo.setSize((off_t )atol(bytesSize.c_str()));
    }
    Url urlObj(docInfo.getLocation());

    // FIXME: remove terms extracted from the title if they don't have more than one posting
    string title(docInfo.getTitle());
    if (title.empty() == false)
    {
        Document titleDoc;
        titleDoc.setData(title.c_str(), title.length());
        Tokenizer titleTokens(&titleDoc);
        removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
        titleTokens.rewind();
        removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
    }

    // Location
    string location(docInfo.getLocation());
    commonTerms.insert(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
    // Base file
    string::size_type qmPos = location.find("?");
    if ((urlObj.isLocal() == true) &&
            (qmPos != string::npos))
    {
        commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true));
    }
    // Host name
    string hostName(StringManip::toLowerCase(urlObj.getHost()));
    if (hostName.empty() == false)
    {
        commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName, true));
        string::size_type dotPos = hostName.find('.');
        while (dotPos != string::npos)
        {
            commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));

            // Next
            dotPos = hostName.find('.', dotPos + 1);
        }
    }
    // ...location
    string tree(urlObj.getLocation());
    if (tree.empty() == false)
    {
        commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
        if (tree[0] == '/')
        {
            commonTerms.insert("XDIR:/");
        }
        string::size_type slashPos = tree.find('/', 1);
        while (slashPos != string::npos)
        {
            commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));

            // Next
            slashPos = tree.find('/', slashPos + 1);
        }
    }
    // ...and file name
    string fileName(urlObj.getFile());
    if (fileName.empty() == false)
    {
        string extension;

        commonTerms.insert(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));

        // Does it have an extension ?
        string::size_type extPos = fileName.rfind('.');
        if ((extPos != string::npos) &&
                (extPos + 1 < fileName.length()))
        {
            extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
        }
        commonTerms.insert(string("XEXT:") + XapianDatabase::limitTermLength(extension));
    }
    // Date terms
    time_t timeT = TimeConverter::fromTimestamp(docInfo.getTimestamp());
    struct tm *tm = localtime(&timeT);
    string yyyymmdd = TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
    if (yyyymmdd.length() == 8)
    {
        commonTerms.insert(string("D") + yyyymmdd);
        commonTerms.insert(string("M") + yyyymmdd.substr(0, 6));
        commonTerms.insert(string("Y") + yyyymmdd.substr(0, 4));
    }
    // Language code
    commonTerms.insert(string("L") + Languages::toCode(language));
    // MIME type
    commonTerms.insert(string("T") + docInfo.getType());

    for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
    {
        try
        {
            doc.remove_term(*termIter);
        }
        catch (const Xapian::Error &error)
        {
#ifdef DEBUG
            cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
        }
    }
}
Ejemplo n.º 3
0
/// Removes from doc the first posting of each term produced by tokens.
///
/// Terms are lower-cased before being looked up. Depending on mode, the
/// unstemmed term's first posting is removed and/or its stem is removed.
/// Stems are only removed when prefix is not empty.
///
/// @param tokens the tokenizer to pull terms from.
/// @param doc the document to remove postings from.
/// @param prefix the prefix of the terms to remove, may be empty.
/// @param language if empty, no stemmer is created and terms are handled unstemmed.
/// @param mode which of the unstemmed term and its stem to remove.
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
        const string &prefix, const string &language, StemmingMode mode) const
{
    // Owns the stemmer, if any, so that it is released on every exit path,
    // including when an exception propagates out of the loop below
    struct StemmerOwner
    {
        Xapian::Stem *m_pStemmer;

        StemmerOwner() : m_pStemmer(NULL) {}
        ~StemmerOwner() { delete m_pStemmer; }
    } stemmerOwner;
    Xapian::TermIterator termListIter = doc.termlist_begin();
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (language.empty() == false)
    {
        try
        {
            stemmerOwner.m_pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }
    Xapian::Stem *pStemmer = stemmerOwner.m_pStemmer;

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms and remove the first posting for each
    while (tokens.nextToken(term) == true)
    {
        bool removeStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
        }
        else if (mode == STORE_STEM)
        {
            removeStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Remove both
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
            removeStem = true;
        }

        // Since stems don't have positional information, we can't simply remove them
        // since any may appear more than once in the original document
        // We can only remove those that have some prefix set
        // Don't stem if the term starts with a digit
        // The cast to unsigned char matters: passing a negative char value,
        // eg a non-ASCII byte, to isdigit() is undefined behaviour
        if ((removeStem == true) &&
                (prefix.empty() == false) &&
                (isdigit((unsigned char)term[0]) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            try
            {
                doc.remove_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
            }
            catch (const Xapian::Error &error)
            {
                // A stem that can't be removed shouldn't prevent the removal of the others
#ifdef DEBUG
                cout << "XapianIndex::removeFirstPostingsFromDocument: " << error.get_msg() << endl;
#endif
            }
        }
    }
}
Ejemplo n.º 4
0
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	DocumentInfo docInfo;
	string record(doc.get_data());

	// First, remove the magic term
	doc.remove_term(MAGIC_TERM);

	if (record.empty() == true)
        {
		// Nothing else we can do
		return;
	}

	string language(StringManip::extractField(record, "language=", ""));
	string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

	docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
		StringManip::extractField(record, "url=", "\n"),
		StringManip::extractField(record, "type=", "\n"),
		Languages::toLocale(language));
	// We used to use timestamp prior to 0.60
	if (timestamp.empty() == true)
	{
		string modTime(StringManip::extractField(record, "modtime=", "\n"));
		if (modTime.empty() == false)
		{
			time_t timeT = (time_t )atol(modTime.c_str());
			timestamp = TimeConverter::toTimestamp(timeT);
		}
	}
	docInfo.setTimestamp(timestamp);
	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	string title(docInfo.getTitle());
	if (title.empty() == false)
	{
		Document titleDoc;
		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
		titleTokens.rewind();
		removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
	}

	// Title
	doc.remove_term(limitTermLength(string("U") + docInfo.getLocation(), true));
	// Host name
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		doc.remove_term(limitTermLength(string("H") + hostName, true));
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			doc.remove_term(limitTermLength(string("H") + hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...location
	string tree(urlObj.getLocation());
	if (tree.empty() == false)
	{
		doc.remove_term(limitTermLength(string("XDIR:") + tree, true));
		string::size_type slashPos = tree.find('/', 1);
		while (slashPos != string::npos)
		{
			doc.remove_term(limitTermLength(string("XDIR:") + tree.substr(0, slashPos), true));

			// Next
			slashPos = tree.find('/', slashPos + 1);
		}
	}
	// ...and file name
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		doc.remove_term(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true));
	}
	// Language code
	doc.remove_term(string("L") + Languages::toCode(language));
	// MIME type
	doc.remove_term(string("T") + docInfo.getType());
}
Ejemplo n.º 5
0
/// Sets a document's labels.
bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels,
	bool resetLabels)
{
	bool updatedLabels = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Reset existing labels ?
			if (resetLabels == true)
			{
				Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
				if (termIter != pIndex->termlist_end(docId))
				{
					for (termIter.skip_to("XLABEL:");
						termIter != pIndex->termlist_end(docId); ++termIter)
					{
						// Is this a label ?
						if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0)
						{
							doc.remove_term(*termIter);
						}
					}
				}
			}

			// Set new labels
			for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
				++labelIter)
			{
				if (labelIter->empty() == false)
				{
					doc.add_term(limitTermLength(string("XLABEL:") + *labelIter));
				}
			}

			pIndex->replace_document(docId, doc);
			updatedLabels = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document's labels, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updatedLabels;
}