Ejemplo n.º 1
0
int main(int argc, char **argv)
{
    // Simplest possible options parsing: we just require three or more
    // parameters.
    if(argc < 4) {
        cout << "usage: " << argv[0] <<
	    " <path to database> <document data> <document terms>" << endl;
        exit(1);
    }

    // Catch any Xapian::Error exceptions thrown
    try {
        // Make the database
	Xapian::WritableDatabase database(argv[1], Xapian::DB_CREATE_OR_OPEN);

        // Make the document
	Xapian::Document newdocument;

        // Put the data in the document
        newdocument.set_data(string(argv[2]));

        // Put the terms into the document
        for (int i = 3; i < argc; ++i) {
            newdocument.add_posting(argv[i], i - 2);
        }

        // Add the document to the database
        database.add_document(newdocument);
    } catch(const Xapian::Error &error) {
        cout << "Exception: "  << error.get_msg() << endl;
    }
}
Ejemplo n.º 2
0
void XapianIndex::addTermsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	Xapian::Stem *pStemmer = NULL;
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			// R-prefix the raw term
			doc.add_posting(string("R") + term, termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(limitTermLength(prefix + term), termPos++);
		}
		else if (mode == STORE_STEM)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++);
		}
		else if (mode == STORE_BOTH)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			// Add both
			doc.add_posting(limitTermLength(prefix + term), termPos);
			// ...at the same position
			doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++);
		}
	}
#ifdef DEBUG
	cout << "XapianIndex::addTermsToDocument: added " << termPos << " terms" << endl;
#endif

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
Ejemplo n.º 3
0
/// Returns a document's properties.
bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const
{
	bool foundDocument = false;

	if (docId == 0)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Get the current document data
			string record = doc.get_data();
			if (record.empty() == false)
			{
				string language = Languages::toLocale(StringManip::extractField(record, "language=", ""));

				docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
					StringManip::extractField(record, "url=", "\n"),
					StringManip::extractField(record, "type=", "\n"),
					language);
				docInfo.setTimestamp(StringManip::extractField(record, "timestamp=", "\n"));
#ifdef DEBUG
				cout << "XapianIndex::getDocumentInfo: language is "
					<< docInfo.getLanguage() << endl;
#endif
				foundDocument = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get document properties: " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get document properties, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return foundDocument;
}
Ejemplo n.º 4
0
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc,
                                  const string &language) const
{
    time_t timeT = TimeConverter::fromTimestamp(info.getTimestamp());

    // Add this value to allow sorting by date
    doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT));

    DocumentInfo docCopy(info);
    docCopy.setLanguage(language);
    doc.set_data(XapianDatabase::propsToRecord(&docCopy));
}
Ejemplo n.º 5
0
/// Renames a label.
bool XapianIndex::renameLabel(const string &name, const string &newName)
{
	bool renamedLabel = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			string term("XLABEL:");

			// Get documents that have this label
			term += name;
			for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
				postingIter != pIndex->postlist_end(term); ++postingIter)
			{
				Xapian::docid docId = *postingIter;

				// Get the document
				Xapian::Document doc = pIndex->get_document(docId);
				// Remove the term
				doc.remove_term(term);
				// ...add the new one
				doc.add_term(limitTermLength(string("XLABEL:") + newName));
				// ...and update the document
				pIndex->replace_document(docId, doc);
			}

			renamedLabel = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't delete label: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't delete label, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return renamedLabel;
}
Ejemplo n.º 6
0
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc,
	const string &language) const
{
	string title(info.getTitle());
	string timestamp(info.getTimestamp());
	char timeStr[64];
	time_t timeT = TimeConverter::fromTimestamp(timestamp);

	// Set the document data omindex-style
	string record = "url=";
	record += info.getLocation();
	// The sample will be generated at query time
	record += "\nsample=";
	record += "\ncaption=";
	if (badField(title) == true)
	{
		// Modify the title if necessary
		string::size_type pos = title.find("=");
		while (pos != string::npos)
		{
			title[pos] = ' ';
			pos = title.find("=", pos + 1);
		}
#ifdef DEBUG
		cout << "XapianIndex::setDocumentData: modified title" << endl;
#endif
	}
	record += title;
	record += "\ntype=";
	record += info.getType();
	// Append a timestamp, in a format compatible with Omega
	record += "\nmodtime=";
	snprintf(timeStr, 64, "%ld", timeT);
	record += timeStr;
	// ...and the language
	record += "\nlanguage=";
	record += StringManip::toLowerCase(language);
#ifdef DEBUG
	cout << "XapianIndex::setDocumentData: document data is " << record << endl;
#endif
	doc.set_data(record);

	// Add this value to allow sorting by date
	doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT));
}
Ejemplo n.º 7
0
void HistoryLogger::saveMessage(const Message* message)
{
	if (message->flags() & MESSAGE_FLAG_ALARM)
		return;
	
	Xapian::Document doc;
	
	quint32 flags = message->flags();
	std::string plainText(message->plainText().toUtf8());
	std::string confUser(message->getConfUser().constData());

	std::string data;
	if (flags & MESSAGE_FLAG_RTF)
		data = message->rtfText().constData();
	else
		data = plainText;

	std::cout << "HistoryLogger::saveMessage data = " << data << std::endl;
	doc.set_data(data);

	Xapian::TermGenerator termGen;
	termGen.set_stemmer(Xapian::Stem("ru"));
	termGen.set_document(doc);
	termGen.index_text(plainText);

	doc.add_value(0, message->dateTime().toString("yyyyMMdd").toStdString());
	doc.add_value(1, message->dateTime().toString("hhmmss").toStdString());
	doc.add_value(2, QString::number(flags, 16).toStdString());
	doc.add_value(3, message->type() == Message::Outgoing? "o" : "i");
	doc.add_value(4, confUser);

	database->add_document(doc);
	database->flush();
}
Ejemplo n.º 8
0
void XapianIndex::setDocumentData(Xapian::Document &doc, const DocumentInfo &info,
	const string &language) const
{
	string title(info.getTitle());
	string timestamp(info.getTimestamp());
	char timeStr[64];

	// Set the document data omindex-style
	string record = "url=";
	record += info.getLocation();
	// The sample will be generated at query time
	record += "\nsample=";
	record += "\ncaption=";
	if (badField(title) == true)
	{
		// Modify the title if necessary
		string::size_type pos = title.find("=");
		while (pos != string::npos)
		{
			title[pos] = ' ';
			pos = title.find("=", pos + 1);
		}
#ifdef DEBUG
		cout << "XapianIndex::setDocumentData: modified title" << endl;
#endif
	}
	record += title;
	record += "\ntype=";
	record += info.getType();
	// Append a timestamp
	record += "\ntimestamp=";
	record += timestamp;
	// ...and the language
	record += "\nlanguage=";
	record += language;
#ifdef DEBUG
	cout << "XapianIndex::setDocumentData: document data is " << record << endl;
#endif
	doc.set_data(record);

	// Add this value to allow sorting by date
	snprintf(timeStr, 64, "%d", TimeConverter::fromTimestamp(timestamp));
	doc.add_value(0, timeStr);
}
Ejemplo n.º 9
0
int main(int argc, char **argv)
{
    // Simplest possible options parsing: we just require two or more
    // parameters.
    if (argc < 3) {
        cout << "usage: " << argv[0] << " <path to database> <search terms>" << endl;
        exit(1);
    }
 
    // Catch any Xapian::Error exceptions thrown
    try {
        // Make the database
	Xapian::Database db(argv[1]);
 
        // Start an enquire session
	Xapian::Enquire enquire(db);
         
        // Set percent and/or weight cutoffs
        enquire.set_cutoff(90,0.2);
         
        // Set weighting schema
        BM25Weight bm1(1.0,0.0,1.0,0.5,0.3);
        enquire.set_weighting_scheme(bm1);
 
        // Build the query object
	Xapian::Query query(Xapian::Query::OP_AND, argv + 2, argv + argc);
        cout << "Performing query" << query.get_description() << "'" << endl;
	
        // Set Stopper
        string stop[8]={"的","了","呵","吧","就","你","我","他"};
        SimpleStopper *ss=new SimpleStopper;
        for(int i=0;i<8;i++){
            ss->add(stop[i]);
        }
        QueryParser qparser;
        qparser.set_stopper(ss);
        qparser.set_database(db);
 
        // Give the query object to the enquire session
        enquire.set_query(query);
 
        // Get the top 10 results of the query
	Xapian::MSet matches = enquire.get_mset(0, 10);                     //最多返回10个文档
 
        // Display the results
        cout << matches.size() << " results found" << endl;
 
        for (Xapian::MSetIterator i = matches.begin();i != matches.end(); ++i) {
	    Xapian::Document doc = i.get_document();
            cout << "Document ID " << *i << "\nPercent " <<i.get_percent() << "%\n" << doc.get_data() << "\n" << endl;
        }
        db.close();
    } catch(const Xapian::Error &error) {
        cout << "Exception: "  << error.get_msg() << endl;
    }
}
Ejemplo n.º 10
0
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, const string &language, StemmingMode mode) const
{
	Xapian::TermIterator termListIter = doc.termlist_begin();
	Xapian::Stem *pStemmer = NULL;
	string term;

	// Do we know what language to use for stemming ?
	if (language.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(language));
	}

	// Get the terms and remove the first posting for each
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			// R-prefix the raw term
			removeFirstPosting(doc, termListIter, string("R") + term);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			removeFirstPosting(doc, termListIter, limitTermLength(prefix + term));
		}
		else if (mode == STORE_STEM)
		{
			removeFirstPosting(doc, termListIter, limitTermLength(prefix + pStemmer->stem_word(term)));
		}
		else if (mode == STORE_BOTH)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			removeFirstPosting(doc, termListIter, limitTermLength(prefix + term));
			if (stemmedTerm != term)
			{
				removeFirstPosting(doc, termListIter, limitTermLength(prefix + stemmedTerm));
			}
		}
	}

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
Ejemplo n.º 11
0
QImage ThumbnailProvider::requestImage(const QString &id, QSize *size, const QSize &requestedSize)
{
    QImage image;

    if (m_thumb32->findImage(id, &image)) {
        return image;
    } else {
        QString filePath;
        if (id.at(0) == QLatin1Char('Q')) {
            Xapian::Document doc = m_xapianDB->findDocument(id);
            if (doc.get_docid() == 0) {
                return image;
            } else {
                filePath = QString::fromStdString(doc.get_value(Database::FilePath));
            }
        } else {
            filePath = id;
        }

        // Load thumbnail
//        KExiv2Iface::KExiv2Previews preview(filePath);
        KExiv2Iface::KExiv2	preview(filePath);
        image = preview.getExifThumbnail(true);
        if (image.isNull()) {
//            image = preview.image();
//        } else {
            // Store thumbnail
            // TODO smooth or fast?
            image = QImage(filePath).scaled(160, 120, Qt::KeepAspectRatio);
//            preview.
            kWarning() << "Could not find preview image for" << filePath << image.isNull();
        }

        // Store the thumbnail into the cache file
        if (m_thumb32->insertImage(id, image)) {
            kWarning() << "Added preview for" << image.byteCount() << filePath << id;
        } else {
            kWarning() << "FAILED to add preview for" << filePath << id;
        }
    }

    return image;
}
Ejemplo n.º 12
0
QString EmailSearchStore::text(int queryId)
{
    Xapian::Document doc = docForQuery(queryId);

    QMutexLocker lock(&m_mutex);
    std::string data;
    try {
        data = doc.get_data();
    } catch (const Xapian::Error &) {
        // Nothing to do, move along
    }

    QString subject = QString::fromUtf8(data.c_str(), data.length());
    if (subject.isEmpty()) {
        return QStringLiteral("No Subject");
    }

    return subject;
}
Ejemplo n.º 13
0
bool XapianIndex::prepareDocument(const DocumentInfo &info, Xapian::Document &doc,
	Xapian::termcount &termPos) const
{
	string title(info.getTitle());
	string location(info.getLocation());
	Url urlObj(location);

	// Add a magic term :-)
	doc.add_term(MAGIC_TERM);

	// Index the title with and without prefix S
	if (title.empty() == false)
	{
		Document titleDoc;
		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		addTermsToDocument(titleTokens, doc, "S", termPos, STORE_UNSTEM);
		titleTokens.rewind();
		addTermsToDocument(titleTokens, doc, "", termPos, m_stemMode);
	}

	// Index the full URL with prefix U
	doc.add_term(limitTermLength(string("U") + location, true));
	// ...the host name and included domains with prefix H
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		doc.add_term(limitTermLength(string("H") + hostName, true));
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			doc.add_term(limitTermLength(string("H") + hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...and the file name with prefix P
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		doc.add_term(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true));
	}
	// Finally, add the language code with prefix L
	doc.add_term(string("L") + Languages::toCode(m_stemLanguage));

	setDocumentData(doc, info, m_stemLanguage);

	return true;
}
Ejemplo n.º 14
0
static void removeFirstPosting(Xapian::Document &doc,
                               Xapian::TermIterator &termListIter, const string &term)
{
    termListIter.skip_to(term);

    Xapian::PositionIterator firstPosIter = termListIter.positionlist_begin();
    if (firstPosIter != termListIter.positionlist_end())
    {
        try
        {
            doc.remove_posting(term, *firstPosIter);
        }
        catch (const Xapian::Error &error)
        {
            // This posting may have been removed already
#ifdef DEBUG
            cout << "XapianIndex::removeFirstPosting: " << error.get_msg() << endl;
#endif
        }
    }
}
Indexer::Indexer(const string &datapath, const string &dbpath)
{
    // Hardcode field offsets for simplicity.
    const size_t FIELD_ID_NUMBER = 0;
    const size_t FIELD_TITLE = 2;
    const size_t FIELD_DESCRIPTION = 8;

    // Create or open the database we're going to be writing to.
    Xapian::WritableDatabase db(dbpath, Xapian::DB_CREATE_OR_OPEN);

    // Set up a TermGenerator that we'll use in indexing.
    Xapian::TermGenerator termgenerator;
    termgenerator.set_stemmer(Xapian::Stem("en"));

    ifstream csv(datapath.c_str());
    vector<string> fields;
    csv_parse_line(csv, fields);

    // Check the CSV header line matches our hard-code offsets.
    if (fields.at(FIELD_ID_NUMBER) != "id_NUMBER" ||
    fields.at(FIELD_TITLE) != "TITLE" ||
    fields.at(FIELD_DESCRIPTION) != "DESCRIPTION") {
    // The CSV format doesn't match what we expect.
    cerr << "CSV format has changed!" << endl;
    exit(1);
    }

    while (csv_parse_line(csv, fields)) {
    // 'fields' is a vector mapping from field number to value.
    // We look up fields with the 'at' method so we get an exception
    // if that field isn't set.
    //
    // We're just going to use DESCRIPTION, TITLE and id_NUMBER.
    const string & description = fields.at(FIELD_DESCRIPTION);
    const string & title = fields.at(FIELD_TITLE);
    const string & identifier = fields.at(FIELD_ID_NUMBER);

    // We make a document and tell the term generator to use this.
    Xapian::Document doc;
    termgenerator.set_document(doc);

    // Index each field with a suitable prefix.
    termgenerator.index_text(title, 1, "S");
    termgenerator.index_text(description, 1, "XD");

    // Index fields without prefixes for general search.
    termgenerator.index_text(title);
    termgenerator.increase_termpos();
    termgenerator.index_text(description);

    // Store all the fields for display purposes.
    doc.set_data(identifier + "\n" + title + "\n" + description);

    // We use the identifier to ensure each object ends up in the
    // database only once no matter how many times we run the
    // indexer.
    string idterm = "Q" + identifier;
    doc.add_boolean_term(idterm);
    db.replace_document(idterm, doc);
    }
}
Ejemplo n.º 16
0
    void QueryHandler(const QueryMessage &message, const Theron::Address from)
        {
            search::QueryInfo qi=*(message.query);
            std::string resKey(message.resKey);
            delete message.query;
            std::string segString;
            char *output=new char[qi.query.length()*9];
            char *input=new char[qi.query.length()*3];
            memset(output,0,qi.query.length()*9);
            memset(input,0,qi.query.length()*3);
            try 
            {
                UErrorCode  error = U_ZERO_ERROR;
                ucnv_convert("GBK","UTF-8",input,  qi.query.length()*3, qi.query.c_str(), qi.query.length(), &error );
                
                
                bool ret = result->ParagraphProcessing(input, output);
                if (ret)
                {
                    int oLen=strlen(output);
                    char *utf8out=new char[oLen*3];
                    memset(utf8out,0,oLen*3);
                    ucnv_convert("UTF-8","GBK",utf8out,  oLen*3, output, oLen, &error );
                    
                    segString=std::string(utf8out);
                    delete [] utf8out;
                }
            }
            catch (...) {
            }
            delete [] output;
            delete [] input;
            std::list<std::string> segList;
            if(segString.length()>0)
            {
                std::vector<std::string> resv;
                boost::algorithm::split( resv, segString, boost::algorithm::is_any_of(" ") );
                for(std::vector<std::string>::iterator it=resv.begin();it!=resv.end();++it)
                {
                    std::vector<std::string> tmpv;
                    boost::algorithm::split( tmpv, *it, boost::algorithm::is_any_of("/") );
                    if(tmpv.size()>1&&tmpv[1]!="w")
                        segList.push_back(std::string("K")+tmpv[0]);
                }
            }
            search::DocList *dList=new search::DocList();
            if(segList.size()>0)
            {
                Xapian::Query query(Xapian::Query::OP_AND,segList.begin(), segList.end());
                
                while(1)
                {
                    try
                    {
                        db.reopen();
                        Xapian::Enquire  enquire(db);
                        enquire.set_query(query);
                        Xapian::MSet matches = enquire.get_mset(0, 100);
                        for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) {
                            Xapian::Document doc = i.get_document();
                            search::IndexInfo info;
                            info.uid=doc.get_value(1);
                            info.attMap.insert(std::make_pair(std::string("title"),doc.get_value(2)));
                            info.content=doc.get_data();
                            dList->docList.push_back(info);
                        }
                        std::cout<<"doc size:"<<dList->docList.size()<<std::endl;
                        break;
                    }catch(Xapian::DatabaseModifiedError exception)
                    {
                        std::cout<<"try agian"<<std::endl;
                    }catch(...)
                    {
                        break;
                    }
                    
                }
                

            }
            Send(QueryResponceMessage(dList,resKey.c_str()), from);
            
        }
Ejemplo n.º 17
0
/// Indexes the given data.
bool XapianIndex::indexDocument(Tokenizer &tokens, const std::set<std::string> &labels,
	unsigned int &docId)
{
	unsigned int dataLength = 0;
	bool indexed = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		// Get the document
		const Document *pDocument = tokens.getDocument();
		if (pDocument == NULL)
		{
#ifdef DEBUG
			cout << "XapianIndex::indexDocument: no document" << endl;
#endif
			return false;
		}

		// Cache the document's properties
		DocumentInfo docInfo(pDocument->getTitle(), pDocument->getLocation(),
			pDocument->getType(), pDocument->getLanguage());
		docInfo.setTimestamp(pDocument->getTimestamp());
		docInfo.setLocation(Url::canonicalizeUrl(docInfo.getLocation()));

		const char *pData = pDocument->getData(dataLength);
		if (pData != NULL)
		{
			m_stemLanguage = scanDocument(pData, dataLength, docInfo);
		}

		Xapian::Document doc;
		Xapian::termcount termPos = 0;

#ifdef DEBUG
		cout << "XapianIndex::indexDocument: adding terms" << endl;
#endif
		// Add the tokenizer's terms to the Xapian document
		addPostingsToDocument(tokens, doc, "", termPos, m_stemMode);
		// Add labels
		for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
			++labelIter)
		{
			doc.add_term(limitTermLength(string("XLABEL:") + *labelIter));
		}
		if (addCommonTerms(docInfo, doc, termPos) == true)
		{
			setDocumentData(docInfo, doc, m_stemLanguage);

			Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
			if (pIndex != NULL)
			{
				// Add this document to the Xapian index
				docId = pIndex->add_document(doc);
				indexed = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't index document: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't index document, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return indexed;
}
Ejemplo n.º 18
0
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	Xapian::Stem *pStemmer = NULL;
	string upperCasePrefix("R");
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}

	// Terms starting with a capital letter are R-prefixed, unless a prefix is already defined
	if (prefix.empty() == false)
	{
		upperCasePrefix = prefix;
	}

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			doc.add_posting(upperCasePrefix + XapianDatabase::limitTermLength(term), termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
		}
		else if (mode == STORE_STEM)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
		}
		else if (mode == STORE_BOTH)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			// Add both at the same position
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
			if (stemmedTerm != term)
			{
				// No point adding the same term twice
				doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
			}
		}

		++termPos;
	}
#ifdef DEBUG
	cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
Ejemplo n.º 19
0
void XapianIndex::addCommonTerms(const DocumentInfo &info, Xapian::Document &doc,
                                 Xapian::termcount &termPos) const
{
    string title(info.getTitle());
    string location(info.getLocation());
    Url urlObj(location);

    // Add a magic term :-)
    doc.add_term(MAGIC_TERM);

    // Index the title with and without prefix S
    if (title.empty() == false)
    {
        Document titleDoc;
        titleDoc.setData(title.c_str(), title.length());
        Tokenizer titleTokens(&titleDoc);
        addPostingsToDocument(titleTokens, doc, "S", termPos, STORE_UNSTEM);
        titleTokens.rewind();
        addPostingsToDocument(titleTokens, doc, "", termPos, m_stemMode);
    }

    // Index the full URL with prefix U
    doc.add_term(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
    // ...the base file with XFILE:
    string::size_type qmPos = location.find("?");
    if ((urlObj.isLocal() == true) &&
            (qmPos != string::npos))
    {
        doc.add_term(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true));
    }
    // ...the host name and included domains with prefix H
    string hostName(StringManip::toLowerCase(urlObj.getHost()));
    if (hostName.empty() == false)
    {
        doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName, true));
        string::size_type dotPos = hostName.find('.');
        while (dotPos != string::npos)
        {
            doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));

            // Next
            dotPos = hostName.find('.', dotPos + 1);
        }
    }
    // ...the location (as is) and all directories with prefix XDIR:
    string tree(urlObj.getLocation());
    if (tree.empty() == false)
    {
        doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
        if (tree[0] == '/')
        {
            doc.add_term("XDIR:/");
        }
        string::size_type slashPos = tree.find('/', 1);
        while (slashPos != string::npos)
        {
            doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));

            // Next
            slashPos = tree.find('/', slashPos + 1);
        }
    }
    // ...and the file name with prefix P
    string fileName(urlObj.getFile());
    if (fileName.empty() == false)
    {
        string extension;

        doc.add_term(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));

        // Does it have an extension ?
        string::size_type extPos = fileName.rfind('.');
        if ((extPos != string::npos) &&
                (extPos + 1 < fileName.length()))
        {
            extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
        }
        doc.add_term(string("XEXT:") + XapianDatabase::limitTermLength(extension));
    }
    // Add the date terms D, M and Y
    time_t timeT = TimeConverter::fromTimestamp(info.getTimestamp());
    struct tm *tm = localtime(&timeT);
    string yyyymmdd = TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
    if (yyyymmdd.length() == 8)
    {
        doc.add_term(string("D") + yyyymmdd);
        doc.add_term(string("M") + yyyymmdd.substr(0, 6));
        doc.add_term(string("Y") + yyyymmdd.substr(0, 4));
    }
    // Finally, add the language code with prefix L
    doc.add_term(string("L") + Languages::toCode(m_stemLanguage));
    // ...and the MIME type with prefix T
    doc.add_term(string("T") + info.getType());
}
Ejemplo n.º 20
0
/// Sets a document's labels.
bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels,
	bool resetLabels)
{
	bool updatedLabels = false;

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Reset existing labels ?
			if (resetLabels == true)
			{
				Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
				if (termIter != pIndex->termlist_end(docId))
				{
					for (termIter.skip_to("XLABEL:");
						termIter != pIndex->termlist_end(docId); ++termIter)
					{
						// Is this a label ?
						if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0)
						{
							doc.remove_term(*termIter);
						}
					}
				}
			}

			// Set new labels
			for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
				++labelIter)
			{
				if (labelIter->empty() == false)
				{
					doc.add_term(limitTermLength(string("XLABEL:") + *labelIter));
				}
			}

			pIndex->replace_document(docId, doc);
			updatedLabels = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document's labels, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updatedLabels;
}
Ejemplo n.º 21
0
/// Updates the given document; true if success.
bool XapianIndex::updateDocument(unsigned int docId, Tokenizer &tokens)
{
	unsigned int dataLength = 0;
	bool updated = false;

	const Document *pDocument = tokens.getDocument();
	if (pDocument == NULL)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	const char *pData = pDocument->getData(dataLength);
	if (pData == NULL)
	{
		return false;
	}

	// Cache the document's properties
	DocumentInfo docInfo(pDocument->getTitle(), pDocument->getLocation(),
		pDocument->getType(), pDocument->getLanguage());
	docInfo.setTimestamp(pDocument->getTimestamp());
	docInfo.setLocation(Url::canonicalizeUrl(docInfo.getLocation()));

	// Don't scan the document if a language is specified
	m_stemLanguage = Languages::toEnglish(pDocument->getLanguage());
	if (m_stemLanguage.empty() == true)
	{
		m_stemLanguage = scanDocument(pData, dataLength, docInfo);
	}

	try
	{
		set<string> labels;
		Xapian::Document doc;
		Xapian::termcount termPos = 0;

		// Add the tokenizer's terms to the document
		addPostingsToDocument(tokens, doc, "", termPos, m_stemMode);
		// Get the document's labels
		if (getDocumentLabels(docId, labels) == true)
		{
			// Add labels
			for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
				++labelIter)
			{
				doc.add_term(limitTermLength(string("XLABEL:") + *labelIter));
			}
		}
		if (addCommonTerms(docInfo, doc, termPos) == true)
		{
			setDocumentData(docInfo, doc, m_stemLanguage);

			Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
			if (pIndex != NULL)
			{
				// Update the document in the database
				pIndex->replace_document(docId, doc);
				updated = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updated;
}
			void query(const std::vector<std::string>& arr_queries, const std::vector<std::string>& arr_selection = {}) {
				
				Xapian::Database databases_ir;
				try {
					Xapian::Database database_ir_object_values(path_database+"object_values/");
					databases_ir.add_database(database_ir_object_values);
				} catch (const Xapian::Error &e) {
					// Database not ready
				}
				try {
					Xapian::Database database_ir_object_descriptions(path_database+"object_descriptions/");
					databases_ir.add_database(database_ir_object_descriptions);
				} catch (const Xapian::Error &e) {
					// Database not ready
				}
				try {
					Xapian::Database database_ir_object_sub_descriptions(path_database+"object_sub_descriptions/");
					databases_ir.add_database(database_ir_object_sub_descriptions);
				} catch (const Xapian::Error &e) {
					// Database not ready
				}
				
				// Filter on Type IDs
				
				Xapian::Query query_ir_identifiers;
				
				if (!arr_selection.empty()) {
					
					std::vector<Xapian::Query> arr_query_identifiers;
					
					for (const auto& str_identifier_field : arr_selection) {

						arr_query_identifiers.push_back(Xapian::Query("T"+str_identifier_field));
					}
					
					query_ir_identifiers = Xapian::Query(Xapian::Query::OP_OR, arr_query_identifiers.begin(), arr_query_identifiers.end());
				}

				Xapian::QueryParser queryparser;
				queryparser.set_database(databases_ir); // Needed to enable specific query flags
				queryparser.set_stemmer(Xapian::Stem("en"));
				queryparser.set_stemming_strategy(queryparser.STEM_SOME);
				queryparser.add_boolean_prefix("identifier", "T");
				//queryparser.add_prefix("value", "SV");
				
				unsigned int count_queries = 0;
				
				for (const auto& str_query : arr_queries) {
					
					const auto query_id = count_queries;
					count_queries++;
						
					Xapian::Query query_ir = queryparser.parse_query(str_query, Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_WILDCARD);
					
					if (!arr_selection.empty()) {
						
						// Update main query
						
						query_ir = Xapian::Query(Xapian::Query::OP_FILTER, query_ir, query_ir_identifiers);
					}
					
					// Run query
					
					Xapian::Enquire enquire(databases_ir);
					enquire.set_query(query_ir);
					
					Xapian::MSet arr_msets = enquire.get_mset(num_offset, num_limit);

					for (Xapian::MSetIterator iterate_arr_mset = arr_msets.begin(); iterate_arr_mset != arr_msets.end(); iterate_arr_mset++) {
											
						//Xapian::docid did = *iterate_arr_mset;
						const int unsigned& nr_rank = iterate_arr_mset.get_rank();
						const int unsigned& nr_weight = iterate_arr_mset.get_weight();
						
						const Xapian::Document doc = iterate_arr_mset.get_document();
						const std::string& str_identifier = doc.get_value(0);
						
						if (map_query_results.find(str_identifier) == map_query_results.end()) {
							
							std::vector<unsigned int> arr_matches;
							
							arr_matches.push_back(query_id);

							const std::string& str_value = (include_value ? doc.get_data() : "");
							
							map_query_results[str_identifier] = std::make_tuple(nr_rank, nr_weight, arr_matches, str_value);
						} else {
							
							type_arr_query_result& arr_query_result = map_query_results[str_identifier];
							
							std::get<0>(arr_query_result) += nr_rank;
							std::get<1>(arr_query_result) += nr_weight;
							
							std::get<2>(arr_query_result).push_back(query_id);
						}
					}
				}
			}
Ejemplo n.º 23
0
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, const string &language, StemmingMode mode) const
{
	Xapian::TermIterator termListIter = doc.termlist_begin();
	Xapian::Stem *pStemmer = NULL;
	string upperCasePrefix("R");
	string term;

	// Do we know what language to use for stemming ?
	if (language.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(language));
	}

	// Terms starting with a capital letter are R-prefixed, unless a prefix is already defined
	if (prefix.empty() == false)
	{
		upperCasePrefix = prefix;
	}

	// Get the terms and remove the first posting for each
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			removeFirstPosting(doc, termListIter, upperCasePrefix + term);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
		}
		else if (mode == STORE_STEM)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm));
		}
		else if (mode == STORE_BOTH)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
			if (stemmedTerm != term)
			{
				removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm));
			}
		}
	}

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
Ejemplo n.º 24
0
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
                                        const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
    Xapian::Stem *pStemmer = NULL;
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (m_stemLanguage.empty() == false)
    {
        try
        {
            pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms
    while (tokens.nextToken(term) == true)
    {
        bool addStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
        }
        else if (mode == STORE_STEM)
        {
            addStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Add both
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
            addStem = true;
        }

        // Don't stem if the term starts with a digit
        if ((addStem == true) &&
                (isdigit((int)term[0]) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            doc.add_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
        }

        ++termPos;
    }
#ifdef DEBUG
    cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif

    if (pStemmer != NULL)
    {
        delete pStemmer;
    }
}
Ejemplo n.º 25
0
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
        const string &prefix, const string &language, StemmingMode mode) const
{
    Xapian::TermIterator termListIter = doc.termlist_begin();
    Xapian::Stem *pStemmer = NULL;
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (language.empty() == false)
    {
        try
        {
            pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms and remove the first posting for each
    while (tokens.nextToken(term) == true)
    {
        bool removeStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
        }
        else if (mode == STORE_STEM)
        {
            removeStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Remove both
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
            removeStem = true;
        }

        // Since stems don't have positional information, we can't simply remove them
        // since any may appear more than once in the original document
        // We can only remove those that have some prefix set
        // Don't stem if the term starts with a digit
        if ((removeStem == true) &&
                (prefix.empty() == false) &&
                (isdigit((int)term[0]) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            doc.remove_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
        }
    }

    if (pStemmer != NULL)
    {
        delete pStemmer;
    }
}
Ejemplo n.º 26
0
int main(int argc, char **argv)
{
    if(argc < 2) {
        usage(argv);
        return 1;
    }

    try {
        char *action = argv[1];
        char *db_path = argv[2];

        if(!strcmp(action, "index")) {
            Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN);

            Xapian::TermGenerator indexer;
            Xapian::Stem stemmer("english");
            indexer.set_stemmer(stemmer);

            std::string doc_txt;
            while(true) {
                if(std::cin.eof()) break;

                std::string line;
                getline(std::cin, line);
                doc_txt += line;
            }

            if(!doc_txt.empty()) {
                Xapian::Document doc;
                doc.set_data(doc_txt);

                indexer.set_document(doc);
                indexer.index_text(doc_txt);

                db.add_document(doc);

                std::cout << "Indexed: " << indexer.get_description() << std::endl;
            }

            db.commit();
        } else if(!strcmp(action, "search")) {
            if(argc < 4) {
                std::cerr << "You must supply a query string" << std::endl;
                return 1;
            }

            Xapian::Database db(db_path);
            Xapian::Enquire enquire(db);

            std::string query_str = argv[3];
            argv+= 4;
            while(*argv) {
                query_str += ' ';
                query_str += *argv++;
            }

            Xapian::QueryParser qp;
            Xapian::Stem stemmer("english");
            qp.set_stemmer(stemmer);
            qp.set_database(db);
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

            Xapian::Query query = qp.parse_query(query_str);
            std::cout << "Parsed query is: " << query.get_description() <<
                         std::endl;

            enquire.set_query(query);
            Xapian::MSet matches = enquire.get_mset(0, 10);

            std::cout << matches.get_matches_estimated() << " results found.\n";
            std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl;

            for (Xapian::MSetIterator i = matches.begin();
                    i != matches.end(); ++i) {
                std::cout << i.get_rank() + 1 << ": " << i.get_percent() <<
                        "% docid=" << *i << " [" <<
                        i.get_document().get_data()<< "]" << std::endl <<
                        std::endl;
            }
        } else {
            std::cerr << "Invalid action " << action << std::endl;
            usage(argv);
            return 1;
        }

    } catch (const Xapian::Error &error) {
        std::cout << "Exception: " << error.get_msg() << std::endl;
    }
}
Ejemplo n.º 27
0
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
    DocumentInfo docInfo;
    set<string> commonTerms;
    string record(doc.get_data());

    // First, remove the magic term
    commonTerms.insert(MAGIC_TERM);

    if (record.empty() == true)
    {
        // Nothing else we can do
        return;
    }

    string language(StringManip::extractField(record, "language=", "\n"));
    string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

    docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
                           StringManip::extractField(record, "url=", "\n"),
                           StringManip::extractField(record, "type=", "\n"),
                           Languages::toLocale(language));
    // We used to use timestamp prior to 0.60
    if (timestamp.empty() == true)
    {
        string modTime(StringManip::extractField(record, "modtime=", "\n"));
        if (modTime.empty() == false)
        {
            time_t timeT = (time_t )atol(modTime.c_str());
            timestamp = TimeConverter::toTimestamp(timeT);
        }
    }
    docInfo.setTimestamp(timestamp);
    string bytesSize(StringManip::extractField(record, "size=", ""));
    if (bytesSize.empty() == false)
    {
        docInfo.setSize((off_t )atol(bytesSize.c_str()));
    }
    Url urlObj(docInfo.getLocation());

    // FIXME: remove terms extracted from the title if they don't have more than one posting
    string title(docInfo.getTitle());
    if (title.empty() == false)
    {
        Document titleDoc;
        titleDoc.setData(title.c_str(), title.length());
        Tokenizer titleTokens(&titleDoc);
        removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
        titleTokens.rewind();
        removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
    }

    // Location
    string location(docInfo.getLocation());
    commonTerms.insert(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
    // Base file
    string::size_type qmPos = location.find("?");
    if ((urlObj.isLocal() == true) &&
            (qmPos != string::npos))
    {
        commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location.substr(0, qmPos)), true));
    }
    // Host name
    string hostName(StringManip::toLowerCase(urlObj.getHost()));
    if (hostName.empty() == false)
    {
        commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName, true));
        string::size_type dotPos = hostName.find('.');
        while (dotPos != string::npos)
        {
            commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));

            // Next
            dotPos = hostName.find('.', dotPos + 1);
        }
    }
    // ...location
    string tree(urlObj.getLocation());
    if (tree.empty() == false)
    {
        commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
        if (tree[0] == '/')
        {
            commonTerms.insert("XDIR:/");
        }
        string::size_type slashPos = tree.find('/', 1);
        while (slashPos != string::npos)
        {
            commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));

            // Next
            slashPos = tree.find('/', slashPos + 1);
        }
    }
    // ...and file name
    string fileName(urlObj.getFile());
    if (fileName.empty() == false)
    {
        string extension;

        commonTerms.insert(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));

        // Does it have an extension ?
        string::size_type extPos = fileName.rfind('.');
        if ((extPos != string::npos) &&
                (extPos + 1 < fileName.length()))
        {
            extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
        }
        commonTerms.insert(string("XEXT:") + XapianDatabase::limitTermLength(extension));
    }
    // Date terms
    time_t timeT = TimeConverter::fromTimestamp(docInfo.getTimestamp());
    struct tm *tm = localtime(&timeT);
    string yyyymmdd = TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
    if (yyyymmdd.length() == 8)
    {
        commonTerms.insert(string("D") + yyyymmdd);
        commonTerms.insert(string("M") + yyyymmdd.substr(0, 6));
        commonTerms.insert(string("Y") + yyyymmdd.substr(0, 4));
    }
    // Language code
    commonTerms.insert(string("L") + Languages::toCode(language));
    // MIME type
    commonTerms.insert(string("T") + docInfo.getType());

    for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
    {
        try
        {
            doc.remove_term(*termIter);
        }
        catch (const Xapian::Error &error)
        {
#ifdef DEBUG
            cout << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
#endif
        }
    }
}
Ejemplo n.º 28
0
/// Returns a document's properties.
bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const
{
	bool foundDocument = false;

	if (docId == 0)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Get the current document data
			string record = doc.get_data();
			if (record.empty() == false)
			{
				string language(Languages::toLocale(StringManip::extractField(record, "language=", "")));
				// We used to use timestamp prior to 0.60
				string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

				docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
					StringManip::extractField(record, "url=", "\n"),
					StringManip::extractField(record, "type=", "\n"),
					language);
				if (timestamp.empty() == true)
				{
					// This is the format used by Omega
					string modTime(StringManip::extractField(record, "modtime=", "\n"));
					if (modTime.empty() == false)
					{
						time_t timeT = (time_t )atol(modTime.c_str());
						timestamp = TimeConverter::toTimestamp(timeT);
					}
				}
				docInfo.setTimestamp(timestamp);
				foundDocument = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get document properties: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get document properties, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return foundDocument;
}
Ejemplo n.º 29
0
void XapianIndex::removeCommonTerms(Xapian::Document &doc)
{
	DocumentInfo docInfo;
	string record(doc.get_data());

	// First, remove the magic term
	doc.remove_term(MAGIC_TERM);

	if (record.empty() == true)
        {
		// Nothing else we can do
		return;
	}

	string language(StringManip::extractField(record, "language=", ""));
	string timestamp(StringManip::extractField(record, "timestamp=", "\n"));

	docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
		StringManip::extractField(record, "url=", "\n"),
		StringManip::extractField(record, "type=", "\n"),
		Languages::toLocale(language));
	// We used to use timestamp prior to 0.60
	if (timestamp.empty() == true)
	{
		string modTime(StringManip::extractField(record, "modtime=", "\n"));
		if (modTime.empty() == false)
		{
			time_t timeT = (time_t )atol(modTime.c_str());
			timestamp = TimeConverter::toTimestamp(timeT);
		}
	}
	docInfo.setTimestamp(timestamp);
	Url urlObj(docInfo.getLocation());

	// FIXME: remove terms extracted from the title if they don't have more than one posting
	string title(docInfo.getTitle());
	if (title.empty() == false)
	{
		Document titleDoc;
		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		removeFirstPostingsFromDocument(titleTokens, doc, "S", language, STORE_UNSTEM);
		titleTokens.rewind();
		removeFirstPostingsFromDocument(titleTokens, doc, "", language, m_stemMode);
	}

	// Title
	doc.remove_term(limitTermLength(string("U") + docInfo.getLocation(), true));
	// Host name
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		doc.remove_term(limitTermLength(string("H") + hostName, true));
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			doc.remove_term(limitTermLength(string("H") + hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...location
	string tree(urlObj.getLocation());
	if (tree.empty() == false)
	{
		doc.remove_term(limitTermLength(string("XDIR:") + tree, true));
		string::size_type slashPos = tree.find('/', 1);
		while (slashPos != string::npos)
		{
			doc.remove_term(limitTermLength(string("XDIR:") + tree.substr(0, slashPos), true));

			// Next
			slashPos = tree.find('/', slashPos + 1);
		}
	}
	// ...and file name
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		doc.remove_term(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true));
	}
	// Language code
	doc.remove_term(string("L") + Languages::toCode(language));
	// MIME type
	doc.remove_term(string("T") + docInfo.getType());
}
Ejemplo n.º 30
0
/** Main routine */
int main(int argc,char **argv)
{
  // process inputs that were passed to us via QUERY_STRING
  std::cout << "Content-Type:application/javascript;charset=utf-8\r\n\n";
  std::string callback;
  try
  {
    // get input parameters
    const char *queryEnv = getenv("QUERY_STRING");
    std::string queryString;
    if (queryEnv)
    {
      queryString = queryEnv;
    }
    else if (argc>=2)
    {
      queryString = argv[1];
    }
    else
    {
      std::cout << "No input!\n";
      exit(1);
    }

    // parse query string
    std::vector<std::string> parts = split(queryString,'&');
    std::string searchFor,callback;
    int num=1,page=0;
    for (std::vector<std::string>::const_iterator it=parts.begin();it!=parts.end();++it)
    {
      std::vector<std::string> kv = split(*it,'=');
      if (kv.size()==2)
      {
        std::string val = uriDecode(kv[1]);
        if      (kv[0]=="q")  searchFor = val; 
        else if (kv[0]=="n")  num       = fromString<int>(val);
        else if (kv[0]=="p")  page      = fromString<int>(val);
        else if (kv[0]=="cb") callback  = val;
      }
    }

    std::string indexDir = "doxysearch.db";

    if (queryString=="test") // user test
    {
      bool dbOk = dirExists(indexDir);
      if (dbOk)
      {
        std::cout << "Test successful.";
      }
      else
      {
        std::cout << "Test failed: cannot find search index " << indexDir;
      }
      exit(0);
    }

    // create query
    Xapian::Database db(indexDir);
    Xapian::Enquire enquire(db);
    Xapian::Query query;
    std::vector<std::string> words = split(searchFor,' ');
    for (std::vector<std::string>::const_iterator it=words.begin();it!=words.end();++it)
    {
      query = Xapian::Query(Xapian::Query::OP_OR,query,Xapian::Query(*it));
    }
    enquire.set_query(query);

    // get results
    Xapian::MSet matches = enquire.get_mset(page*num,num);
    unsigned int hits    = matches.get_matches_estimated();
    unsigned int offset  = page*num;
    unsigned int pages   = num>0 ? (hits+num-1)/num : 0;
    if (offset>hits)     offset=hits;
    if (offset+num>hits) num=hits-offset;

    // write results as JSONP
    std::cout << callback.c_str() << "(";
    std::cout << "{" << std::endl 
              << "  \"hits\":"   << hits   << "," << std::endl
              << "  \"first\":"  << offset << "," << std::endl
              << "  \"count\":"  << num    << "," << std::endl
              << "  \"page\":"   << page   << "," << std::endl
              << "  \"pages\":"  << pages  << "," << std::endl
              << "  \"query\": \""  << escapeString(searchFor)  << "\"," << std::endl
              << "  \"items\":[" << std::endl;
    // foreach search result
    unsigned int o = offset;
    for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i,++o) 
    {
      std::vector<Fragment> hl;
      Xapian::Document doc = i.get_document();
      highlighter(doc.get_value(FIELD_DOC),words,hl);
      std::cout << "  {\"type\": \"" << doc.get_value(FIELD_TYPE) << "\"," << std::endl
                << "   \"name\": \"" << doc.get_value(FIELD_NAME) << doc.get_value(FIELD_ARGS) << "\"," << std::endl
                << "   \"tag\": \""  << doc.get_value(FIELD_TAG) << "\"," << std::endl
                << "   \"url\": \""  << doc.get_value(FIELD_URL) << "\"," << std::endl;
      std::cout << "   \"fragments\":[" << std::endl;
      int c=0;
      bool first=true;
      for (std::vector<Fragment>::const_iterator it = hl.begin();it!=hl.end() && c<3;++it,++c)
      {
        if (!first) std::cout << "," << std::endl;
        std::cout << "     \"" << escapeString((*it).text) << "\"";
        first=false;
      }
      if (!first) std::cout << std::endl;
      std::cout << "   ]" << std::endl;
      std::cout << "  }";
      if (o<offset+num-1) std::cout << ",";
      std::cout << std::endl;
    }
    std::cout << " ]" << std::endl << "})" << std::endl;
  } 
  catch (const Xapian::Error &e) // Xapian exception
  {
    showError(callback,e.get_description());
  } 
  catch (...) // Any other exception
  {
    showError(callback,"Unknown Exception!");
    exit(1);
  }
  return 0;
}