Exemplo n.º 1
0
bool OnDiskHandler::fileMoved(const string &fileName, const string &previousFileName)
{
	bool handledEvent = false;

#ifdef DEBUG
	cout << "OnDiskHandler::fileMoved: " << fileName << endl;
#endif
	pthread_mutex_lock(&m_mutex);
	unsigned int oldDocId = m_index.hasDocument(string("file://") + previousFileName);
	if (oldDocId > 0)
	{
		DocumentInfo docInfo;

		if (m_index.getDocumentInfo(oldDocId, docInfo) == true)
		{
			// Change the location
			docInfo.setLocation(string("file://") + fileName);

			handledEvent = replaceFile(oldDocId, docInfo);
		}
	}
	pthread_mutex_unlock(&m_mutex);

	return handledEvent;
}
Exemplo n.º 2
0
//
// Updates a document's properties.
//
void IndexTree::updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo)
{
	if (docId == 0)
	{
		return;
	}

	// Go through the list of indexed documents
	TreeModel::Children children = m_refStore->children();
	for (TreeModel::Children::iterator iter = children.begin(); iter != children.end(); ++iter)
	{
		TreeModel::Row row = *iter;

		if (docId == row[m_indexColumns.m_id])
		{
			row[m_indexColumns.m_text] = to_utf8(docInfo.getTitle());
			row[m_indexColumns.m_type] = to_utf8(docInfo.getType());
			row[m_indexColumns.m_language] = to_utf8(docInfo.getLanguage());
			row[m_indexColumns.m_timestamp] = to_utf8(docInfo.getTimestamp());
#ifdef DEBUG
			cout << "IndexTree::updateDocumentInfo: language now " << docInfo.getLanguage() << endl;
#endif
			break;
		}
	}
}
Exemplo n.º 3
0
/// Starts a thread that indexes, or updates, the given document in the default index.
///
/// Returns false without starting anything if the document has no location,
/// is a mailbox message, is already registered as being indexed, or if the
/// default index can't be obtained. Returns true once a thread was started.
bool ThreadsManager::index_document(const DocumentInfo &docInfo)
{
	string location(docInfo.getLocation());

	if (location.empty() == true)
	{
		// Nothing to do
		return false;
	}

	// If the document is a mail message, we can't index it again
	Url urlObj(location);
	if (urlObj.getProtocol() == "mailbox")
	{
		return false;
	}

	// Is the document being indexed/updated ?
	// Remember whether this call registered the location, so that the
	// registration can be undone if no thread ends up being started
	bool registered = false;
	if (write_lock_lists() == true)
	{
		if (m_beingIndexed.find(location) == m_beingIndexed.end())
		{
			m_beingIndexed.insert(location);
			registered = true;
		}

		unlock_lists();

		if (registered == false)
		{
			// FIXME: we may have to set labels on this document
			return false;
		}
	}

	// Is it an update ?
	IndexInterface *pIndex = PinotSettings::getInstance().getIndex(m_defaultIndexLocation);
	if (pIndex == NULL)
	{
		// No thread is started for this location : drop the registration made
		// above, otherwise later calls would keep seeing it as being processed
		// NOTE(review): assumes a started thread's completion is what normally
		// clears m_beingIndexed - confirm against the thread-exit handler
		if ((registered == true) &&
			(write_lock_lists() == true))
		{
			m_beingIndexed.erase(location);
			unlock_lists();
		}

		return false;
	}

	// A document ID of 0 means this is a new document, anything else an update
	unsigned int docId = pIndex->hasDocument(location);
	delete pIndex;

	start_thread(new IndexingThread(docInfo, docId, m_defaultIndexLocation));

	return true;
}
Exemplo n.º 4
0
/// Returns a document's properties.
bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const
{
	bool foundDocument = false;

	if (docId == 0)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::Database *pIndex = pDatabase->readLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

			// Get the current document data
			string record = doc.get_data();
			if (record.empty() == false)
			{
				string language = Languages::toLocale(StringManip::extractField(record, "language=", ""));

				docInfo = DocumentInfo(StringManip::extractField(record, "caption=", "\n"),
					StringManip::extractField(record, "url=", "\n"),
					StringManip::extractField(record, "type=", "\n"),
					language);
				docInfo.setTimestamp(StringManip::extractField(record, "timestamp=", "\n"));
#ifdef DEBUG
				cout << "XapianIndex::getDocumentInfo: language is "
					<< docInfo.getLanguage() << endl;
#endif
				foundDocument = true;
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't get document properties: " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't get document properties, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return foundDocument;
}
Exemplo n.º 5
0
/// Records a named attribute value for a document.
/// When extended attributes are compiled in and allowed, the value is also
/// set as a "pinot.<name>" extended attribute on local, non-nested files.
/// An entry keyed on the escaped URL (with internal path) and the attribute
/// name is then looked up in the MetaDataBackup table, and inserted if absent.
bool MetaDataBackup::setAttribute(const DocumentInfo &docInfo,
	const string &name, const string &value, bool noXAttr)
{
	string url(docInfo.getLocation());
	// NOTE(review): getLocation(true) presumably includes the internal path - confirm
	string urlWithIPath(docInfo.getLocation(true));
#ifdef HAVE_ATTR_XATTR_H
	Url urlObj(url);

	// If the file is local and isn't a nested document, use an extended attribute
	if ((noXAttr == false) &&
		(urlObj.isLocal() == true) &&
		(docInfo.getInternalPath().empty() == true))
	{
		// Skip the protocol and the three characters that follow it (presumably "://")
		string fileName(url.substr(urlObj.getProtocol().length() + 3));
		string attrName("pinot." + name);

		// Set an attribute, and add an entry in the table
		// A failure here is only reported in DEBUG builds and doesn't
		// prevent the table from being updated below
		if (setxattr(fileName.c_str(), attrName.c_str(),
			value.c_str(), (size_t)value.length(), 0) != 0)
		{
#ifdef DEBUG
			cout << "MetaDataBackup::setAttribute: setxattr failed with " << strerror(errno) << endl;
#endif
		}
	}
#endif
	bool update = false, success = false;

	// Is there already such an item for this URL ?
	SQLResults *results = executeStatement("SELECT Url FROM MetaDataBackup \
		WHERE Url='%q' AND Name='%q';",
		Url::escapeUrl(urlWithIPath).c_str(), name.c_str());
	if (results != NULL)
	{
		SQLRow *row = results->nextRow();
		if (row != NULL)
		{
			// Yes, there is
			update = true;

			delete row;
		}

		delete results;
	}

	// No existing entry : insert a new one
	if (update == false)
	{
		results = executeStatement("INSERT INTO MetaDataBackup \
			VALUES('%q', '%q', '%q');",
			Url::escapeUrl(urlWithIPath).c_str(), name.c_str(), value.c_str());
	}
Exemplo n.º 6
0
/// Adds one document to the index.
///
/// A new document ID is allocated, the file name and recognized language
/// are stored in doc_info, and every whitespace-separated word read from
/// *in is normalized and counted in the inverted index. When a lemma is
/// known for a word in the document's language, all the forms listed for
/// that lemma are counted instead of the word itself.
///
/// @param in        stream the document's words are read from
/// @param filename  stored as the "filename" property; the file is also
///                  opened a second time for language recognition
void SimpleIndex::insert(ifstream *in, string filename)
{
  // Get next doc id
  int docid=doccount++;
  // Save filename info
  doc_info.set(docid,"filename",filename);
  // Get document language from a separate stream
  ifstream tempstream;
  tempstream.open(filename);
  set<string> *langs=lang_rec.recognize(&tempstream);
  // Only set language if result is unambiguous, leave it empty otherwise
  // NOTE(review): ownership of the set returned by recognize() isn't visible
  // here, so it's not deleted - confirm it isn't leaked
  string lang;
  if (langs && langs->size()==1)
    lang=*(langs->begin());
  tempstream.close();
  doc_info.set(docid,"language",lang);
#ifdef DEBUG
  cerr << "Got " << lang << endl;
#endif
  // Store in the list of all doc ids
  doc_ids.add(docid);
  if (!in)
    return;
  string word;
  // Read the whole file word by word. Testing the extraction itself, rather
  // than eof() beforehand, stops the last word from being processed a second
  // time and ends the loop if the stream is in a failed state
  while (*in >> word)
  {
    string nword=u.normalize(word);
    const auto &lemma=lemma_forward[lang][nword];
    // Add to index
    if (lemma!="")
      {
        //#ifdef DEBUG
        cout << "Got lemma " << lemma << " for word " << nword << endl;
        //#endif
        auto &&forms=lemma_backward[lang][lemma];
        for (auto it=forms.begin(); it!=forms.end(); ++it)
          {
            inverted_index[*it].first++;
            inverted_index[*it].second.add(docid);
          }
      }
    else
      {
#ifdef DEBUG
        cout << "Got no lemma for word " << nword << endl;
#endif
        inverted_index[nword].first++;
        inverted_index[nword].second.add(docid);
      }
  }
}
Exemplo n.º 7
0
bool XapianIndex::prepareDocument(const DocumentInfo &info, Xapian::Document &doc,
	Xapian::termcount &termPos) const
{
	string title(info.getTitle());
	string location(info.getLocation());
	Url urlObj(location);

	// Add a magic term :-)
	doc.add_term(MAGIC_TERM);

	// Index the title with and without prefix S
	if (title.empty() == false)
	{
		Document titleDoc;
		titleDoc.setData(title.c_str(), title.length());
		Tokenizer titleTokens(&titleDoc);
		addTermsToDocument(titleTokens, doc, "S", termPos, STORE_UNSTEM);
		titleTokens.rewind();
		addTermsToDocument(titleTokens, doc, "", termPos, m_stemMode);
	}

	// Index the full URL with prefix U
	doc.add_term(limitTermLength(string("U") + location, true));
	// ...the host name and included domains with prefix H
	string hostName(StringManip::toLowerCase(urlObj.getHost()));
	if (hostName.empty() == false)
	{
		doc.add_term(limitTermLength(string("H") + hostName, true));
		string::size_type dotPos = hostName.find('.');
		while (dotPos != string::npos)
		{
			doc.add_term(limitTermLength(string("H") + hostName.substr(dotPos + 1), true));

			// Next
			dotPos = hostName.find('.', dotPos + 1);
		}
	}
	// ...and the file name with prefix P
	string fileName(urlObj.getFile());
	if (fileName.empty() == false)
	{
		doc.add_term(limitTermLength(string("P") + StringManip::toLowerCase(fileName), true));
	}
	// Finally, add the language code with prefix L
	doc.add_term(string("L") + Languages::toCode(m_stemLanguage));

	setDocumentData(doc, info, m_stemLanguage);

	return true;
}
/// Updates a document's properties.
bool DBusXapianIndex::updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo)
{
	bool updated = false;

	DBusGConnection *pBus = getBusConnection();
	if (pBus == NULL)
	{
		return false;
	}

	DBusGProxy *pBusProxy = getBusProxy(pBus);
	if (pBusProxy == NULL)
	{
		cerr << "DBusXapianIndex::updateDocumentInfo: couldn't get bus proxy" << endl;
		return false;
	}

	GError *pError = NULL;
	const char *pTitle = docInfo.getTitle().c_str();
	const char *pLocation = docInfo.getLocation().c_str();
	const char *pType = docInfo.getType().c_str();
	string language(Languages::toEnglish(docInfo.getLanguage()));
	const char *pLanguage = language.c_str();

	if (dbus_g_proxy_call(pBusProxy, "SetDocumentInfo", &pError,
		G_TYPE_UINT, docId,
		G_TYPE_STRING, pTitle,
		G_TYPE_STRING, pLocation,
		G_TYPE_STRING, pType,
		G_TYPE_STRING, pLanguage,
		G_TYPE_INVALID,
		G_TYPE_UINT, &docId,
		G_TYPE_INVALID) == TRUE)
	{
		updated = true;
	}
	else
	{
		if (pError != NULL)
		{
			cerr << "DBusXapianIndex::updateDocumentInfo: " << pError->message << endl;
			g_error_free(pError);
		}
	}

	g_object_unref(pBusProxy);
	// FIXME: don't we have to call dbus_g_connection_unref(pBus); ?

	return updated;
}
Exemplo n.º 9
0
void DirectoryScannerThread::foundFile(const DocumentInfo &docInfo)
{
	if ((docInfo.getLocation().empty() == true) ||
		(m_done == true))
	{
		return;
	}

	stringstream labelStream;

	// This identifies the source
	labelStream << "X-SOURCE" << m_sourceId;
#ifdef DEBUG
	cout << "DirectoryScannerThread::foundFile: source label for " << docInfo.getLocation() << " is " << labelStream.str() << endl;
#endif
	m_signalFileFound(docInfo, labelStream.str(), false);
}
Exemplo n.º 10
0
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc,
	const string &language) const
{
	string title(info.getTitle());
	string timestamp(info.getTimestamp());
	char timeStr[64];
	time_t timeT = TimeConverter::fromTimestamp(timestamp);

	// Set the document data omindex-style
	string record = "url=";
	record += info.getLocation();
	// The sample will be generated at query time
	record += "\nsample=";
	record += "\ncaption=";
	if (badField(title) == true)
	{
		// Modify the title if necessary
		string::size_type pos = title.find("=");
		while (pos != string::npos)
		{
			title[pos] = ' ';
			pos = title.find("=", pos + 1);
		}
#ifdef DEBUG
		cout << "XapianIndex::setDocumentData: modified title" << endl;
#endif
	}
	record += title;
	record += "\ntype=";
	record += info.getType();
	// Append a timestamp, in a format compatible with Omega
	record += "\nmodtime=";
	snprintf(timeStr, 64, "%ld", timeT);
	record += timeStr;
	// ...and the language
	record += "\nlanguage=";
	record += StringManip::toLowerCase(language);
#ifdef DEBUG
	cout << "XapianIndex::setDocumentData: document data is " << record << endl;
#endif
	doc.set_data(record);

	// Add this value to allow sorting by date
	doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT));
}
Exemplo n.º 11
0
/// Reloads the inverted index from a text dump.
///
/// Each line holds three tab-separated fields : the word count, the list of
/// documents and the word. The document list is made of '|'-separated
/// entries, each of the form number:filename:count:language. Entries with
/// no number or no count are ignored; a missing file name is stored as
/// "Lost in translation" and a missing language isn't stored at all.
///
/// @param infile  stream the dump is read from; nothing is done if it's null
void SimpleIndex::restore_index(ifstream *infile)
{
  if (!infile)
    return;
  string line;
  // Driving the loop with getline() itself, instead of eof(), ends it as
  // soon as a read fails, whether at the end of the file or on a bad stream
  while(getline(*infile,line))
    {
      // A blank line carries no word : skip it rather than create an
      // index entry for the empty string
      if (line.empty())
        continue;
      stringstream linestream(line);
      string scount,sdoclist,sword;
      getline(linestream,scount,'\t');
      getline(linestream,sdoclist,'\t');
      getline(linestream,sword);
      inverted_index[sword].first+=atoi(scount.c_str());
      stringstream docstream(sdoclist);
      string sdocinfo;
      while(getline(docstream,sdocinfo,'|'))
        {
          string sdocnum, sdocfilename, sdoccount, sdoclang;
          stringstream docinfostream(sdocinfo);
          getline(docinfostream,sdocnum,':');
          getline(docinfostream,sdocfilename,':');
          getline(docinfostream,sdoccount,':');
          getline(docinfostream,sdoclang,':');
          if (sdocnum!=""&& sdoccount!="")
            {
              int docnum=atoi(sdocnum.c_str());
              doc_ids.add(docnum);
              inverted_index[sword].second.add(docnum,atoi(sdoccount.c_str()));
              if (sdocfilename!="")
                {
                  doc_info.set(docnum,"filename",sdocfilename);
                }
              else
                {
                  doc_info.set(docnum,"filename",string("Lost in translation"));
                }
              if (sdoclang!="")
                {
                  doc_info.set(docnum,"language",sdoclang);
                }
            }
        }
    }
}
Exemplo n.º 12
0
/// Sets the data and the date value of a Xapian document.
/// The record holds the url, sample (left empty), caption, type, timestamp
/// and language fields, one per line. Any '=' in the title is replaced by a
/// space when badField() flags it. Value 0 gets the time as a decimal string.
void XapianIndex::setDocumentData(Xapian::Document &doc, const DocumentInfo &info,
	const string &language) const
{
	string title(info.getTitle());
	string timestamp(info.getTimestamp());
	char timeStr[64];

	// Set the document data omindex-style
	string record = "url=";
	record += info.getLocation();
	// The sample will be generated at query time
	record += "\nsample=";
	record += "\ncaption=";
	if (badField(title) == true)
	{
		// Modify the title if necessary
		string::size_type pos = title.find("=");
		while (pos != string::npos)
		{
			title[pos] = ' ';
			pos = title.find("=", pos + 1);
		}
#ifdef DEBUG
		cout << "XapianIndex::setDocumentData: modified title" << endl;
#endif
	}
	record += title;
	record += "\ntype=";
	record += info.getType();
	// Append a timestamp
	record += "\ntimestamp=";
	record += timestamp;
	// ...and the language
	record += "\nlanguage=";
	record += language;
#ifdef DEBUG
	cout << "XapianIndex::setDocumentData: document data is " << record << endl;
#endif
	doc.set_data(record);

	// Add this value to allow sorting by date
	// The conversion's result is cast to long and printed with %ld : passing
	// a type wider than int to %d is undefined behaviour
	snprintf(timeStr, sizeof(timeStr), "%ld",
		static_cast<long>(TimeConverter::fromTimestamp(timestamp)));
	doc.add_value(0, timeStr);
}
Exemplo n.º 13
0
/// Updates a document's properties.
bool XapianIndex::updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo)
{
	bool updated = false;

	if (docId == 0)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);

#ifdef DEBUG
			cout << "XapianIndex::updateDocumentInfo: language is " << docInfo.getLanguage() << endl;
#endif
			// Update the document data with the current language
			setDocumentData(doc, docInfo, docInfo.getLanguage());
			pIndex->replace_document(docId, doc);
			updated = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document properties: " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document properties, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updated;
}
Exemplo n.º 14
0
bool OnDiskHandler::replaceFile(unsigned int docId, DocumentInfo &docInfo)
{
	FilterWrapper wrapFilter(&m_index);

	// Unindex the destination file
	wrapFilter.unindexDocument(docInfo.getLocation());

	// Update the document info
	return m_index.updateDocumentInfo(docId, docInfo);
}
Exemplo n.º 15
0
/// Orders documents by their "url" field, then by their "ipath" field
/// when the URLs are equal.
bool DocumentInfo::operator<(const DocumentInfo& other) const
{
	const string lhsUrl(getField("url"));
	const string rhsUrl(other.getField("url"));

	if (lhsUrl != rhsUrl)
	{
		// The URLs alone decide
		return (lhsUrl < rhsUrl);
	}

	// Same URL : fall back on the internal path
	if (getField("ipath") < other.getField("ipath"))
	{
		return true;
	}

	return false;
}
Exemplo n.º 16
0
bool OnDiskHandler::directoryMoved(const string &dirName,
	const string &previousDirName)
{
	set<unsigned int> docIdList;
	bool handledEvent = false;

#ifdef DEBUG
	cout << "OnDiskHandler::directoryMoved: " << dirName << endl;
#endif
	pthread_mutex_lock(&m_mutex);
	if (m_index.listDocumentsInDirectory(previousDirName, docIdList) == true)
	{
		for (set<unsigned int>::const_iterator iter = docIdList.begin();
			iter != docIdList.end(); ++iter)
		{
			DocumentInfo docInfo;

			if (m_index.getDocumentInfo(*iter, docInfo) == true)
			{
				string newLocation(docInfo.getLocation());

				string::size_type pos = newLocation.find(previousDirName);
				if (pos != string::npos)
				{
					newLocation.replace(pos, previousDirName.length(), dirName);

					// Change the location
					docInfo.setLocation(newLocation);

					replaceFile(*iter, docInfo);
				}
			}
		}

		handledEvent = true;
	}
#ifdef DEBUG
	else cout << "OnDiskHandler::directoryMoved: no documents in " << previousDirName << endl;
#endif
	pthread_mutex_unlock(&m_mutex);

	return handledEvent;
}
Exemplo n.º 17
0
/// Starts a thread that indexes the given document in the default index.
///
/// Returns an empty string if a thread was started or if the document has
/// no location, and a status message if the document is a mailbox message,
/// is blacklisted, or is already being indexed.
ustring ThreadsManager::index_document(const DocumentInfo &docInfo)
{
	string location(docInfo.getLocation());

	if (location.empty() == true)
	{
		// Nothing to do
		return "";
	}

	// If the document is a mail message, we can't index it again
	Url urlObj(location);
	if (urlObj.getProtocol() == "mailbox")
	{
		return _("Can't index mail here");
	}

	// Is the document blacklisted ?
	// This is checked before the location is registered below : bailing out
	// after registration would leave the location listed as being indexed
	// although no thread is ever started for it
	if (PinotSettings::getInstance().isBlackListed(location) == true)
	{
		ustring status(location);
		status += " ";
		status += _("is blacklisted");
		return status;
	}

	// Is the document being indexed/updated ?
	if (write_lock_lists() == true)
	{
		bool beingProcessed = true;

		if (m_beingIndexed.find(location) == m_beingIndexed.end())
		{
			m_beingIndexed.insert(location);
			beingProcessed = false;
		}

		unlock_lists();

		if (beingProcessed == true)
		{
			// FIXME: we may have to set labels on this document
			ustring status(location);
			status += " ";
			status += _("is already being indexed");
			return status;
		}
	}

	start_thread(new IndexingThread(docInfo, m_defaultIndexLocation));

	return "";
}
Exemplo n.º 18
0
void XapianIndex::setDocumentData(const DocumentInfo &info, Xapian::Document &doc,
                                  const string &language) const
{
    time_t timeT = TimeConverter::fromTimestamp(info.getTimestamp());

    // Add this value to allow sorting by date
    doc.add_value(0, StringManip::integerToBinaryString((uint32_t)timeT));

    DocumentInfo docCopy(info);
    docCopy.setLanguage(language);
    doc.set_data(XapianDatabase::propsToRecord(&docCopy));
}
Exemplo n.º 19
0
/// Loads the description of one document type for the current warehouse.
///
/// @param type  document type to look up
/// @return the matching DocumentInfo, or a default-constructed one if the
///         query returns no row
/// @throws SQLException* (thrown by pointer) if the query failed
DocumentInfo DocumentInfoModel::getDocumentInfo(QString type) {
    DocumentInfo info;
    query = new QSqlQuery(Database::getInstance().db);
    query->prepare("SELECT documents.type, documents.name, numbering, after_text, family "
                   "FROM documents, wh_numbering WHERE documents.type = wh_numbering.type AND warehouse =  ? AND documents.type = ?");
    query->addBindValue(ApplicationManager::getInstance()->getWarehouse()->getId());
    query->addBindValue(type);
    query->exec();

    // NOTE(review): the query isn't deleted on this path; whether SQLException
    // takes ownership of it isn't visible here - confirm
    if(this->isQueryError(query))
        throw new SQLException("DocumentInfoModel::getDocumentInfo", query);

    // Rely on first() to tell whether there is a row : size() returns -1 with
    // drivers that can't report the size of a result set, which would make
    // existing rows look absent
    if(query->first()) {
        info.setType(query->value(0).toString());
        info.setName(query->value(1).toString());
        info.setNumbering(query->value(2).toString());
        info.setAfterText(query->value(3).toString());
        info.setFamily(this->getFamily(query->value(4).toString()));
    }

    delete query;

    return info;
}
Exemplo n.º 20
0
/// Loads the description of every document type of the current warehouse.
///
/// @return one DocumentInfo per row returned by the query, in query order
/// @throws SQLException* (thrown by pointer) if the query failed
QVector<DocumentInfo> DocumentInfoModel::getDocumentsInfo() {
    QVector<DocumentInfo> collected;
    query = new QSqlQuery(Database::getInstance().db);
    query->prepare("SELECT documents.type, name, numbering, after_text, family "
                   "FROM documents, wh_numbering WHERE documents.type = wh_numbering.type AND warehouse =  ?");
    query->addBindValue(ApplicationManager::getInstance()->getWarehouse()->getId());
    query->exec();

    if(this->isQueryError(query))
        throw new SQLException("DocumentInfoModel::getDocumentsInfo", query);

    // The same object is refilled for each row and copied into the vector
    DocumentInfo rowInfo;
    for(bool hasRow = query->next(); hasRow; hasRow = query->next())
    {
        rowInfo.setType(query->value(0).toString());
        rowInfo.setName(query->value(1).toString());
        rowInfo.setNumbering(query->value(2).toString());
        rowInfo.setAfterText(query->value(3).toString());
        rowInfo.setFamily(this->getFamily(query->value(4).toString()));
        collected.push_back(rowInfo);
    }

    delete query;

    return collected;
}
Exemplo n.º 21
0
/// Reports a file found by the scan through the file-found signal, along
/// with a label of the form X-SOURCE<source ID> that identifies the source.
/// Nothing is emitted if the document has no location or the thread is done.
void DirectoryScannerThread::foundFile(const DocumentInfo &docInfo)
{
    const bool hasLocation = (docInfo.getLocation().empty() == false);
    if (!hasLocation || (m_done == true))
    {
        return;
    }

    // This identifies the source
    char sourceLabel[64];
    snprintf(sourceLabel, sizeof(sourceLabel), "X-SOURCE%u", m_sourceId);

    // The last argument flags the location as not being a directory
    m_signalFileFound(docInfo, sourceLabel, false);
}
Exemplo n.º 22
0
/// Updates a document's properties.
bool XapianIndex::updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo)
{
	bool updated = false;

	if (docId == 0)
	{
		return false;
	}

	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
	if (pDatabase == NULL)
	{
		cerr << "Bad index " << m_databaseName << endl;
		return false;
	}

	try
	{
		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
		if (pIndex != NULL)
		{
			Xapian::Document doc = pIndex->get_document(docId);
			Xapian::termcount termPos = 0;

			// Update the document data with the current language
			removeCommonTerms(doc);
			m_stemLanguage = Languages::toEnglish(docInfo.getLanguage());
			addCommonTerms(docInfo, doc, termPos);
			setDocumentData(docInfo, doc, m_stemLanguage);

			pIndex->replace_document(docId, doc);
			updated = true;
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't update document properties: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	catch (...)
	{
		cerr << "Couldn't update document properties, unknown exception occured" << endl;
	}
	pDatabase->unlock();

	return updated;
}
Exemplo n.º 23
0
/// Sets up a thread that downloads and indexes the given document.
/// A non-zero docId makes it an update of that document, in which case
/// robots directives are ignored; otherwise the configured setting applies.
IndexingThread::IndexingThread(const DocumentInfo &docInfo, const string &labelName,
	unsigned int docId) :
	DownloadingThread(docInfo.getLocation(), false),
	m_docInfo(docInfo),
	m_labelName(labelName),
	m_docId(docId)
{
	const bool isUpdate = (m_docId > 0);

	m_indexLocation = PinotSettings::getInstance().m_indexLocation;
	m_update = isUpdate;
	if (isUpdate == false)
	{
		// New document : follow the user's preference
		m_ignoreRobotsDirectives = PinotSettings::getInstance().m_ignoreRobotsDirectives;
	}
	else
	{
		// Ignore robots directives on updates
		m_ignoreRobotsDirectives = true;
	}
}
Exemplo n.º 24
0
/// Guesses the language of a document and records it in its properties.
///
/// The language detector is given at most the first 2048 bytes of the data.
/// The first candidate other than "unknown" for which a Xapian stemmer can
/// be created is retained.
///
/// @param pData       the document's data
/// @param dataLength  number of bytes available at pData
/// @param info        gets the retained language set, empty if none was suitable
/// @return the retained language, or an empty string
string XapianIndex::scanDocument(const char *pData, unsigned int dataLength,
                                 DocumentInfo &info)
{
    vector<string> candidates;
    string language;

    // Try to determine the document's language
    // Never announce more data than there is : the length has to be capped
    // with min(), max() would let the detector read past the end of
    // buffers shorter than 2048 bytes
    LanguageDetector lang;
    lang.guessLanguage(pData, min(dataLength, (unsigned int)2048), candidates);

    // See which of these languages is suitable for stemming
    for (vector<string>::iterator langIter = candidates.begin(); langIter != candidates.end(); ++langIter)
    {
        if (*langIter == "unknown")
        {
            continue;
        }

        // Constructing a stemmer is how support for the language is probed
        try
        {
            Xapian::Stem stemmer(*langIter);
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
            continue;
        }

        language = *langIter;
        break;
    }
#ifdef DEBUG
    cout << "XapianIndex::scanDocument: language " << language << endl;
#endif

    // Update the document's properties
    info.setLanguage(language);

    return language;
}
Exemplo n.º 25
0
/// Reacts to a file or directory having been found.
/// Files are queued for indexing. Directories are queued for crawling as
/// monitored locations that aren't sources, and crawling is started.
void DaemonState::on_message_filefound(DocumentInfo docInfo, bool isDirectory)
{
	if (isDirectory == false)
	{
		queue_index(docInfo);
		return;
	}

	PinotSettings::IndexableLocation foundDir;

	// The name is the location minus its first 7 characters
	// NOTE(review): presumably this strips a "file://" prefix - confirm
	foundDir.m_name = docInfo.getLocation().substr(7);
	foundDir.m_monitor = true;
	foundDir.m_isSource = false;
#ifdef DEBUG
	cout << "DaemonState::on_message_filefound: new directory " << foundDir.m_name << endl;
#endif

	// Queue this directory for crawling
	m_crawlQueue.push(foundDir);
	start_crawling();
}
Exemplo n.º 26
0
void DaemonState::on_message_filefound(const DocumentInfo &docInfo, const string &sourceLabel, bool isDirectory)
{
	if (isDirectory == false)
	{
		DocumentInfo docCopy(docInfo);
		set<string> labels;

		// Insert a label that identifies the source
		labels.insert(sourceLabel);
		docCopy.setLabels(labels);

		queue_index(docCopy);
	}
	else
	{
		string location(docInfo.getLocation());

		crawlLocation(location.substr(7), false, true);
#ifdef DEBUG
		cout << "DaemonState::on_message_filefound: new directory " << location.substr(7) << endl;
#endif
	}
}
Exemplo n.º 27
0
void DBusServletThread::doWork(void)
{
	XapianIndex index(PinotSettings::getInstance().m_daemonIndexLocation);
	DBusError error;
	const char *pSender = dbus_message_get_sender(m_pRequest);
	bool processedMessage = true, flushIndex = false;

	if ((m_pServer == NULL) ||
		(m_pConnection == NULL) ||
		(m_pRequest == NULL))
	{
		return;
	}

	dbus_error_init(&error);

#ifdef DEBUG
	if (pSender != NULL)
	{
		cout << "DBusServletThread::doWork: called by " << pSender << endl;
	}
	else
	{
		cout << "DBusServletThread::doWork: called by unknown sender" << endl;
	}
#endif

	if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "DeleteLabel") == TRUE)
	{
		char *pLabel = NULL;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_STRING, &pLabel,
			DBUS_TYPE_INVALID) == TRUE)
		{
#ifdef DEBUG
			cout << "DBusServletThread::doWork: received DeleteLabel " << pLabel << endl;
#endif
			// Delete the label
			flushIndex = index.deleteLabel(pLabel);

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_STRING, &pLabel,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "GetDocumentInfo") == TRUE)
	{
		unsigned int docId = 0;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_UINT32, &docId,
			DBUS_TYPE_INVALID) == TRUE)
		{
			DocumentInfo docInfo;

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received GetDocumentInfo " << docId << endl;
#endif
			if (index.getDocumentInfo(docId, docInfo) == true)
			{
				// Prepare the reply
				m_pReply = newDBusReply(m_pRequest);
				if (m_pReply != NULL)
				{
					string language(Languages::toEnglish(docInfo.getLanguage()));
					const char *pTitle = docInfo.getTitle().c_str();
					const char *pLocation = docInfo.getLocation().c_str();
					const char *pType = docInfo.getType().c_str();
					const char *pLanguage = language.c_str();

					dbus_message_append_args(m_pReply,
						DBUS_TYPE_STRING, &pTitle,
						DBUS_TYPE_STRING, &pLocation,
						DBUS_TYPE_STRING, &pType,
						DBUS_TYPE_STRING, &pLanguage,
						DBUS_TYPE_INVALID);
				}
			}
			else
			{
				m_pReply = dbus_message_new_error(m_pRequest,
					"de.berlios.Pinot.GetDocumentInfo",
					"Unknown document");
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "GetDocumentLabels") == TRUE)
	{
		unsigned int docId = 0;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_UINT32, &docId,
			DBUS_TYPE_INVALID) == TRUE)
		{
			set<string> labels;

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received GetDocumentLabels " << docId << endl;
#endif
			if (index.getDocumentLabels(docId, labels) == true)
			{
				m_pArray = g_ptr_array_new();

				for (set<string>::const_iterator labelIter = labels.begin();
					labelIter != labels.end(); ++labelIter)
				{
					string labelName(*labelIter);

					g_ptr_array_add(m_pArray, const_cast<char*>(labelName.c_str()));
#ifdef DEBUG
					cout << "DBusServletThread::doWork: adding label " << m_pArray->len << " " << labelName << endl;
#endif
				}

				// Prepare the reply
				m_pReply = newDBusReply(m_pRequest);
				if (m_pReply != NULL)
				{
					dbus_message_append_args(m_pReply,
						DBUS_TYPE_ARRAY, DBUS_TYPE_STRING, &m_pArray->pdata, m_pArray->len,
						DBUS_TYPE_INVALID);
				}
			}
			else
			{
				m_pReply = dbus_message_new_error(m_pRequest,
					"de.berlios.Pinot.GetDocumentLabels",
					" failed");
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "GetStatistics") == TRUE)
	{
		CrawlHistory history(PinotSettings::getInstance().m_historyDatabase);
		unsigned int crawledFilesCount = history.getItemsCount(CrawlHistory::CRAWLED);
		unsigned int docsCount = index.getDocumentsCount();

#ifdef DEBUG
		cout << "DBusServletThread::doWork: received GetStatistics" << endl;
#endif
		// Prepare the reply
		m_pReply = newDBusReply(m_pRequest);
		if (m_pReply != NULL)
		{
			dbus_message_append_args(m_pReply,
				DBUS_TYPE_UINT32, &crawledFilesCount,
				DBUS_TYPE_UINT32, &docsCount,
				DBUS_TYPE_INVALID);
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "RenameLabel") == TRUE)
	{
		char *pOldLabel = NULL;
		char *pNewLabel = NULL;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_STRING, &pOldLabel,
			DBUS_TYPE_STRING, &pNewLabel,
			DBUS_TYPE_INVALID) == TRUE)
		{
#ifdef DEBUG
			cout << "DBusServletThread::doWork: received RenameLabel " << pOldLabel << ", " << pNewLabel << endl;
#endif
			// Rename the label
			flushIndex = index.renameLabel(pOldLabel, pNewLabel);

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_STRING, &pNewLabel,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "SetDocumentInfo") == TRUE)
	{
		char *pTitle = NULL;
		char *pLocation = NULL;
		char *pType = NULL;
		char *pLanguage = NULL;
		unsigned int docId = 0;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_UINT32, &docId,
			DBUS_TYPE_STRING, &pTitle,
			DBUS_TYPE_STRING, &pLocation,
			DBUS_TYPE_STRING, &pType,
			DBUS_TYPE_STRING, &pLanguage,
			DBUS_TYPE_INVALID) == TRUE)
		{
			DocumentInfo docInfo(pTitle, pLocation, pType,
				((pLanguage != NULL) ? Languages::toLocale(pLanguage) : ""));

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received SetDocumentInfo " << docId << ", " << pTitle
				<< ", " << pLocation << ", " << pType << ", " << pLanguage << endl;
#endif

			// Update the document info
			flushIndex = index.updateDocumentInfo(docId, docInfo);

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_UINT32, &docId,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "SetDocumentLabels") == TRUE)
	{
		char **ppLabels = NULL;
		dbus_uint32_t labelsCount = 0;
		unsigned int docId = 0;
		gboolean resetLabels = TRUE;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_UINT32, &docId,
			DBUS_TYPE_ARRAY, DBUS_TYPE_STRING, &ppLabels, &labelsCount,
			DBUS_TYPE_BOOLEAN, &resetLabels,
			DBUS_TYPE_INVALID) == TRUE)
		{
			set<string> labels;

			for (dbus_uint32_t labelIndex = 0; labelIndex < labelsCount; ++labelIndex)
			{
				if (ppLabels[labelIndex] == NULL)
				{
					break;
				}
				labels.insert(ppLabels[labelIndex]);
			}
#ifdef DEBUG
			cout << "DBusServletThread::doWork: received SetDocumentLabels on ID " << docId
				<< ", " << labelsCount << " labels" << ", " << resetLabels << endl;
#endif
			// Set labels
			flushIndex = index.setDocumentLabels(docId, labels, ((resetLabels == TRUE) ? true : false));

			// Free container types
			g_strfreev(ppLabels);

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_UINT32, &docId,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "SetDocumentsLabels") == TRUE)
	{
		char **ppDocIds = NULL;
		char **ppLabels = NULL;
		dbus_uint32_t idsCount = 0;
		dbus_uint32_t labelsCount = 0;
		gboolean resetLabels = TRUE;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_ARRAY, DBUS_TYPE_STRING, &ppDocIds, &idsCount,
			DBUS_TYPE_ARRAY, DBUS_TYPE_STRING, &ppLabels, &labelsCount,
			DBUS_TYPE_BOOLEAN, &resetLabels,
			DBUS_TYPE_INVALID) == TRUE)
		{
			set<unsigned int> docIds;
			set<string> labels;

			for (dbus_uint32_t idIndex = 0; idIndex < idsCount; ++idIndex)
			{
				if (ppDocIds[idIndex] == NULL)
				{
					break;
				}
				docIds.insert((unsigned int)atoi(ppDocIds[idIndex]));
			}
			for (dbus_uint32_t labelIndex = 0; labelIndex < labelsCount; ++labelIndex)
			{
				if (ppLabels[labelIndex] == NULL)
				{
					break;
				}
				labels.insert(ppLabels[labelIndex]);
			}
#ifdef DEBUG
			cout << "DBusServletThread::doWork: received SetDocumentLabels on " << docIds.size()
				<< " IDs, " << labelsCount << " labels" << ", " << resetLabels << endl;
#endif
			// Set labels
			flushIndex = index.setDocumentsLabels(docIds, labels, ((resetLabels == TRUE) ? true : false));

			// Free container types
			g_strfreev(ppDocIds);
			g_strfreev(ppLabels);

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_BOOLEAN, &flushIndex,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "SimpleQuery") == TRUE)
	{
		char *pSearchText = NULL;
		dbus_uint32_t maxHits = 0;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_STRING, &pSearchText,
			DBUS_TYPE_UINT32, &maxHits,
			DBUS_TYPE_INVALID) == TRUE)
		{
			XapianEngine engine(PinotSettings::getInstance().m_daemonIndexLocation);
			bool replyWithError = true;

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received SimpleQuery " << pSearchText << ", " << maxHits << endl;
#endif
			if (pSearchText != NULL)
			{
				QueryProperties queryProps("DBUS", pSearchText);

				// Run the query
				engine.setMaxResultsCount(maxHits);
				if (engine.runQuery(queryProps) == true)
				{
					const vector<Result> &resultsList = engine.getResults();
					vector<string> docIds;
					m_pArray = g_ptr_array_new();

					for (vector<Result>::const_iterator resultIter = resultsList.begin();
						resultIter != resultsList.end(); ++resultIter)
					{
						// We only need the document ID
						unsigned int docId = index.hasDocument(resultIter->getLocation());
						if (docId > 0)
						{
							char docIdStr[64];
							snprintf(docIdStr, 64, "%u", docId);
							docIds.push_back(docIdStr);
						}
					}

					for (vector<string>::const_iterator docIter = docIds.begin();
						docIter != docIds.end(); ++docIter)
					{
#ifdef DEBUG
						cout << "DBusServletThread::doWork: adding result " << m_pArray->len << " " << *docIter << endl;
#endif
						g_ptr_array_add(m_pArray, const_cast<char*>(docIter->c_str()));
					}

					// Prepare the reply
					m_pReply = newDBusReply(m_pRequest);
					if (m_pReply != NULL)
					{
						dbus_message_append_args(m_pReply,
							DBUS_TYPE_ARRAY, DBUS_TYPE_STRING, &m_pArray->pdata, m_pArray->len,
							DBUS_TYPE_INVALID);

						replyWithError = false;
					}
				}
			}

			if (replyWithError == true)
			{
				m_pReply = dbus_message_new_error(m_pRequest,
					"de.berlios.Pinot.SimpleQuery",
					"Query failed");
			}
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "Stop") == TRUE)
	{
		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_INVALID) == TRUE)
		{
			int exitStatus = EXIT_SUCCESS;

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received Stop" << endl;
#endif
			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_INT32, &exitStatus,
					DBUS_TYPE_INVALID);
			}

			m_mustQuit = true;
		}
	}
	else if (dbus_message_is_method_call(m_pRequest, "de.berlios.Pinot", "UpdateDocument") == TRUE)
	{
		unsigned int docId = 0;

		if (dbus_message_get_args(m_pRequest, &error,
			DBUS_TYPE_UINT32, &docId,
			DBUS_TYPE_INVALID) == TRUE)
		{
			DocumentInfo docInfo;

#ifdef DEBUG
			cout << "DBusServletThread::doWork: received UpdateDocument " << docId << endl;
#endif
			if (index.getDocumentInfo(docId, docInfo) == true)
			{
				// Update document
				m_pServer->queue_index(docInfo);
			}

			// Prepare the reply
			m_pReply = newDBusReply(m_pRequest);
			if (m_pReply != NULL)
			{
				dbus_message_append_args(m_pReply,
					DBUS_TYPE_UINT32, &docId,
					DBUS_TYPE_INVALID);
			}
		}
	}
	else
	{
#ifdef DEBUG
		cout << "DBusServletThread::doWork: foreign message for/from " << dbus_message_get_interface(m_pRequest)
			<< " " << dbus_message_get_member(m_pRequest) << endl;
#endif
		processedMessage = false;
	}

	// Did an error occur ?
	if (error.message != NULL)
	{
#ifdef DEBUG
		cout << "DBusServletThread::doWork: error occured: " << error.message << endl;
#endif
		// Use the error message as reply
		m_pReply = dbus_message_new_error(m_pRequest, error.name, error.message);
	}

	dbus_error_free(&error);

	if (flushIndex == true)
	{
		// Flush now for the sake of the client application
		index.flush();
	}

	// Send a reply ?
	if ((m_pConnection != NULL) &&
		(m_pReply != NULL))
	{
		dbus_connection_send(m_pConnection, m_pReply, NULL);
		dbus_connection_flush(m_pConnection);
#ifdef DEBUG
		cout << "DBusServletThread::doWork: sent reply" << endl;
#endif
		dbus_message_unref(m_pReply);
	}
}
Exemplo n.º 28
0
bool DirectoryScannerThread::scanEntry(const string &entryName, CrawlHistory &history)
{
	CrawlHistory::CrawlStatus status = CrawlHistory::UNKNOWN;
	time_t itemDate;
	struct stat fileStat;
	int statSuccess = 0;
	bool scanSuccess = true;

	if (entryName.empty() == true)
	{
#ifdef DEBUG
		cout << "DirectoryScannerThread::scanEntry: no name" << endl;
#endif
		return false;
	}

	// Skip . .. and dotfiles
	Url urlObj("file://" + entryName);
	if (urlObj.getFile()[0] == '.')
	{
#ifdef DEBUG
		cout << "DirectoryScannerThread::scanEntry: skipped dotfile " << urlObj.getFile() << endl;
#endif
		return false;
	}

	if (m_followSymLinks == false)
	{
		statSuccess = lstat(entryName.c_str(), &fileStat);
	}
	else
	{
		// Stat the files pointed to by symlinks
		statSuccess = stat(entryName.c_str(), &fileStat);
	}

	// Is this item in the database already ?
	bool itemExists = history.hasItem("file://" + entryName, status, itemDate);

	if (statSuccess == -1)
	{
#ifdef DEBUG
		cout << "DirectoryScannerThread::scanEntry: stat failed with error " << errno << " " << strerror(errno) << endl;
#endif
		scanSuccess = false;
	}
	// Is it a file or a directory ?
	else if (S_ISLNK(fileStat.st_mode))
	{
		// This won't happen when m_followSymLinks is true
#ifdef DEBUG
		cout << "DirectoryScannerThread::scanEntry: skipped symlink" << endl;
#endif
		return false;
	}
	else if (S_ISREG(fileStat.st_mode))
	{
		DocumentInfo docInfo;
		bool reportFile = false;

		docInfo.setLocation("file://" + entryName);

		// Is this file blacklisted ?
		// We have to check early so that if necessary the file's status stays at CRAWLING 
		// and it is removed from the index at the end of this crawl
		if (PinotSettings::getInstance().isBlackListed(entryName) == false)
		{
			if (itemExists == false)
			{
				// Record it
				history.insertItem(docInfo.getLocation(), CrawlHistory::CRAWLED, m_sourceId, fileStat.st_mtime);
#ifdef DEBUG
				cout << "DirectoryScannerThread::scanEntry: reporting new file " << entryName << endl;
#endif
				reportFile = true;
			}
			else
			{
				// Update the record
				history.updateItem(docInfo.getLocation(), CrawlHistory::CRAWLED, fileStat.st_mtime);

				// Was it last crawled after it was modified ?
				if (itemDate < fileStat.st_mtime)
				{
#ifdef DEBUG
					cout << "DirectoryScannerThread::scanEntry: reporting modified file " << entryName << endl;
#endif
					// No, crawl and index it again
					reportFile = true;
				}
			}
		}

		if (reportFile == true)
		{
			Url urlObj(docInfo.getLocation());

			docInfo.setTitle(urlObj.getFile());
			docInfo.setTimestamp(TimeConverter::toTimestamp(fileStat.st_mtime));
			docInfo.setSize(fileStat.st_size);

			foundFile(docInfo);
		}
	}
	else if (S_ISDIR(fileStat.st_mode))
	{
		// Can we scan this directory ?
		if (((m_maxLevel == 0) ||
			(m_currentLevel < m_maxLevel)) &&
			(PinotSettings::getInstance().isBlackListed(entryName) == false))
		{
			++m_currentLevel;

			// Open the directory
			DIR *pDir = opendir(entryName.c_str());
			if (pDir != NULL)
			{
#ifdef DEBUG
				cout << "DirectoryScannerThread::scanEntry: entering " << entryName << endl;
#endif
				if (m_pMonitor != NULL)
				{
					// Monitor first so that we don't miss events
					m_pMonitor->addLocation(entryName, true);
				}

				// Iterate through this directory's entries
				struct dirent *pDirEntry = readdir(pDir);
				while ((m_done == false) &&
					(pDirEntry != NULL))
				{
					char *pEntryName = pDirEntry->d_name;

					// Skip . .. and dotfiles
					if ((pEntryName != NULL) &&
						(pEntryName[0] != '.'))
					{
						string subEntryName(entryName);

						if (entryName[entryName.length() - 1] != '/')
						{
							subEntryName += "/";
						}
						subEntryName += pEntryName;

						// Scan this entry
						if (scanEntry(subEntryName, history) == false)
						{
#ifdef DEBUG
							cout << "DirectoryScannerThread::scanEntry: failed to open "
								<< subEntryName << endl;
#endif
						}
					}

					// Next entry
					pDirEntry = readdir(pDir);
				}
#ifdef DEBUG
				cout << "DirectoryScannerThread::scanEntry: done with " << entryName << endl;
#endif

				// Close the directory
				closedir(pDir);
				--m_currentLevel;
			}
			else
			{
#ifdef DEBUG
				cout << "DirectoryScannerThread::scanEntry: opendir failed with error " << errno << " " << strerror(errno) << endl;
#endif
				scanSuccess = false;
			}
		}
	}
	else
	{
#ifdef DEBUG
		cout << "DirectoryScannerThread::scanEntry: unknown entry type" << endl;
#endif
		scanSuccess = false;
	}

	if (scanSuccess == false)
	{
		time_t timeNow = time(NULL);

		// Record this error
		if (itemExists == false)
		{
			history.insertItem("file://" + entryName, CrawlHistory::ERROR, m_sourceId, timeNow);
		}
		else
		{
			history.updateItem("file://" + entryName, CrawlHistory::ERROR, timeNow);
		}
	}

	return scanSuccess;
}
Exemplo n.º 29
0
//
// Browses the index named m_indexName and fills m_documentsList.
//
// The index location is looked up in the settings' indexes map. Document IDs are
// listed with m_maxDocsCount and m_startDoc, restricted to m_labelName when it is
// not empty, and each document whose properties can be retrieved is appended to
// m_documentsList as an IndexedDocument. m_indexDocsCount is set to the count
// returned by getDocumentsCount(m_labelName). If the index is unknown or can't be
// used, m_status is set to an error message.
//
void IndexBrowserThread::doWork(void)
{
	set<unsigned int> docIDList;

	const map<string, string> &indexesMap = PinotSettings::getInstance().getIndexes();
	map<string, string>::const_iterator mapIter = indexesMap.find(m_indexName);
	if (mapIter == indexesMap.end())
	{
		m_status = _("Index");
		m_status += " ";
		m_status += m_indexName;
		m_status += " ";
		m_status += _("doesn't exist");
		return;
	}

	// Get the index at that location
	// We own this object and must delete it on every path out of this method
	IndexInterface *pIndex = PinotSettings::getInstance().getIndex(mapIter->second);
	if ((pIndex == NULL) ||
		(pIndex->isGood() == false))
	{
		m_status = _("Index error on");
		m_status += " ";
		m_status += mapIter->second;
		if (pIndex != NULL)
		{
			delete pIndex;
		}
		return;
	}

	m_indexDocsCount = pIndex->getDocumentsCount(m_labelName);
	if (m_indexDocsCount == 0)
	{
#ifdef DEBUG
		cout << "IndexBrowserThread::doWork: no documents" << endl;
#endif
		// Don't leak the index when there's nothing to browse
		delete pIndex;
		return;
	}

#ifdef DEBUG
	cout << "IndexBrowserThread::doWork: " << m_maxDocsCount << " off " << m_indexDocsCount
		<< " documents to browse, starting at " << m_startDoc << endl;
#endif
	if (m_labelName.empty() == true)
	{
		pIndex->listDocuments(docIDList, m_maxDocsCount, m_startDoc);
	}
	else
	{
		pIndex->listDocumentsWithLabel(m_labelName, docIDList, m_maxDocsCount, m_startDoc);
	}

	m_documentsList.reserve(m_maxDocsCount);
	for (set<unsigned int>::iterator iter = docIDList.begin(); iter != docIDList.end(); ++iter)
	{
		// Stop early if the thread was asked to
		if (m_done == true)
		{
			break;
		}

		// Get the document ID
		unsigned int docId = (*iter);
		// ...and the document URL
		string url = XapianDatabase::buildUrl(mapIter->second, docId);

		DocumentInfo docInfo;
		if (pIndex->getDocumentInfo(docId, docInfo) == true)
		{
			// Fall back to HTML when the index has no type for this document
			string type = docInfo.getType();
			if (type.empty() == true)
			{
				type = "text/html";
			}

			IndexedDocument indexedDoc(docInfo.getTitle(), url, docInfo.getLocation(),
				type, docInfo.getLanguage());
			indexedDoc.setTimestamp(docInfo.getTimestamp());
			indexedDoc.setSize(docInfo.getSize());

			// Insert that document
			m_documentsList.push_back(indexedDoc);
		}
#ifdef DEBUG
		else cout << "IndexBrowserThread::doWork: couldn't retrieve document " << docId << endl;
#endif
	}
	delete pIndex;
}
Exemplo n.º 30
0
//
// Runs a query against a Xapian database and stores the results.
//
// Matching documents are appended to m_resultsList, each with an abstract generated
// from the terms it matched, and m_resultsCountEstimate is set to the estimated
// number of matches. If m_expandDocuments is not empty, m_expandTerms is then
// filled with up to 10 expand terms obtained from those documents.
//
// pIndex: database to query; false is returned right away if it is NULL.
// query: query to run.
// stemLanguage: language used when picking expand terms; if empty, no stemmer
//   is given to the term decider.
// startDoc: position of the first result to get.
// queryProps: supplies the maximum results count and the sort order.
// Returns true if the query ran to completion or m_resultsList is not empty.
//
bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query,
	const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps)
{
	Timer timer;
	unsigned int maxResultsCount = queryProps.getMaximumResultsCount();
	bool completedQuery = false;

	if (pIndex == NULL)
	{
		return false;
	}

	// Start an enquire session on the database
	Xapian::Enquire enquire(*pIndex);

	timer.start();
	try
	{
		AbstractGenerator abstractGen(pIndex, 50);
		vector<string> seedTerms;

		// Give the query object to the enquire session
		enquire.set_query(query);
		// How should results be sorted ?
		// NOTE(review): value 4 presumably holds the document's date and time - confirm against the indexing code
		if (queryProps.getSortOrder() == QueryProperties::RELEVANCE)
		{
			// By relevance first, and then by value
			enquire.set_sort_by_relevance_then_value(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by relevance first" << endl;
#endif
		}
		else if (queryProps.getSortOrder() == QueryProperties::DATE)
		{
			// By date, and then by relevance
			enquire.set_sort_by_value_then_relevance(4);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: sorting by date and time first" << endl;
#endif
		}

		// Get the top results of the query
		Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1);
		m_resultsCountEstimate = matches.get_matches_estimated();
		if (matches.empty() == false)
		{
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount
				<< " results found from position " << startDoc << endl;
			cout << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound()
				<< "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound() << endl;
#endif

			// Get the results
			for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
			{
				Xapian::docid docId = *mIter;
				Xapian::Document doc(mIter.get_document());

				// What terms did this document match ?
				seedTerms.clear();
				for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
					termIter != enquire.get_matching_terms_end(docId); ++termIter)
				{
					char firstChar = (*termIter)[0];

					// Terms may hold non-ASCII bytes : go through unsigned char since
					// passing a negative value other than EOF to isupper() is undefined
					if (isupper((unsigned char)firstChar) == 0)
					{
						// Not prefixed, use the term as is
						seedTerms.push_back(*termIter);
#ifdef DEBUG
						cout << "XapianEngine::queryDatabase: matched term " << *termIter << endl;
#endif
					}
					else if (firstChar == 'Z')
					{
						// This is a stemmed term, strip the prefix
						string stemmed((*termIter).substr(1));
						string::size_type stemmedLen = stemmed.length();

						// Which of this document's terms stem to this ?
						Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId);
						if (docTermIter != pIndex->termlist_end(docId))
						{
							for (docTermIter.skip_to(stemmed);
								docTermIter != pIndex->termlist_end(docId); ++docTermIter)
							{
								// Is this a potential unstem ?
								if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0)
								{
									// No, no point looking at the next terms
									break;
								}
#ifdef DEBUG
								cout << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl;
#endif

								// FIXME: check this term stems to stemmed !
								seedTerms.push_back(*docTermIter);
							}
						}
					}
				}

				DocumentInfo thisResult;
				thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
				thisResult.setScore((float)mIter.get_percent());

#ifdef DEBUG
				cout << "XapianEngine::queryDatabase: found document ID " << docId << endl;
#endif
				XapianDatabase::recordToProps(doc.get_data(), &thisResult);
				// XapianDatabase stored the language in English
				thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage()));

				string url(thisResult.getLocation());
				if (url.empty() == true)
				{
					// Hmmm this shouldn't be empty...
					// Use this instead, even though the document isn't cached in the index
					thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
				}

				// We don't know the index ID, just the document ID
				thisResult.setIsIndexed(0, docId);

				// Add this result
				m_resultsList.push_back(thisResult);
			}
		}

		completedQuery = true;
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}
	cout << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl;

	// Expansion is attempted separately so that a failure here doesn't discard results
	try
	{
		m_expandTerms.clear();

		// Expand the query ?
		if (m_expandDocuments.empty() == false)
		{
			Xapian::RSet expandDocs;

			for (set<string>::const_iterator docIter = m_expandDocuments.begin();
				docIter != m_expandDocuments.end(); ++docIter)
			{
				string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true));

				// Only one document may have this term
				Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm);
				if (postingIter != pIndex->postlist_end(uniqueTerm))
				{
					expandDocs.add_document(*postingIter);
				}
			}
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl;
#endif

			// Get 10 non-prefixed terms
			string allowedPrefixes("RS");
			TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer),
				FileStopper::get_stopper(Languages::toCode(stemLanguage)),
				allowedPrefixes, query);
			Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider);
#ifdef DEBUG
			cout << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl;
#endif
			for (Xapian::ESetIterator termIter = expandTerms.begin();
				termIter != expandTerms.end(); ++termIter)
			{
				string expandTerm(*termIter);
				char firstChar = expandTerm[0];

				// Is this prefixed ?
				if (allowedPrefixes.find(firstChar) != string::npos)
				{
					// Yes, drop the prefix
					expandTerm.erase(0, 1);
				}

				m_expandTerms.insert(expandTerm);
			}
		}
	}
	catch (const Xapian::Error &error)
	{
		cerr << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
	}

	// Be tolerant of errors as long as we got some results
	if ((completedQuery == true) ||
		(m_resultsList.empty() == false))
	{
		return true;
	}

	return false;
}