예제 #1
0
/// Downloads the document at url and saves its content to file.
///
/// The downloader is picked from the URL's protocol. Failures (no downloader,
/// download error, output file that can't be opened) are reported on cerr;
/// nothing is returned to the caller.
///
/// @param url  location of the document to retrieve
/// @param file path of the output file; it is truncated if it already exists
static void fetchPage(const string &url, const string &file)
{
#ifdef DEBUG
	cout << "fetchPage: attempting to save " << url << " to " << file << endl;
#endif

	// Any type of downloader will do...
	Url thisUrl(url);
	DownloaderInterface *myDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol(), "");
	if (myDownloader == NULL)
	{
		cerr << "fetchPage: couldn't obtain downloader instance (" << thisUrl.getProtocol() << ")" << endl;
		return;
	}

	DocumentInfo docInfo("Page", url, "", "");
	Document *urlDoc = myDownloader->retrieveUrl(docInfo);
	if (urlDoc != NULL)
	{
		unsigned int urlContentLen = 0;
		const char *urlContent = urlDoc->getData(urlContentLen);

		ofstream outputFile;
		outputFile.open(file.c_str(), ofstream::out | ofstream::trunc);
		if (outputFile.is_open() == false)
		{
			cerr << "fetchPage: couldn't open " << file << " !" << endl;
		}
		else
		{
			// Write exactly urlContentLen bytes: streaming the pointer with
			// operator<< would stop at the first NUL byte and is undefined
			// for a NULL pointer
			if ((urlContent != NULL) &&
				(urlContentLen > 0))
			{
				outputFile.write(urlContent, urlContentLen);
			}
			outputFile.close();
		}

		delete urlDoc;
	}
	else
	{
		cerr << "fetchPage: couldn't get " << url << " !" << endl;
	}

	delete myDownloader;
}
예제 #2
0
/// Retrieves the document described by docInfo with an "http" downloader.
///
/// m_charset is cleared first, then set from the document's type or, failing
/// that, from its Content-Type META tag; in the latter case the document's
/// type is reset to the META tag's value.
///
/// @return the downloaded document, or NULL if no downloader could be
///         obtained or the retrieval failed
Document *WebEngine::downloadPage(const DocumentInfo &docInfo)
{
	m_charset.clear();

	// Any type of downloader will do...
	DownloaderInterface *pHttpDownloader = DownloaderFactory::getDownloader("http");
	if (pHttpDownloader == NULL)
	{
		return NULL;
	}

	Document *pPage = pHttpDownloader->retrieveUrl(docInfo);
	if (pPage == NULL)
	{
		delete pHttpDownloader;
		return NULL;
	}

	// Does the document's type give away a charset ?
	string declaredType(pPage->getType());
	m_charset = getCharset(declaredType);
	if (m_charset.empty() == true)
	{
		// No, fall back on a Content-Type META tag
		HtmlTokenizer tokens(pPage, true);

		declaredType = tokens.getMetaTag("Content-Type");
		m_charset = getCharset(declaredType);
		if (m_charset.empty() == false)
		{
			// The META tag is more informative, make it the document's type
			pPage->setType(declaredType);
		}
	}

	delete pHttpDownloader;

	return pPage;
}
예제 #3
0
void QueryingThread::doWork(void)
{
	PinotSettings &settings = PinotSettings::getInstance();

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
		return;
	}

	// Set the maximum number of results
	pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(settings.m_proxyEnabled == true) &&
		(settings.m_proxyAddress.empty() == false))
	{
		char portStr[64];

		pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
		snprintf(portStr, 64, "%u", settings.m_proxyPort);
		pDownloader->setSetting("proxyport", portStr);
		pDownloader->setSetting("proxytype", settings.m_proxyType);
	}

	// Run the query
	if (pEngine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
	}
	else
	{
		IndexInterface *pDocsIndex = NULL;
		IndexInterface *pDaemonIndex = NULL;
		const vector<Result> &resultsList = pEngine->getResults();
		unsigned int indexId = 0;
		bool isIndexQuery = false;

		m_resultsList.clear();
		m_resultsList.reserve(resultsList.size());
		m_resultsCharset = pEngine->getResultsCharset();

		// Are we querying an index ?
		if (m_engineName == "xapian")
		{
			// Internal index ?
			if (m_engineOption == settings.m_docsIndexLocation)
			{
				indexId = settings.getIndexId(_("My Web Pages"));
				isIndexQuery = true;
			}
			else if (m_engineOption == settings.m_daemonIndexLocation)
			{
				indexId = settings.getIndexId(_("My Documents"));
				isIndexQuery = true;
			}
		}

		// Will we have to query internal indices ?
		if (isIndexQuery == false)
		{
			pDocsIndex = settings.getIndex(settings.m_docsIndexLocation);
			pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation);
		}

		// Copy the results list
		for (vector<Result>::const_iterator resultIter = resultsList.begin();
			resultIter != resultsList.end(); ++resultIter)
		{
			Result current(*resultIter);
			string title(_("No title"));
			string location(current.getLocation());
			string language(current.getLanguage());
			unsigned int docId = 0;

			// The title may contain formatting
			if (current.getTitle().empty() == false)
			{
				title = FilterUtils::stripMarkup(current.getTitle());
			}
			current.setTitle(title);
#ifdef DEBUG
			cout << "QueryingThread::doWork: title is " << title << endl;
#endif

			// Use the query's language if the result's is unknown
			if (language.empty() == true)
			{
				language = m_queryProps.getLanguage();
			}
			current.setLanguage(language);

			if (isIndexQuery == true)
			{
				unsigned int tmpId = 0;

				// The index engine should have set this
				docId = current.getIsIndexed(tmpId);
			}

			// Is this in one of the indexes ?
			if ((pDocsIndex != NULL) &&
				(pDocsIndex->isGood() == true))
			{
				docId = pDocsIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Web Pages"));
				}
			}
			if ((pDaemonIndex != NULL) &&
				(pDaemonIndex->isGood() == true) &&
				(docId == 0))
			{
				docId = pDaemonIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Documents"));
				}
			}

			if (docId > 0)
			{
				current.setIsIndexed(indexId, docId);
#ifdef DEBUG
				cout << "QueryingThread::doWork: found in index " << indexId << endl;
#endif
			}
#ifdef DEBUG
			else cout << "QueryingThread::doWork: not found in any index" << endl;
#endif

			m_resultsList.push_back(current);
		}

		if (pDocsIndex != NULL)
		{
			delete pDocsIndex;
		}
		if (pDaemonIndex != NULL)
		{
			delete pDaemonIndex;
		}
	}

	delete pEngine;
}
예제 #4
0
int main(int argc, char **argv)
{
	int longOptionIndex = 0;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "hv", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		switch (optionChar)
		{
			case 'h':
				// Help
				cout << "pinot-collect - Download an URL from the command-line\n\n"
					<< "Usage: pinot-collect [OPTIONS] URL\n\n"
					<< "Options:\n"
					<< "  -h, --help		display this help and exit\n"
					<< "  -v, --version		output version information and exit\n"
					<< "\nExamples:\n"
					<< "  pinot-collect http://some.website.com/\n"
					<< "  pinot-collect xapian:///home/fabrice/.pinot/index/1\n"
					<< "\nReport bugs to " << PACKAGE_BUGREPORT << endl;
				return EXIT_SUCCESS;
			case 'v':
				cout << "pinot-collect - " << PACKAGE_STRING << "\n\n"
					<< "This is free software.  You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "hv", g_longOptions, &longOptionIndex);
	}

	if (argc < 2)
	{
		cerr << "Not enough parameters" << endl;
		return EXIT_FAILURE;
	}

	MIMEScanner::initialize();
	DownloaderInterface::initialize();

	string url(argv[1]);
	Url thisUrl(url);
	cout << "Protocol: " << thisUrl.getProtocol() << endl;
	cout << "User: "******"Password: "******"Host: " << thisUrl.getHost() << endl;
	cout << "Location: " << thisUrl.getLocation() << endl;
	cout << "File: " << thisUrl.getFile() << endl;
	cout << "Parameters: " << thisUrl.getParameters() << endl;

	// Which Downloader ?
	DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol());
	if (pDownloader == NULL)
	{
		cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl;

		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	DocumentInfo docInfo("Test", url, "", "");
	Document *pDoc = pDownloader->retrieveUrl(docInfo);
	if (pDoc == NULL)
	{
		cerr << "Download operation failed !" << endl;
	}
	else
	{
		cout << "Type: " << pDoc->getType() << endl;

		unsigned int contentLen;
		const char *pContent = pDoc->getData(contentLen);

		if ((pContent != NULL) &&
			(contentLen > 0))
		{
			string fileName(thisUrl.getFile());

			if (fileName.empty() == true)
			{
				fileName = "index.html";
			}

			cout << "Saving " << contentLen << " bytes to " << fileName << endl;

			// Save the content to a file
			ofstream outputFile(fileName.c_str());
			outputFile.write(pContent, contentLen);
			outputFile.close();
		}
		else
		{
			cout << "Document is empty" << endl;
		}

		delete pDoc;
	}

	delete pDownloader;

	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	return EXIT_SUCCESS;
}
예제 #5
0
int main(int argc, char **argv)
{
	QueryProperties::QueryType queryType = QueryProperties::XAPIAN_QP;
	string engineType, option, csvExport, xmlExport, proxyAddress, proxyPort, proxyType;
	unsigned int maxResultsCount = 10; 
	int longOptionIndex = 0;
	bool printResults = true;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "c:hm:a:p:qt:uvx:", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		switch (optionChar)
		{
			case 'a':
				if (optarg != NULL)
				{
					proxyAddress = optarg;
				}
				break;
			case 'c':
				if (optarg != NULL)
				{
					csvExport = optarg;
					printResults = false;
				}
				break;
			case 'h':
				printHelp();
				return EXIT_SUCCESS;
			case 'm':
				if (optarg != NULL)
				{
					maxResultsCount = (unsigned int )atoi(optarg);
				}
				break;
			case 'p':
				if (optarg != NULL)
				{
					proxyPort = optarg;
				}
				break;
			case 'q':
				queryType = QueryProperties::XESAM_QL;
				break;
			case 't':
				if (optarg != NULL)
				{
					proxyType = optarg;
				}
				break;
			case 'u':
				queryType = QueryProperties::XESAM_UL;
				break;
			case 'v':
				cout << "pinot-search - " << PACKAGE_STRING << "\n\n"
					<< "This is free software.  You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			case 'x':
				if (optarg != NULL)
				{
					xmlExport = optarg;
					printResults = false;
				}
				break;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "c:hm:a:p:qt:uvx:", g_longOptions, &longOptionIndex);
	}

	if (argc == 1)
	{
		printHelp();
		return EXIT_SUCCESS;
	}

	if ((argc < 4) ||
		(argc - optind != 3))
	{
		cerr << "Not enough parameters" << endl;
		return EXIT_FAILURE;
	}

	MIMEScanner::initialize();
	DownloaderInterface::initialize();

	engineType = argv[optind];
	option = argv[optind + 1];
	char *pQueryInput = argv[optind + 2];

	// Which SearchEngine ?
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(engineType, option);
	if (pEngine == NULL)
	{
		cerr << "Couldn't obtain search engine instance" << endl;

		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(proxyAddress.empty() == false) &&
		(proxyPort.empty() == false))
	{
		pDownloader->setSetting("proxyaddress", proxyAddress);
		pDownloader->setSetting("proxyport", proxyPort);
		pDownloader->setSetting("proxytype", proxyType);
	}

	// Set the query
	QueryProperties queryProps("pinot-search", "", queryType);
	if (queryType == QueryProperties::XAPIAN_QP)
	{
		queryProps.setFreeQuery(pQueryInput);
	}
	else
	{
		string fileContents;

		// Load the query from file
		if (loadFile(pQueryInput, fileContents) == false)
		{
			cerr << "Couldn't load query from file " << pQueryInput << endl;

			DownloaderInterface::shutdown();
			MIMEScanner::shutdown();

			return EXIT_FAILURE;
		}

		queryProps.setFreeQuery(fileContents);
	}

	queryProps.setMaximumResultsCount(maxResultsCount);
	if (pEngine->runQuery(queryProps) == true)
	{
		string resultsPage;

		// Try getting a list of links
		const vector<DocumentInfo> resultsList = pEngine->getResults();
		if (resultsList.empty() == false)
		{
			if (printResults == true)
			{
				unsigned int count = 0;

				cout << "Matching documents are :" << endl;

				vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
				while (resultIter != resultsList.end())
				{
					string rawUrl(resultIter->getLocation());
					Url thisUrl(rawUrl);

					cout << count << " Raw URL  : '" << rawUrl << "'"<< endl;
					cout << count << " Protocol : " << thisUrl.getProtocol() << endl;
					cout << count << " Host     : " << thisUrl.getHost() << endl;
					cout << count << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl;
					cout << count << " Title    : " << resultIter->getTitle() << endl;
					cout << count << " Type     : " << resultIter->getType() << endl;
					cout << count << " Language : " << resultIter->getLanguage() << endl;
					cout << count << " Extract  : " << resultIter->getExtract() << endl;
					cout << count << " Score    : " << resultIter->getScore() << endl;
					count++;

					// Next
					resultIter++;
				}
			}
			else
			{
				string engineName(SearchEngineFactory::getSearchEngineName(engineType, option));

				if (csvExport.empty() == false)
				{
					CSVExporter exporter(csvExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}

				if (xmlExport.empty() == false)
				{
					OpenSearchExporter exporter(xmlExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}
			}
		}
		else
		{
			cerr << "Couldn't get a results list !" << endl;
		}
	}
	else
	{
		cerr << "Couldn't run query on search engine " << engineType << endl;
	}

	delete pEngine;

	XapianDatabaseFactory::closeAll();
	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	return EXIT_SUCCESS;
}
예제 #6
0
void EngineQueryThread::doWork(void)
{
	PinotSettings &settings = PinotSettings::getInstance();

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		m_errorNum = UNKNOWN_ENGINE;
		m_errorParam = m_engineDisplayableName;
		return;
	}

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(settings.m_proxyEnabled == true) &&
		(settings.m_proxyAddress.empty() == false))
	{
		char portStr[64];

		pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
		snprintf(portStr, 64, "%u", settings.m_proxyPort);
		pDownloader->setSetting("proxyport", portStr);
		pDownloader->setSetting("proxytype", settings.m_proxyType);
	}

	if (m_listingIndex == false)
	{
		pEngine->setLimitSet(m_limitToDocsSet);
	}

	// Run the query
	pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND);
	if (pEngine->runQuery(m_queryProps, m_startDoc) == false)
	{
		m_errorNum = QUERY_FAILED;
		m_errorParam = m_engineDisplayableName;
	}
	else
	{
		const vector<DocumentInfo> &resultsList = pEngine->getResults();

		m_documentsList.clear();
		m_documentsList.reserve(resultsList.size());
		m_documentsCount = pEngine->getResultsCountEstimate();
#ifdef DEBUG
		cout << "EngineQueryThread::doWork: " << resultsList.size() << " off " << m_documentsCount
			<< " results to process, starting at position " << m_startDoc << endl;
#endif

		m_resultsCharset = pEngine->getResultsCharset();
		if (m_listingIndex == false)
		{
			processResults(resultsList);
		}
		else
		{
			processResults(resultsList,
				PinotSettings::getInstance().getIndexIdByName(m_engineDisplayableName));
		}

		// Any spelling correction ?
		string correctedFreeQuery(pEngine->getSpellingCorrection());
		if (correctedFreeQuery.empty() == false)
		{
			m_correctedSpelling = true;
			m_queryProps.setFreeQuery(correctedFreeQuery);
		}
	}

	delete pEngine;
}
예제 #7
0
/// pinot-search entry point: queries a search engine from the command line.
///
/// Expects three non-option arguments: the engine type, the engine option
/// and the query input, which is either the query itself or, with -r, the
/// name of a stored query. Results are written to clog, or exported to
/// CSV (-c) and/or OpenSearch XML (-x).
///
/// @return EXIT_SUCCESS if the query ran (even without results) or after
///         help/version output, EXIT_FAILURE otherwise
int main(int argc, char **argv)
{
	QueryProperties::QueryType queryType = QueryProperties::XAPIAN_QP;
	string engineType, option, csvExport, xmlExport, stemLanguage;
	unsigned int maxResultsCount = 10; 
	int longOptionIndex = 0;
	int exitStatus = EXIT_SUCCESS;
	bool printResults = true;
	bool sortByDate = false;
	bool locationOnly = false;
	bool isStoredQuery = false;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "c:dhlm:rs:vx:", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		switch (optionChar)
		{
			case 'c':
				if (optarg != NULL)
				{
					csvExport = optarg;
					printResults = false;
				}
				break;
			case 'd':
				sortByDate = true;
				break;
			case 'h':
				printHelp();
				return EXIT_SUCCESS;
			case 'l':
				locationOnly = true;
				break;
			case 'm':
				if (optarg != NULL)
				{
					maxResultsCount = (unsigned int )atoi(optarg);
				}
				break;
			case 'r':
				isStoredQuery = true;
				break;
			case 's':
				if (optarg != NULL)
				{
					stemLanguage = optarg;
				}
				break;
			case 'v':
				clog << "pinot-search - " << PACKAGE_STRING << "\n\n"
					<< "This is free software.  You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			case 'x':
				if (optarg != NULL)
				{
					xmlExport = optarg;
					printResults = false;
				}
				break;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "c:dhlm:rs:vx:", g_longOptions, &longOptionIndex);
	}

#if defined(ENABLE_NLS)
	bindtextdomain(GETTEXT_PACKAGE, PACKAGE_LOCALE_DIR);
	bind_textdomain_codeset(GETTEXT_PACKAGE, "UTF-8");
	textdomain(GETTEXT_PACKAGE);
#endif //ENABLE_NLS

	if (argc == 1)
	{
		printHelp();
		return EXIT_SUCCESS;
	}

	// Exactly three non-option arguments are expected
	if ((argc < 4) ||
		(argc - optind != 3))
	{
		clog << "Wrong number of parameters" << endl;
		return EXIT_FAILURE;
	}

	// This will create the necessary directories on the first run
	PinotSettings &settings = PinotSettings::getInstance();
	string confDirectory(PinotSettings::getConfigurationDirectory());

	// Failing to load MIME settings is reported but isn't fatal
	if (MIMEScanner::initialize(PinotSettings::getHomeDirectory() + "/.local",
		string(SHARED_MIME_INFO_PREFIX)) == false)
	{
		clog << "Couldn't load MIME settings" << endl;
	}
	DownloaderInterface::initialize();
	ModuleFactory::loadModules(string(LIBDIR) + string("/pinot/backends"));
	ModuleFactory::loadModules(confDirectory + "/backends");

	// Localize language names
	Languages::setIntlName(0, _("Unknown"));
	Languages::setIntlName(1, _("Danish"));
	Languages::setIntlName(2, _("Dutch"));
	Languages::setIntlName(3, _("English"));
	Languages::setIntlName(4, _("Finnish"));
	Languages::setIntlName(5, _("French"));
	Languages::setIntlName(6, _("German"));
	Languages::setIntlName(7, _("Hungarian"));
	Languages::setIntlName(8, _("Italian"));
	Languages::setIntlName(9, _("Norwegian"));
	Languages::setIntlName(10, _("Portuguese"));
	Languages::setIntlName(11, _("Romanian"));
	Languages::setIntlName(12, _("Russian"));
	Languages::setIntlName(13, _("Spanish"));
	Languages::setIntlName(14, _("Swedish"));
	Languages::setIntlName(15, _("Turkish"));

	// Load the settings
	settings.load(PinotSettings::LOAD_ALL);

	engineType = argv[optind];
	option = argv[optind + 1];
	char *pQueryInput = argv[optind + 2];

	// Set the query
	QueryProperties queryProps("pinot-search", "", queryType);
	if (queryType == QueryProperties::XAPIAN_QP)
	{
		if (isStoredQuery == true)
		{
			// The query input names one of the queries found in the settings
			const map<string, QueryProperties> &queries = settings.getQueries();
			map<string, QueryProperties>::const_iterator queryIter = queries.find(pQueryInput);
			if (queryIter != queries.end())
			{
				queryProps = queryIter->second;
			}
			else
			{
				clog << "Couldn't find stored query " << pQueryInput << endl;

				// Modules were loaded above, unload them like the normal
				// exit path does
				ModuleFactory::unloadModules();
				DownloaderInterface::shutdown();
				MIMEScanner::shutdown();

				return EXIT_FAILURE;
			}
		}
		else
		{
			queryProps.setFreeQuery(pQueryInput);
		}
	}
	queryProps.setStemmingLanguage(stemLanguage);
	queryProps.setMaximumResultsCount(maxResultsCount);
	if (sortByDate == true)
	{
		queryProps.setSortOrder(QueryProperties::DATE);
	}

	// Which SearchEngine ?
	SearchEngineInterface *pEngine = ModuleFactory::getSearchEngine(engineType, option);
	if (pEngine == NULL)
	{
		clog << "Couldn't obtain search engine instance" << endl;

		ModuleFactory::unloadModules();
		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	// Set up the proxy; only web engines have a downloader
	WebEngine *pWebEngine = dynamic_cast<WebEngine *>(pEngine);
	if (pWebEngine != NULL)
	{
		DownloaderInterface *pDownloader = pWebEngine->getDownloader();
		if ((pDownloader != NULL) &&
			(settings.m_proxyEnabled == true) &&
			(settings.m_proxyAddress.empty() == false))
		{
			char portStr[64];

			pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
			snprintf(portStr, 64, "%u", settings.m_proxyPort);
			pDownloader->setSetting("proxyport", portStr);
			pDownloader->setSetting("proxytype", settings.m_proxyType);
		}

		pWebEngine->setEditableValues(settings.m_editablePluginValues);
	}

	pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND);
	if (pEngine->runQuery(queryProps) == true)
	{
		unsigned int estimatedResultsCount = pEngine->getResultsCountEstimate();

		const vector<DocumentInfo> &resultsList = pEngine->getResults();
		if (resultsList.empty() == false)
		{
			if (printResults == true)
			{
				unsigned int count = 0;

				if (locationOnly == false)
				{
					clog << "Showing " << resultsList.size() << " results of about " << estimatedResultsCount << endl;
				}

				vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
				while (resultIter != resultsList.end())
				{
					string rawUrl(resultIter->getLocation(true));

					if (locationOnly == false)
					{
						clog << count << " Location : '" << rawUrl << "'"<< endl;
						clog << count << " Title    : " << resultIter->getTitle() << endl;
						clog << count << " Type     : " << resultIter->getType() << endl;
						clog << count << " Language : " << resultIter->getLanguage() << endl;
						clog << count << " Date     : " << resultIter->getTimestamp() << endl;
						clog << count << " Size     : " << resultIter->getSize() << endl;
						clog << count << " Extract  : " << resultIter->getExtract() << endl;
						clog << count << " Score    : " << resultIter->getScore() << endl;
					}
					else
					{
						clog << rawUrl << endl;
					}
					++count;

					// Next
					++resultIter;
				}
			}
			else
			{
				string engineName(ModuleFactory::getSearchEngineName(engineType, option));

				if (csvExport.empty() == false)
				{
					CSVExporter exporter(csvExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}

				if (xmlExport.empty() == false)
				{
					OpenSearchExporter exporter(xmlExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}
			}
		}
		else
		{
			clog << "No results" << endl;
		}
	}
	else
	{
		clog << "Couldn't run query on search engine " << engineType << endl;

		// Let callers know the query didn't run
		exitStatus = EXIT_FAILURE;
	}

	delete pEngine;

	ModuleFactory::unloadModules();
	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	return exitStatus;
}
예제 #8
0
void QueryingThread::doWork(void)
{
	PinotSettings &settings = PinotSettings::getInstance();

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
		return;
	}

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(settings.m_proxyEnabled == true) &&
		(settings.m_proxyAddress.empty() == false))
	{
		char portStr[64];

		pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
		snprintf(portStr, 64, "%u", settings.m_proxyPort);
		pDownloader->setSetting("proxyport", portStr);
		pDownloader->setSetting("proxytype", settings.m_proxyType);
	}

	// Run the query
	if (pEngine->runQuery(m_queryProps, m_startDoc) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
	}
	else
	{
		const vector<DocumentInfo> &resultsList = pEngine->getResults();

		m_documentsList.clear();
		m_documentsList.reserve(resultsList.size());
		m_documentsCount = pEngine->getResultsCountEstimate();
#ifdef DEBUG
		cout << "QueryingThread::doWork: " << resultsList.size() << " off " << m_documentsCount
			<< " results to process, starting at position " << m_startDoc << endl;
#endif

		m_resultsCharset = pEngine->getResultsCharset();
		if (m_listingIndex == false)
		{
			processResults(resultsList);
		}
		else
		{
			processResults(resultsList,
				PinotSettings::getInstance().getIndexId(m_engineDisplayableName));
		}
	}

	delete pEngine;
}
예제 #9
0
int main(int argc, char **argv)
{
    string type, option;
    string databaseName, proxyAddress, proxyPort, proxyType;
    int longOptionIndex = 0;
    unsigned int docId = 0;
    bool checkDocument = false, indexDocument = false, showInfo = false, success = false;

    // Look at the options
    int optionChar = getopt_long(argc, argv, "cd:hia:p:t:sv", g_longOptions, &longOptionIndex);
    while (optionChar != -1)
    {
        set<string> engines;

        switch (optionChar)
        {
        case 'a':
            if (optarg != NULL)
            {
                proxyAddress = optarg;
            }
            break;
        case 'c':
            checkDocument = true;
            break;
        case 'd':
            if (optarg != NULL)
            {
                databaseName = optarg;
            }
            break;
        case 'h':
            printHelp();
            return EXIT_SUCCESS;
        case 'i':
            indexDocument = true;
            break;
        case 'p':
            if (optarg != NULL)
            {
                proxyPort = optarg;
            }
            break;
        case 's':
            showInfo = true;
            break;
        case 't':
            if (optarg != NULL)
            {
                proxyType = optarg;
            }
            break;
        case 'v':
            cout << "pinot-index - " << PACKAGE_STRING << "\n\n"
                 << "This is free software.  You may redistribute copies of it under the terms of\n"
                 << "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n"
                 << "There is NO WARRANTY, to the extent permitted by law." << endl;
            return EXIT_SUCCESS;
        default:
            return EXIT_FAILURE;
        }

        // Next option
        optionChar = getopt_long(argc, argv, "cd:hia:p:t:sv", g_longOptions, &longOptionIndex);
    }

    if (argc == 1)
    {
        printHelp();
        return EXIT_SUCCESS;
    }

    if ((argc < 2) ||
            (argc - optind == 0))
    {
        cerr << "Not enough parameters" << endl;
        return EXIT_FAILURE;
    }

    if (((indexDocument == false) &&
            (checkDocument == false)) ||
            (databaseName.empty() == true))
    {
        cerr << "Incorrect parameters" << endl;
        return EXIT_FAILURE;
    }

    MIMEScanner::initialize("", "");
    DownloaderInterface::initialize();
    // Localize language names
    Languages::setIntlName(0, "Unknown");
    Languages::setIntlName(1, "Danish");
    Languages::setIntlName(2, "Dutch");
    Languages::setIntlName(3, "English");
    Languages::setIntlName(4, "Finnish");
    Languages::setIntlName(5, "French");
    Languages::setIntlName(6, "German");
    Languages::setIntlName(7, "Hungarian");
    Languages::setIntlName(8, "Italian");
    Languages::setIntlName(9, "Norwegian");
    Languages::setIntlName(10, "Portuguese");
    Languages::setIntlName(11, "Romanian");
    Languages::setIntlName(12, "Russian");
    Languages::setIntlName(13, "Spanish");
    Languages::setIntlName(14, "Swedish");
    Languages::setIntlName(15, "Turkish");
    Dijon::HtmlFilter::initialize();
    Dijon::FilterFactory::loadFilters(string(LIBDIR) + string("/pinot/filters"));

    // Make sure the index is open in the correct mode
    XapianDatabase *pDb = XapianDatabaseFactory::getDatabase(databaseName, (indexDocument ? false : true));
    if (pDb == NULL)
    {
        cerr << "Couldn't open index " << databaseName << endl;

        Dijon::FilterFactory::unloadFilters();
        Dijon::HtmlFilter::shutdown();
        DownloaderInterface::shutdown();
        MIMEScanner::shutdown();

        return EXIT_FAILURE;
    }

    // Get a read-write index of the given type
    IndexInterface *pIndex = IndexFactory::getIndex("xapian", databaseName);
    if (pIndex == NULL)
    {
        cerr << "Couldn't obtain index for " << databaseName << endl;

        XapianDatabaseFactory::closeAll();
        Dijon::FilterFactory::unloadFilters();
        Dijon::HtmlFilter::shutdown();
        DownloaderInterface::shutdown();
        MIMEScanner::shutdown();

        return EXIT_FAILURE;
    }

    while (optind < argc)
    {
        string urlParam(argv[optind]);

        if (checkDocument == true)
        {
            if (pIndex->isGood() == true)
            {
                docId = pIndex->hasDocument(urlParam);
                if (docId > 0)
                {
                    cout << urlParam << ": document ID " << docId << endl;
                    success = true;
                }
            }
        }
        if (indexDocument == true)
        {
            Url thisUrl(urlParam);

            // Which Downloader ?
            DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol());
            if (pDownloader == NULL)
            {
                cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl;

                success = false;
                continue;
            }

            // Set up the proxy
            if ((proxyAddress.empty() == false) &&
                    (proxyPort.empty() == false))
            {
                pDownloader->setSetting("proxyaddress", proxyAddress);
                pDownloader->setSetting("proxyport", proxyPort);
                pDownloader->setSetting("proxytype", proxyType);
            }

            DocumentInfo docInfo("", urlParam, MIMEScanner::scanUrl(thisUrl), "");
            Document *pDoc = pDownloader->retrieveUrl(docInfo);
            if (pDoc == NULL)
            {
                cerr << "Couldn't download " << urlParam << endl;
            }
            else
            {
                FilterWrapper wrapFilter(pIndex);
                set<string> labels;

                // Update an existing document or add to the index ?
                docId = pIndex->hasDocument(urlParam);
                if (docId > 0)
                {
                    // Update the document
                    if (wrapFilter.updateDocument(*pDoc, docId) == true)
                    {
                        success = true;
                    }
                }
                else
                {
                    // Index the document
                    success = wrapFilter.indexDocument(*pDoc, labels, docId);
                }

                if (success == true)
                {
                    // Flush the index
                    pIndex->flush();
                }

                delete pDoc;
            }

            delete pDownloader;
        }
        if ((showInfo == true) &&
                (docId > 0))
        {
            DocumentInfo docInfo;
            set<string> labels;

            if (pIndex->getDocumentInfo(docId, docInfo) == true)
            {
                cout << "Location : '" << docInfo.getLocation() << "'" << endl;
                cout << "Title    : " << docInfo.getTitle() << endl;
                cout << "Type     : " << docInfo.getType() << endl;
                cout << "Language : " << docInfo.getLanguage() << endl;
                cout << "Date     : " << docInfo.getTimestamp() << endl;
                cout << "Size     : " << docInfo.getSize() << endl;
            }
            if (pIndex->getDocumentLabels(docId, labels) == true)
            {
                cout << "Labels   : ";
                for (set<string>::const_iterator labelIter = labels.begin();
                        labelIter != labels.end(); ++labelIter)
                {
                    if (labelIter->substr(0, 2) == "X-")
                    {
                        continue;
                    }
                    cout << "[" << Url::escapeUrl(*labelIter) << "]";
                }
                cout << endl;
            }
        }

        // Next
        ++optind;
    }
    delete pIndex;

    XapianDatabaseFactory::closeAll();
    Dijon::FilterFactory::unloadFilters();
    Dijon::HtmlFilter::shutdown();
    DownloaderInterface::shutdown();
    MIMEScanner::shutdown();

    // Did whatever operation we carried out succeed ?
    if (success == true)
    {
        return EXIT_SUCCESS;
    }

    return EXIT_FAILURE;
}
예제 #10
0
int main(int argc, char **argv)
{
	string type, option;
	string databaseName, urlToCheck, urlToIndex;
	int longOptionIndex = 0;
	unsigned int docId = 0;
	bool checkDocument = false, indexDocument = false, showInfo = false, success = false;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "c:d:hi:sv", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		set<string> engines;

		switch (optionChar)
		{
			case 'c':
				if (optarg != NULL)
				{
					urlToCheck = optarg;
				}
				checkDocument = true;
				break;
			case 'd':
				if (optarg != NULL)
				{
					databaseName = optarg;
				}
				checkDocument = true;
				break;
			case 'h':
				// Help
				cout << "pinot-index - Index documents from the command-line\n\n"
					<< "Usage: pinot-index [OPTIONS]\n\n"
					<< "Options:\n"
					<< "  -c, --check               check whether the given URL is in the index\n"
					<< "  -d, --db                  path to index to use (mandatory)\n"
					<< "  -h, --help                display this help and exit\n"
					<< "  -i, --index               index the given URL\n"
					<< "  -s, --showinfo            show information about the document\n"
					<< "  -v, --version             output version information and exit\n\n";
				// Don't mention type dbus here as it doesn't support indexing and
				// is identical to xapian when checking for URLs
				cout << "Examples:\n"
					<< "pinot-index --check file:///home/fabrice/Documents/Bozo.txt --showinfo --db ~/.pinot/daemon\n\n"
					<< "pinot-index --index http://pinot.berlios.de/ --db ~/.pinot/index\n\n"
					<< "Report bugs to " << PACKAGE_BUGREPORT << endl;
				return EXIT_SUCCESS;
			case 'i':
				if (optarg != NULL)
				{
					urlToIndex = optarg;
				}
				indexDocument = true;
				break;
			case 's':
				showInfo = true;
				break;
			case 'v':
				cout << "pinot-index - " << PACKAGE_STRING << "\n\n"
					<< "This is free software.  You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "c:d:hi:sv", g_longOptions, &longOptionIndex);
	}

	if (((indexDocument == false) &&
		(checkDocument == false)) ||
		(databaseName.empty() == true))
	{
		cerr << "Incorrect parameters" << endl;
		return EXIT_FAILURE;
	}

	MIMEScanner::initialize();
	DownloaderInterface::initialize();
	Dijon::FilterFactory::loadFilters(string(LIBDIR) + string("/pinot/filters"));

	// Make sure the index is open in the correct mode
	XapianDatabase *pDb = XapianDatabaseFactory::getDatabase(databaseName, (indexDocument ? false : true));
	if (pDb == NULL)
	{
		cerr << "Couldn't open index " << databaseName << endl;

		Dijon::FilterFactory::unloadFilters();
		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	// Get a read-write index of the given type
	IndexInterface *pIndex = IndexFactory::getIndex("xapian", databaseName);
	if (pIndex == NULL)
	{
		cerr << "Couldn't obtain index for " << databaseName << endl;

		XapianDatabaseFactory::closeAll();
		Dijon::FilterFactory::unloadFilters();
		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	if (checkDocument == true)
	{
		if (pIndex->isGood() == true)
		{
			docId = pIndex->hasDocument(urlToCheck);
			if (docId > 0)
			{
				cout << urlToCheck << ": document ID " << docId << endl;
				success = true;
			}
		}
	}
	if (indexDocument == true)
	{
		Url thisUrl(urlToIndex);

		// Which Downloader ?
		DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol());
		if (pDownloader == NULL)
		{
			cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl;

			XapianDatabaseFactory::closeAll();
			Dijon::FilterFactory::unloadFilters();
			DownloaderInterface::shutdown();
			MIMEScanner::shutdown();

			return EXIT_FAILURE;
		}

		DocumentInfo docInfo(urlToIndex, urlToIndex, MIMEScanner::scanUrl(thisUrl), "");
		Document *pDoc = pDownloader->retrieveUrl(docInfo);
		if (pDoc == NULL)
		{
			cerr << "Download operation failed !" << endl;
		}
		else
		{
			set<string> labels;

			pIndex->setStemmingMode(IndexInterface::STORE_BOTH);

			// Update an existing document or add to the index ?
			docId = pIndex->hasDocument(urlToIndex);
			if (docId > 0)
			{
				// Update the document
				if (FilterWrapper::updateDocument(docId, *pIndex, *pDoc) == true)
				{
					success = true;
				}
			}
			else
			{
				// Index the document
				success = FilterWrapper::indexDocument(*pIndex, *pDoc, labels, docId);
			}

			if (success == true)
			{
				// Flush the index
				pIndex->flush();
			}

			delete pDoc;
		}

		delete pDownloader;
	}
	if ((showInfo == true) &&
		(docId > 0))
	{
		DocumentInfo docInfo;
		set<string> labels;

		if (pIndex->getDocumentInfo(docId, docInfo) == true)
		{
			cout << "Title: " << docInfo.getTitle() << endl;
			cout << "Location: " << docInfo.getLocation() << endl;
			cout << "Type: " << docInfo.getType() << endl;
			cout << "Language: " << docInfo.getLanguage() << endl;
			cout << "Timestamp: " << docInfo.getTimestamp() << endl;
			cout << "Size: " << docInfo.getSize() << endl;
		}
		if (pIndex->getDocumentLabels(docId, labels) == true)
		{
			cout << "Labels:";
			for (set<string>::const_iterator labelIter = labels.begin();
				labelIter != labels.end(); ++labelIter)
			{
				cout << " '" << *labelIter << "'";
			}
			cout << endl;
		}
	}
	delete pIndex;

	XapianDatabaseFactory::closeAll();
	Dijon::FilterFactory::unloadFilters();
	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	// Did whatever operation we carried out succeed ?
	if (success == true)
	{
		return EXIT_SUCCESS;
	}

	return EXIT_FAILURE;
}