Exemplo n.º 1
0
// Runs the query with query expansion and collects the suggested terms.
//
// A search engine is requested from SearchEngineFactory for m_engineName and
// m_engineOption, given the query's maximum results count and m_relevantDocs
// as query expansion input, then run on m_queryProps.
// On success, the engine's expand terms are added to m_expandTerms.
// On failure, m_status is set to a message (passed through _()) followed by
// the engine name.
void ExpandQueryThread::doWork(void)
{
	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		// Report the engine that couldn't be created rather than the query's
		// name, so that both failure messages identify the same thing
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineName;
		return;
	}

	// Set the maximum number of results
	pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());
	// Tell the engine which documents to base query expansion on
	pEngine->setQueryExpansion(m_relevantDocs);

	// Run the query
	if (pEngine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineName;
	}
	else
	{
		// Copy the expand terms while the engine that owns them is still alive
		const set<string> &expandTerms = pEngine->getExpandTerms();
		copy(expandTerms.begin(), expandTerms.end(),
			inserter(m_expandTerms, m_expandTerms.begin()));
	}

	delete pEngine;
}
Exemplo n.º 2
0
void QueryingThread::doWork(void)
{
	// Get the SearchEngine
	SearchEngineInterface *engine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (engine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
		return;
	}
	// Set the maximum number of results
	engine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());

	// Run the query
	if (engine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
	}
	else
	{
		const vector<Result> &resultsList = engine->getResults();

		m_resultsList.clear();
		m_resultsList.reserve(resultsList.size());
		m_resultsCharset = engine->getResultsCharset();

		// Copy the results list
		for (vector<Result>::const_iterator resultIter = resultsList.begin();
			resultIter != resultsList.end(); ++resultIter)
		{
			string title = _("No title");
			string extract = HtmlTokenizer::stripTags(resultIter->getExtract());

			// The title may contain formatting
			if (resultIter->getTitle().empty() == false)
			{
				title = HtmlTokenizer::stripTags(resultIter->getTitle());
			}

			string language = resultIter->getLanguage();
			if (language.empty() == true)
			{
				// Use the query's language
				language = m_queryProps.getLanguage();
			}

			m_resultsList.push_back(Result(resultIter->getLocation(),
				title,
				extract,
				language,
				resultIter->getScore()));
		}
	}
	delete engine;
}
Exemplo n.º 3
0
// Runs the query on the "MERGED" index with query expansion and collects the
// suggested terms.
//
// Each location in m_relevantDocs is first resolved to a document ID through
// the "MERGED" index. The IDs found are then handed to a "xapian" engine on
// "MERGED" as query expansion input, and the query in m_queryProps is run.
// On success, the engine's expand terms are added to m_expandTerms.
// On failure, m_status is set to a message describing the step that failed.
void ExpandQueryThread::doWork(void)
{
	IndexInterface *pIndex = PinotSettings::getInstance().getIndex("MERGED");
	set<unsigned int> relevantDocIds;

	if ((pIndex == NULL) ||
		(pIndex->isGood() == false))
	{
		m_status = _("Index error on");
		m_status += " MERGED";
		// Deleting a NULL pointer is a no-op, so no check is needed
		delete pIndex;
		return;
	}

	for (set<string>::iterator locationIter = m_relevantDocs.begin();
		locationIter != m_relevantDocs.end(); ++locationIter)
	{
		unsigned int docId = pIndex->hasDocument(*locationIter);

		// Only keep IDs of documents the index actually holds: a document ID
		// of 0 is treated as "not found" by the code that checks
		// hasDocument() results (it tests docId > 0), so it must not be
		// passed on as a relevant document
		if (docId > 0)
		{
			relevantDocIds.insert(docId);
		}
	}

	delete pIndex;

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine("xapian", "MERGED");
	if (pEngine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_queryProps.getName();
		return;
	}

	// Set the maximum number of results
	pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());
	// Tell the engine which documents to base query expansion on
	pEngine->setQueryExpansion(relevantDocIds);

	// Run the query
	if (pEngine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
	}
	else
	{
		// Copy the expand terms while the engine that owns them is still alive
		const set<string> &expandTerms = pEngine->getExpandTerms();
		copy(expandTerms.begin(), expandTerms.end(),
			inserter(m_expandTerms, m_expandTerms.begin()));
	}

	delete pEngine;
}
Exemplo n.º 4
0
void QueryingThread::doWork(void)
{
	PinotSettings &settings = PinotSettings::getInstance();

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
		return;
	}

	// Set the maximum number of results
	pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(settings.m_proxyEnabled == true) &&
		(settings.m_proxyAddress.empty() == false))
	{
		char portStr[64];

		pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
		snprintf(portStr, 64, "%u", settings.m_proxyPort);
		pDownloader->setSetting("proxyport", portStr);
		pDownloader->setSetting("proxytype", settings.m_proxyType);
	}

	// Run the query
	if (pEngine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
	}
	else
	{
		IndexInterface *pDocsIndex = NULL;
		IndexInterface *pDaemonIndex = NULL;
		const vector<Result> &resultsList = pEngine->getResults();
		unsigned int indexId = 0;
		bool isIndexQuery = false;

		m_resultsList.clear();
		m_resultsList.reserve(resultsList.size());
		m_resultsCharset = pEngine->getResultsCharset();

		// Are we querying an index ?
		if (m_engineName == "xapian")
		{
			// Internal index ?
			if (m_engineOption == settings.m_docsIndexLocation)
			{
				indexId = settings.getIndexId(_("My Web Pages"));
				isIndexQuery = true;
			}
			else if (m_engineOption == settings.m_daemonIndexLocation)
			{
				indexId = settings.getIndexId(_("My Documents"));
				isIndexQuery = true;
			}
		}

		// Will we have to query internal indices ?
		if (isIndexQuery == false)
		{
			pDocsIndex = settings.getIndex(settings.m_docsIndexLocation);
			pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation);
		}

		// Copy the results list
		for (vector<Result>::const_iterator resultIter = resultsList.begin();
			resultIter != resultsList.end(); ++resultIter)
		{
			Result current(*resultIter);
			string title(_("No title"));
			string location(current.getLocation());
			string language(current.getLanguage());
			unsigned int docId = 0;

			// The title may contain formatting
			if (current.getTitle().empty() == false)
			{
				title = FilterUtils::stripMarkup(current.getTitle());
			}
			current.setTitle(title);
#ifdef DEBUG
			cout << "QueryingThread::doWork: title is " << title << endl;
#endif

			// Use the query's language if the result's is unknown
			if (language.empty() == true)
			{
				language = m_queryProps.getLanguage();
			}
			current.setLanguage(language);

			if (isIndexQuery == true)
			{
				unsigned int tmpId = 0;

				// The index engine should have set this
				docId = current.getIsIndexed(tmpId);
			}

			// Is this in one of the indexes ?
			if ((pDocsIndex != NULL) &&
				(pDocsIndex->isGood() == true))
			{
				docId = pDocsIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Web Pages"));
				}
			}
			if ((pDaemonIndex != NULL) &&
				(pDaemonIndex->isGood() == true) &&
				(docId == 0))
			{
				docId = pDaemonIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Documents"));
				}
			}

			if (docId > 0)
			{
				current.setIsIndexed(indexId, docId);
#ifdef DEBUG
				cout << "QueryingThread::doWork: found in index " << indexId << endl;
#endif
			}
#ifdef DEBUG
			else cout << "QueryingThread::doWork: not found in any index" << endl;
#endif

			m_resultsList.push_back(current);
		}

		if (pDocsIndex != NULL)
		{
			delete pDocsIndex;
		}
		if (pDaemonIndex != NULL)
		{
			delete pDaemonIndex;
		}
	}

	delete pEngine;
}
Exemplo n.º 5
0
// Command-line driver that runs one query on one search engine.
//
// Usage: <search engine name> <option> <search string> <max results> [DOWNLOAD]
//
// Each result's URL parts, title, extract and score are printed to stdout.
// With the DOWNLOAD flag, each result is also fetched into a file named
// "<result number>_<host>.html"; the "googleapi" engine goes through
// fetchCachedPage(), any other engine through fetchPage().
//
// Returns EXIT_FAILURE on bad arguments, if the engine can't be obtained or
// if the query can't be run, EXIT_SUCCESS otherwise.
int main(int argc, char **argv)
{
	string type, option;
	bool bDownloadResults = false;
	int exitStatus = EXIT_SUCCESS;

	if (argc < 5)
	{
		cerr << "Usage: " << argv[0] << " <search engine name> <option> <search string> <max results> [DOWNLOAD]" << endl;
		return EXIT_FAILURE;
	}
	if (argc > 5)
	{
		string flag = argv[5];
		if (flag == "DOWNLOAD")
		{
			bDownloadResults = true;
		}
	}

	// How many results ?
	// A negative value would wrap around to a huge unsigned count, so reject it
	int requestedCount = atoi(argv[4]);
	if (requestedCount < 0)
	{
		cerr << "Invalid maximum results count " << argv[4] << endl;
		return EXIT_FAILURE;
	}

	// Which SearchEngine ?
	type = argv[1];
	option = argv[2];
	SearchEngineInterface *myEngine = SearchEngineFactory::getSearchEngine(type, option);
	if (myEngine == NULL)
	{
		cerr << "Couldn't obtain search engine instance" << endl;
		return EXIT_FAILURE;
	}

	myEngine->setMaxResultsCount(static_cast<unsigned int>(requestedCount));

	QueryProperties queryProps("senginetest", argv[3], "", "", "");
	if (myEngine->runQuery(queryProps) == true)
	{
		// Try getting a list of links
		// Bound by reference to avoid copying the list; the engine is deleted
		// only once we are done with the results
		const vector<Result> &resultsList = myEngine->getResults();
		if (resultsList.empty() == false)
		{
			unsigned int resultNum = 0;

			cout << "Matching documents are :" << endl;

			for (vector<Result>::const_iterator resultIter = resultsList.begin();
				resultIter != resultsList.end(); ++resultIter, ++resultNum)
			{
				string rawUrl = resultIter->getLocation();
				Url thisUrl(rawUrl);

				cout << resultNum << " Raw URL  : '" << rawUrl << "'"<< endl;
				cout << resultNum << " Protocol : " << thisUrl.getProtocol() << endl;
				cout << resultNum << " Host     : " << thisUrl.getHost() << endl;
				cout << resultNum << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl;
				cout << resultNum << " Title    : " << HtmlTokenizer::stripTags(resultIter->getTitle()) << endl;
				cout << resultNum << " Extract  : " << HtmlTokenizer::stripTags(resultIter->getExtract()) << endl;
				cout << resultNum << " Score    : " << resultIter->getScore() << endl;

				if (bDownloadResults == true)
				{
					// Set the name of the file to which this page will be saved
					char num[16];
					// Bounded write, with a conversion that matches the unsigned counter
					snprintf(num, sizeof(num), "%u", resultNum);
					string file = num;
					file += "_";
					file += thisUrl.getHost();
					file += ".html";

					if (type == "googleapi")
					{
						// Fetch the page from the Google cache
						fetchCachedPage(rawUrl, file, option);
					}
					else
					{
						fetchPage(rawUrl, file);
					}
				}
			}
		}
		else
		{
			cerr << "Couldn't get a results list !" << endl;
		}
	}
	else
	{
		cerr << "Couldn't run query on search engine " << argv[1] << endl;
		// Let the caller see that the query didn't run
		exitStatus = EXIT_FAILURE;
	}

	delete myEngine;

	return exitStatus;
}
Exemplo n.º 6
0
int main(int argc, char **argv)
{
	string type, option, csvExport, xmlExport, proxyAddress, proxyPort, proxyType;
	unsigned int maxResultsCount = 10; 
	int longOptionIndex = 0;
	bool printResults = true;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "c:hm:a:p:t:vx:", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		switch (optionChar)
		{
			case 'a':
				if (optarg != NULL)
				{
					proxyAddress = optarg;
				}
				break;
			case 'c':
				if (optarg != NULL)
				{
					csvExport = optarg;
					printResults = false;
				}
				break;
			case 'h':
				printHelp();
				return EXIT_SUCCESS;
			case 'm':
				if (optarg != NULL)
				{
					maxResultsCount = (unsigned int )atoi(optarg);
				}
				break;
			case 'p':
				if (optarg != NULL)
				{
					proxyPort = optarg;
				}
				break;
			case 't':
				if (optarg != NULL)
				{
					proxyType = optarg;
				}
				break;
			case 'v':
				cout << "pinot-search - " << PACKAGE_STRING << "\n\n"
					<< "This is free software.  You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			case 'x':
				if (optarg != NULL)
				{
					xmlExport = optarg;
					printResults = false;
				}
				break;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "c:hm:a:p:t:vx:", g_longOptions, &longOptionIndex);
	}

	if (argc == 1)
	{
		printHelp();
		return EXIT_SUCCESS;
	}

	if ((argc < 4) ||
		(argc - optind != 3))
	{
		cerr << "Not enough parameters" << endl;
		return EXIT_FAILURE;
	}

	MIMEScanner::initialize();
	DownloaderInterface::initialize();

	type = argv[optind];
	option = argv[optind + 1];
	char *pQuery = argv[optind + 2];

	// Which SearchEngine ?
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(type, option);
	if (pEngine == NULL)
	{
		cerr << "Couldn't obtain search engine instance" << endl;

		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();

		return EXIT_FAILURE;
	}

	// How many results ?
	pEngine->setMaxResultsCount(maxResultsCount);

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(proxyAddress.empty() == false) &&
		(proxyPort.empty() == false))
	{
		pDownloader->setSetting("proxyaddress", proxyAddress);
		pDownloader->setSetting("proxyport", proxyPort);
		pDownloader->setSetting("proxytype", proxyType);
	}

	QueryProperties queryProps("senginetest", pQuery);
	if (pEngine->runQuery(queryProps) == true)
	{
		string resultsPage;

		// Try getting a list of links
		const vector<DocumentInfo> resultsList = pEngine->getResults();
		if (resultsList.empty() == false)
		{
			if (printResults == true)
			{
				unsigned int count = 0;

				cout << "Matching documents are :" << endl;

				vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
				while (resultIter != resultsList.end())
				{
					string rawUrl(resultIter->getLocation());
					Url thisUrl(rawUrl);

					cout << count << " Raw URL  : '" << rawUrl << "'"<< endl;
					cout << count << " Protocol : " << thisUrl.getProtocol() << endl;
					cout << count << " Host     : " << thisUrl.getHost() << endl;
					cout << count << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl;
					cout << count << " Title    : " << resultIter->getTitle() << endl;
					cout << count << " Extract  : " << FilterUtils::stripMarkup(resultIter->getExtract()) << endl;
					cout << count << " Score    : " << resultIter->getScore() << endl;
					count++;

					// Next
					resultIter++;
				}
			}
			else
			{
				string engineName(SearchEngineFactory::getSearchEngineName(type, option));

				if (csvExport.empty() == false)
				{
					CSVExporter exporter(csvExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}

				if (xmlExport.empty() == false)
				{
					OpenSearchExporter exporter(xmlExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}
			}
		}
		else
		{
			cerr << "Couldn't get a results list !" << endl;
		}
	}
	else
	{
		cerr << "Couldn't run query on search engine " << type << endl;
	}

	delete pEngine;

	XapianDatabaseFactory::closeAll();
	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	return EXIT_SUCCESS;
}