void ExpandQueryThread::doWork(void) { // Get the SearchEngine SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption); if (pEngine == NULL) { m_status = _("Couldn't create search engine"); m_status += " "; m_status += m_queryProps.getName(); return; } // Set the maximum number of results pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount()); // Set whether to expand the query pEngine->setQueryExpansion(m_relevantDocs); // Run the query if (pEngine->runQuery(m_queryProps) == false) { m_status = _("Couldn't run query on search engine"); m_status += " "; m_status += m_engineName; } else { // Copy the expand terms const set<string> &expandTerms = pEngine->getExpandTerms(); copy(expandTerms.begin(), expandTerms.end(), inserter(m_expandTerms, m_expandTerms.begin())); } delete pEngine; }
void QueryingThread::doWork(void) { // Get the SearchEngine SearchEngineInterface *engine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption); if (engine == NULL) { m_status = _("Couldn't create search engine"); m_status += " "; m_status += m_engineDisplayableName; return; } // Set the maximum number of results engine->setMaxResultsCount(m_queryProps.getMaximumResultsCount()); // Run the query if (engine->runQuery(m_queryProps) == false) { m_status = _("Couldn't run query on search engine"); m_status += " "; m_status += m_engineDisplayableName; } else { const vector<Result> &resultsList = engine->getResults(); m_resultsList.clear(); m_resultsList.reserve(resultsList.size()); m_resultsCharset = engine->getResultsCharset(); // Copy the results list for (vector<Result>::const_iterator resultIter = resultsList.begin(); resultIter != resultsList.end(); ++resultIter) { string title = _("No title"); string extract = HtmlTokenizer::stripTags(resultIter->getExtract()); // The title may contain formatting if (resultIter->getTitle().empty() == false) { title = HtmlTokenizer::stripTags(resultIter->getTitle()); } string language = resultIter->getLanguage(); if (language.empty() == true) { // Use the query's language language = m_queryProps.getLanguage(); } m_resultsList.push_back(Result(resultIter->getLocation(), title, extract, language, resultIter->getScore())); } } delete engine; }
void ExpandQueryThread::doWork(void) { IndexInterface *pIndex = PinotSettings::getInstance().getIndex("MERGED"); set<unsigned int> relevantDocIds; if ((pIndex == NULL) || (pIndex->isGood() == false)) { m_status = _("Index error on"); m_status += " MERGED"; if (pIndex != NULL) { delete pIndex; } return; } for (set<string>::iterator locationIter = m_relevantDocs.begin(); locationIter != m_relevantDocs.end(); ++locationIter) { relevantDocIds.insert(pIndex->hasDocument(*locationIter)); } delete pIndex; // Get the SearchEngine SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine("xapian", "MERGED"); if (pEngine == NULL) { m_status = _("Couldn't create search engine"); m_status += " "; m_status += m_queryProps.getName(); return; } // Set the maximum number of results pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount()); // Set whether to expand the query pEngine->setQueryExpansion(relevantDocIds); // Run the query if (pEngine->runQuery(m_queryProps) == false) { m_status = _("Couldn't run query on search engine"); } else { // Copy the expand terms const set<string> &expandTerms = pEngine->getExpandTerms(); copy(expandTerms.begin(), expandTerms.end(), inserter(m_expandTerms, m_expandTerms.begin())); } delete pEngine; }
// Runs the query on the configured engine and post-processes each hit:
// strips markup from the title, fills in the query's language when the
// result has none, and marks results that are already present in one of
// the internal indexes (so the UI can show their indexed status).
// On failure, m_status carries a translated error message.
void QueryingThread::doWork(void)
{
	PinotSettings &settings = PinotSettings::getInstance();

	// Get the SearchEngine
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption);
	if (pEngine == NULL)
	{
		m_status = _("Couldn't create search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
		return;
	}

	// Set the maximum number of results
	pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount());

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(settings.m_proxyEnabled == true) &&
		(settings.m_proxyAddress.empty() == false))
	{
		char portStr[64];

		pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
		// m_proxyPort is formatted as an unsigned decimal
		snprintf(portStr, 64, "%u", settings.m_proxyPort);
		pDownloader->setSetting("proxyport", portStr);
		pDownloader->setSetting("proxytype", settings.m_proxyType);
	}

	// Run the query
	if (pEngine->runQuery(m_queryProps) == false)
	{
		m_status = _("Couldn't run query on search engine");
		m_status += " ";
		m_status += m_engineDisplayableName;
	}
	else
	{
		IndexInterface *pDocsIndex = NULL;
		IndexInterface *pDaemonIndex = NULL;
		const vector<Result> &resultsList = pEngine->getResults();
		unsigned int indexId = 0;
		bool isIndexQuery = false;

		m_resultsList.clear();
		m_resultsList.reserve(resultsList.size());
		m_resultsCharset = pEngine->getResultsCharset();

		// Are we querying an index ?
		if (m_engineName == "xapian")
		{
			// Internal index ?
			if (m_engineOption == settings.m_docsIndexLocation)
			{
				indexId = settings.getIndexId(_("My Web Pages"));
				isIndexQuery = true;
			}
			else if (m_engineOption == settings.m_daemonIndexLocation)
			{
				indexId = settings.getIndexId(_("My Documents"));
				isIndexQuery = true;
			}
		}

		// Will we have to query internal indices ?
		// Only when the engine isn't itself one of them; the indexes are
		// opened once here and reused for every result below
		if (isIndexQuery == false)
		{
			pDocsIndex = settings.getIndex(settings.m_docsIndexLocation);
			pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation);
		}

		// Copy the results list
		for (vector<Result>::const_iterator resultIter = resultsList.begin();
			resultIter != resultsList.end(); ++resultIter)
		{
			Result current(*resultIter);
			string title(_("No title"));
			string location(current.getLocation());
			string language(current.getLanguage());
			unsigned int docId = 0;

			// The title may contain formatting
			if (current.getTitle().empty() == false)
			{
				title = FilterUtils::stripMarkup(current.getTitle());
			}
			current.setTitle(title);
#ifdef DEBUG
			cout << "QueryingThread::doWork: title is " << title << endl;
#endif

			// Use the query's language if the result's is unknown
			if (language.empty() == true)
			{
				language = m_queryProps.getLanguage();
			}
			current.setLanguage(language);

			if (isIndexQuery == true)
			{
				unsigned int tmpId = 0;

				// The index engine should have set this
				docId = current.getIsIndexed(tmpId);
			}

			// Is this in one of the indexes ?
			// (pDocsIndex/pDaemonIndex are NULL for index queries, so
			// these checks only run for external engines)
			if ((pDocsIndex != NULL) &&
				(pDocsIndex->isGood() == true))
			{
				docId = pDocsIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Web Pages"));
				}
			}
			// Only consult the daemon index if the docs index didn't match
			if ((pDaemonIndex != NULL) &&
				(pDaemonIndex->isGood() == true) &&
				(docId == 0))
			{
				docId = pDaemonIndex->hasDocument(location);
				if (docId > 0)
				{
					indexId = settings.getIndexId(_("My Documents"));
				}
			}

			if (docId > 0)
			{
				// Record which index holds the document and under what ID
				current.setIsIndexed(indexId, docId);
#ifdef DEBUG
				cout << "QueryingThread::doWork: found in index " << indexId << endl;
#endif
			}
#ifdef DEBUG
			else cout << "QueryingThread::doWork: not found in any index" << endl;
#endif

			m_resultsList.push_back(current);
		}

		if (pDocsIndex != NULL)
		{
			delete pDocsIndex;
		}
		if (pDaemonIndex != NULL)
		{
			delete pDaemonIndex;
		}
	}

	delete pEngine;
}
int main(int argc, char **argv) { string type, option; bool bDownloadResults = false; if (argc < 5) { cerr << "Usage: " << argv[0] << " <search engine name> <option> <search string> <max results> [DOWNLOAD]" << endl; return EXIT_FAILURE; } if (argc > 5) { string flag = argv[5]; if (flag == "DOWNLOAD") { bDownloadResults = true; } } // Which SearchEngine ? type = argv[1]; option = argv[2]; SearchEngineInterface *myEngine = SearchEngineFactory::getSearchEngine(type, option); if (myEngine == NULL) { cerr << "Couldn't obtain search engine instance" << endl; return EXIT_FAILURE; } // How many results ? unsigned int count = atoi(argv[4]); myEngine->setMaxResultsCount(count); QueryProperties queryProps("senginetest", argv[3], "", "", ""); bool bOK = myEngine->runQuery(queryProps); if (bOK == true) { string resultsPage; // Try getting a list of links const vector<Result> resultsList = myEngine->getResults(); if (resultsList.empty() == false) { unsigned int count = 0; cout << "Matching documents are :" << endl; vector<Result>::const_iterator resultIter = resultsList.begin(); while (resultIter != resultsList.end()) { string rawUrl = (*resultIter).getLocation(); Url thisUrl(rawUrl); cout << count << " Raw URL : '" << rawUrl << "'"<< endl; cout << count << " Protocol : " << thisUrl.getProtocol() << endl; cout << count << " Host : " << thisUrl.getHost() << endl; cout << count << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl; cout << count << " Title : " << HtmlTokenizer::stripTags((*resultIter).getTitle()) << endl; cout << count << " Extract : " << HtmlTokenizer::stripTags((*resultIter).getExtract()) << endl; cout << count << " Score : " << (*resultIter).getScore() << endl; if (bDownloadResults == true) { // Set the name of the file to which this page will be saved char num[16]; sprintf(num, "%d", count); string url = (*resultIter).getLocation(); string file = num; file += "_"; file += thisUrl.getHost(); file += ".html"; if (type == "googleapi") { 
// Fetch the page from the Google cache fetchCachedPage(url, file, option); } else { fetchPage(url, file); } } count++; // Next resultIter++; } } else { cerr << "Couldn't get a results list !" << endl; } } else { cerr << "Couldn't run query on search engine " << argv[1] << endl; } delete myEngine; return EXIT_SUCCESS; }
// Command line entry point for the search tool: parses options (proxy
// settings, max results, CSV/XML export targets, help/version), runs the
// query on the engine named by the positional arguments, then either
// prints the results to stdout or exports them via the chosen exporter.
// Positional arguments (after options): engine type, engine option,
// query string.
int main(int argc, char **argv)
{
	string type, option, csvExport, xmlExport, proxyAddress, proxyPort, proxyType;
	unsigned int maxResultsCount = 10;
	int longOptionIndex = 0;
	bool printResults = true;

	// Look at the options
	int optionChar = getopt_long(argc, argv, "c:hm:a:p:t:vx:", g_longOptions, &longOptionIndex);
	while (optionChar != -1)
	{
		switch (optionChar)
		{
			case 'a':
				if (optarg != NULL)
				{
					proxyAddress = optarg;
				}
				break;
			case 'c':
				if (optarg != NULL)
				{
					csvExport = optarg;
					// Exporting disables printing to stdout
					printResults = false;
				}
				break;
			case 'h':
				printHelp();
				return EXIT_SUCCESS;
			case 'm':
				if (optarg != NULL)
				{
					maxResultsCount = (unsigned int )atoi(optarg);
				}
				break;
			case 'p':
				if (optarg != NULL)
				{
					proxyPort = optarg;
				}
				break;
			case 't':
				if (optarg != NULL)
				{
					proxyType = optarg;
				}
				break;
			case 'v':
				cout << "pinot-search - " << PACKAGE_STRING << "\n\n"
					<< "This is free software. You may redistribute copies of it under the terms of\n"
					<< "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n"
					<< "There is NO WARRANTY, to the extent permitted by law." << endl;
				return EXIT_SUCCESS;
			case 'x':
				if (optarg != NULL)
				{
					xmlExport = optarg;
					// Exporting disables printing to stdout
					printResults = false;
				}
				break;
			default:
				return EXIT_FAILURE;
		}

		// Next option
		optionChar = getopt_long(argc, argv, "c:hm:a:p:t:vx:", g_longOptions, &longOptionIndex);
	}

	// No arguments at all: show usage
	if (argc == 1)
	{
		printHelp();
		return EXIT_SUCCESS;
	}

	// Exactly three positional arguments are required after the options
	if ((argc < 4) || (argc - optind != 3))
	{
		cerr << "Not enough parameters" << endl;
		return EXIT_FAILURE;
	}

	MIMEScanner::initialize();
	DownloaderInterface::initialize();

	// Positional arguments: engine type, engine option, query string
	type = argv[optind];
	option = argv[optind + 1];
	char *pQuery = argv[optind + 2];

	// Which SearchEngine ?
	SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(type, option);
	if (pEngine == NULL)
	{
		cerr << "Couldn't obtain search engine instance" << endl;
		// Tear down in reverse order of initialization
		DownloaderInterface::shutdown();
		MIMEScanner::shutdown();
		return EXIT_FAILURE;
	}

	// How many results ?
	pEngine->setMaxResultsCount(maxResultsCount);

	// Set up the proxy
	DownloaderInterface *pDownloader = pEngine->getDownloader();
	if ((pDownloader != NULL) &&
		(proxyAddress.empty() == false) &&
		(proxyPort.empty() == false))
	{
		pDownloader->setSetting("proxyaddress", proxyAddress);
		pDownloader->setSetting("proxyport", proxyPort);
		pDownloader->setSetting("proxytype", proxyType);
	}

	QueryProperties queryProps("senginetest", pQuery);

	if (pEngine->runQuery(queryProps) == true)
	{
		string resultsPage;

		// Try getting a list of links
		const vector<DocumentInfo> resultsList = pEngine->getResults();
		if (resultsList.empty() == false)
		{
			if (printResults == true)
			{
				unsigned int count = 0;

				cout << "Matching documents are :" << endl;

				vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
				while (resultIter != resultsList.end())
				{
					string rawUrl(resultIter->getLocation());
					Url thisUrl(rawUrl);

					cout << count << " Raw URL : '" << rawUrl << "'"<< endl;
					cout << count << " Protocol : " << thisUrl.getProtocol() << endl;
					cout << count << " Host : " << thisUrl.getHost() << endl;
					cout << count << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl;
					cout << count << " Title : " << resultIter->getTitle() << endl;
					cout << count << " Extract : " << FilterUtils::stripMarkup(resultIter->getExtract()) << endl;
					cout << count << " Score : " << resultIter->getScore() << endl;

					count++;
					// Next
					resultIter++;
				}
			}
			else
			{
				string engineName(SearchEngineFactory::getSearchEngineName(type, option));

				// Export to whichever targets were requested
				if (csvExport.empty() == false)
				{
					CSVExporter exporter(csvExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}
				if (xmlExport.empty() == false)
				{
					OpenSearchExporter exporter(xmlExport, queryProps);

					exporter.exportResults(engineName, maxResultsCount, resultsList);
				}
			}
		}
		else
		{
			cerr << "Couldn't get a results list !"
				<< endl;
		}
	}
	else
	{
		cerr << "Couldn't run query on search engine " << type << endl;
	}

	delete pEngine;

	// Tear down in reverse order of initialization
	XapianDatabaseFactory::closeAll();
	DownloaderInterface::shutdown();
	MIMEScanner::shutdown();

	return EXIT_SUCCESS;
}