static void fetchPage(const string &url, const string &file) { #ifdef DEBUG cout << "fetchPage: attempting to save " << url << " to " << file << endl; #endif // Any type of downloader will do... Url thisUrl(url); DownloaderInterface *myDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol(), ""); if (myDownloader == NULL) { cerr << "fetchPage: couldn't obtain downloader instance (" << thisUrl.getProtocol() << ")" << endl; return; } DocumentInfo docInfo("Page", url, "", ""); Document *urlDoc = myDownloader->retrieveUrl(docInfo); if (urlDoc != NULL) { unsigned int urlContentLen; ofstream outputFile; outputFile.open(file.c_str(), ofstream::out | ofstream::trunc); outputFile << urlDoc->getData(urlContentLen); outputFile.close(); delete urlDoc; } else { cerr << "fetchPage: couldn't get " << url << " !" << endl; } delete myDownloader; }
/**
 * Retrieves the page described by docInfo and records its charset.
 *
 * m_charset is cleared first, then set from the document's type. If the
 * type carries no charset, the page's Content-Type META tag is consulted
 * and, when that yields a charset, the document's type is replaced by it.
 *
 * @param docInfo description of the page to download
 * @return the downloaded document, which the caller must delete, or NULL
 *         if no downloader was available or the retrieval failed
 */
Document *WebEngine::downloadPage(const DocumentInfo &docInfo)
{
	m_charset.clear();

	// Any type of downloader will do...
	DownloaderInterface *pHttpDownloader = DownloaderFactory::getDownloader("http");
	if (!pHttpDownloader)
	{
		return NULL;
	}

	Document *pPage = pHttpDownloader->retrieveUrl(docInfo);
	if (pPage)
	{
		// First place to look for a charset: the reported type
		string typeSpec(pPage->getType());
		m_charset = getCharset(typeSpec);

		if (m_charset.empty())
		{
			// Second place: a Content-Type META tag inside the page
			HtmlTokenizer pageTokens(pPage, true);

			typeSpec = pageTokens.getMetaTag("Content-Type");
			m_charset = getCharset(typeSpec);
			if (!m_charset.empty())
			{
				// The META tag is more informative, make it the document's type
				pPage->setType(typeSpec);
			}
		}
	}

	delete pHttpDownloader;

	return pPage;
}
void QueryingThread::doWork(void) { PinotSettings &settings = PinotSettings::getInstance(); // Get the SearchEngine SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption); if (pEngine == NULL) { m_status = _("Couldn't create search engine"); m_status += " "; m_status += m_engineDisplayableName; return; } // Set the maximum number of results pEngine->setMaxResultsCount(m_queryProps.getMaximumResultsCount()); // Set up the proxy DownloaderInterface *pDownloader = pEngine->getDownloader(); if ((pDownloader != NULL) && (settings.m_proxyEnabled == true) && (settings.m_proxyAddress.empty() == false)) { char portStr[64]; pDownloader->setSetting("proxyaddress", settings.m_proxyAddress); snprintf(portStr, 64, "%u", settings.m_proxyPort); pDownloader->setSetting("proxyport", portStr); pDownloader->setSetting("proxytype", settings.m_proxyType); } // Run the query if (pEngine->runQuery(m_queryProps) == false) { m_status = _("Couldn't run query on search engine"); m_status += " "; m_status += m_engineDisplayableName; } else { IndexInterface *pDocsIndex = NULL; IndexInterface *pDaemonIndex = NULL; const vector<Result> &resultsList = pEngine->getResults(); unsigned int indexId = 0; bool isIndexQuery = false; m_resultsList.clear(); m_resultsList.reserve(resultsList.size()); m_resultsCharset = pEngine->getResultsCharset(); // Are we querying an index ? if (m_engineName == "xapian") { // Internal index ? if (m_engineOption == settings.m_docsIndexLocation) { indexId = settings.getIndexId(_("My Web Pages")); isIndexQuery = true; } else if (m_engineOption == settings.m_daemonIndexLocation) { indexId = settings.getIndexId(_("My Documents")); isIndexQuery = true; } } // Will we have to query internal indices ? 
if (isIndexQuery == false) { pDocsIndex = settings.getIndex(settings.m_docsIndexLocation); pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation); } // Copy the results list for (vector<Result>::const_iterator resultIter = resultsList.begin(); resultIter != resultsList.end(); ++resultIter) { Result current(*resultIter); string title(_("No title")); string location(current.getLocation()); string language(current.getLanguage()); unsigned int docId = 0; // The title may contain formatting if (current.getTitle().empty() == false) { title = FilterUtils::stripMarkup(current.getTitle()); } current.setTitle(title); #ifdef DEBUG cout << "QueryingThread::doWork: title is " << title << endl; #endif // Use the query's language if the result's is unknown if (language.empty() == true) { language = m_queryProps.getLanguage(); } current.setLanguage(language); if (isIndexQuery == true) { unsigned int tmpId = 0; // The index engine should have set this docId = current.getIsIndexed(tmpId); } // Is this in one of the indexes ? if ((pDocsIndex != NULL) && (pDocsIndex->isGood() == true)) { docId = pDocsIndex->hasDocument(location); if (docId > 0) { indexId = settings.getIndexId(_("My Web Pages")); } } if ((pDaemonIndex != NULL) && (pDaemonIndex->isGood() == true) && (docId == 0)) { docId = pDaemonIndex->hasDocument(location); if (docId > 0) { indexId = settings.getIndexId(_("My Documents")); } } if (docId > 0) { current.setIsIndexed(indexId, docId); #ifdef DEBUG cout << "QueryingThread::doWork: found in index " << indexId << endl; #endif } #ifdef DEBUG else cout << "QueryingThread::doWork: not found in any index" << endl; #endif m_resultsList.push_back(current); } if (pDocsIndex != NULL) { delete pDocsIndex; } if (pDaemonIndex != NULL) { delete pDaemonIndex; } } delete pEngine; }
int main(int argc, char **argv) { int longOptionIndex = 0; // Look at the options int optionChar = getopt_long(argc, argv, "hv", g_longOptions, &longOptionIndex); while (optionChar != -1) { switch (optionChar) { case 'h': // Help cout << "pinot-collect - Download an URL from the command-line\n\n" << "Usage: pinot-collect [OPTIONS] URL\n\n" << "Options:\n" << " -h, --help display this help and exit\n" << " -v, --version output version information and exit\n" << "\nExamples:\n" << " pinot-collect http://some.website.com/\n" << " pinot-collect xapian:///home/fabrice/.pinot/index/1\n" << "\nReport bugs to " << PACKAGE_BUGREPORT << endl; return EXIT_SUCCESS; case 'v': cout << "pinot-collect - " << PACKAGE_STRING << "\n\n" << "This is free software. You may redistribute copies of it under the terms of\n" << "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n" << "There is NO WARRANTY, to the extent permitted by law." << endl; return EXIT_SUCCESS; default: return EXIT_FAILURE; } // Next option optionChar = getopt_long(argc, argv, "hv", g_longOptions, &longOptionIndex); } if (argc < 2) { cerr << "Not enough parameters" << endl; return EXIT_FAILURE; } MIMEScanner::initialize(); DownloaderInterface::initialize(); string url(argv[1]); Url thisUrl(url); cout << "Protocol: " << thisUrl.getProtocol() << endl; cout << "User: "******"Password: "******"Host: " << thisUrl.getHost() << endl; cout << "Location: " << thisUrl.getLocation() << endl; cout << "File: " << thisUrl.getFile() << endl; cout << "Parameters: " << thisUrl.getParameters() << endl; // Which Downloader ? 
DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol()); if (pDownloader == NULL) { cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } DocumentInfo docInfo("Test", url, "", ""); Document *pDoc = pDownloader->retrieveUrl(docInfo); if (pDoc == NULL) { cerr << "Download operation failed !" << endl; } else { cout << "Type: " << pDoc->getType() << endl; unsigned int contentLen; const char *pContent = pDoc->getData(contentLen); if ((pContent != NULL) && (contentLen > 0)) { string fileName(thisUrl.getFile()); if (fileName.empty() == true) { fileName = "index.html"; } cout << "Saving " << contentLen << " bytes to " << fileName << endl; // Save the content to a file ofstream outputFile(fileName.c_str()); outputFile.write(pContent, contentLen); outputFile.close(); } else { cout << "Document is empty" << endl; } delete pDoc; } delete pDownloader; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_SUCCESS; }
int main(int argc, char **argv) { QueryProperties::QueryType queryType = QueryProperties::XAPIAN_QP; string engineType, option, csvExport, xmlExport, proxyAddress, proxyPort, proxyType; unsigned int maxResultsCount = 10; int longOptionIndex = 0; bool printResults = true; // Look at the options int optionChar = getopt_long(argc, argv, "c:hm:a:p:qt:uvx:", g_longOptions, &longOptionIndex); while (optionChar != -1) { switch (optionChar) { case 'a': if (optarg != NULL) { proxyAddress = optarg; } break; case 'c': if (optarg != NULL) { csvExport = optarg; printResults = false; } break; case 'h': printHelp(); return EXIT_SUCCESS; case 'm': if (optarg != NULL) { maxResultsCount = (unsigned int )atoi(optarg); } break; case 'p': if (optarg != NULL) { proxyPort = optarg; } break; case 'q': queryType = QueryProperties::XESAM_QL; break; case 't': if (optarg != NULL) { proxyType = optarg; } break; case 'u': queryType = QueryProperties::XESAM_UL; break; case 'v': cout << "pinot-search - " << PACKAGE_STRING << "\n\n" << "This is free software. You may redistribute copies of it under the terms of\n" << "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n" << "There is NO WARRANTY, to the extent permitted by law." << endl; return EXIT_SUCCESS; case 'x': if (optarg != NULL) { xmlExport = optarg; printResults = false; } break; default: return EXIT_FAILURE; } // Next option optionChar = getopt_long(argc, argv, "c:hm:a:p:qt:uvx:", g_longOptions, &longOptionIndex); } if (argc == 1) { printHelp(); return EXIT_SUCCESS; } if ((argc < 4) || (argc - optind != 3)) { cerr << "Not enough parameters" << endl; return EXIT_FAILURE; } MIMEScanner::initialize(); DownloaderInterface::initialize(); engineType = argv[optind]; option = argv[optind + 1]; char *pQueryInput = argv[optind + 2]; // Which SearchEngine ? 
SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(engineType, option); if (pEngine == NULL) { cerr << "Couldn't obtain search engine instance" << endl; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } // Set up the proxy DownloaderInterface *pDownloader = pEngine->getDownloader(); if ((pDownloader != NULL) && (proxyAddress.empty() == false) && (proxyPort.empty() == false)) { pDownloader->setSetting("proxyaddress", proxyAddress); pDownloader->setSetting("proxyport", proxyPort); pDownloader->setSetting("proxytype", proxyType); } // Set the query QueryProperties queryProps("pinot-search", "", queryType); if (queryType == QueryProperties::XAPIAN_QP) { queryProps.setFreeQuery(pQueryInput); } else { string fileContents; // Load the query from file if (loadFile(pQueryInput, fileContents) == false) { cerr << "Couldn't load query from file " << pQueryInput << endl; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } queryProps.setFreeQuery(fileContents); } queryProps.setMaximumResultsCount(maxResultsCount); if (pEngine->runQuery(queryProps) == true) { string resultsPage; // Try getting a list of links const vector<DocumentInfo> resultsList = pEngine->getResults(); if (resultsList.empty() == false) { if (printResults == true) { unsigned int count = 0; cout << "Matching documents are :" << endl; vector<DocumentInfo>::const_iterator resultIter = resultsList.begin(); while (resultIter != resultsList.end()) { string rawUrl(resultIter->getLocation()); Url thisUrl(rawUrl); cout << count << " Raw URL : '" << rawUrl << "'"<< endl; cout << count << " Protocol : " << thisUrl.getProtocol() << endl; cout << count << " Host : " << thisUrl.getHost() << endl; cout << count << " Location : " << thisUrl.getLocation() << "/" << thisUrl.getFile() << endl; cout << count << " Title : " << resultIter->getTitle() << endl; cout << count << " Type : " << resultIter->getType() << endl; cout << count << " Language : " << 
resultIter->getLanguage() << endl; cout << count << " Extract : " << resultIter->getExtract() << endl; cout << count << " Score : " << resultIter->getScore() << endl; count++; // Next resultIter++; } } else { string engineName(SearchEngineFactory::getSearchEngineName(engineType, option)); if (csvExport.empty() == false) { CSVExporter exporter(csvExport, queryProps); exporter.exportResults(engineName, maxResultsCount, resultsList); } if (xmlExport.empty() == false) { OpenSearchExporter exporter(xmlExport, queryProps); exporter.exportResults(engineName, maxResultsCount, resultsList); } } } else { cerr << "Couldn't get a results list !" << endl; } } else { cerr << "Couldn't run query on search engine " << engineType << endl; } delete pEngine; XapianDatabaseFactory::closeAll(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_SUCCESS; }
void EngineQueryThread::doWork(void) { PinotSettings &settings = PinotSettings::getInstance(); // Get the SearchEngine SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption); if (pEngine == NULL) { m_errorNum = UNKNOWN_ENGINE; m_errorParam = m_engineDisplayableName; return; } // Set up the proxy DownloaderInterface *pDownloader = pEngine->getDownloader(); if ((pDownloader != NULL) && (settings.m_proxyEnabled == true) && (settings.m_proxyAddress.empty() == false)) { char portStr[64]; pDownloader->setSetting("proxyaddress", settings.m_proxyAddress); snprintf(portStr, 64, "%u", settings.m_proxyPort); pDownloader->setSetting("proxyport", portStr); pDownloader->setSetting("proxytype", settings.m_proxyType); } if (m_listingIndex == false) { pEngine->setLimitSet(m_limitToDocsSet); } // Run the query pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND); if (pEngine->runQuery(m_queryProps, m_startDoc) == false) { m_errorNum = QUERY_FAILED; m_errorParam = m_engineDisplayableName; } else { const vector<DocumentInfo> &resultsList = pEngine->getResults(); m_documentsList.clear(); m_documentsList.reserve(resultsList.size()); m_documentsCount = pEngine->getResultsCountEstimate(); #ifdef DEBUG cout << "EngineQueryThread::doWork: " << resultsList.size() << " off " << m_documentsCount << " results to process, starting at position " << m_startDoc << endl; #endif m_resultsCharset = pEngine->getResultsCharset(); if (m_listingIndex == false) { processResults(resultsList); } else { processResults(resultsList, PinotSettings::getInstance().getIndexIdByName(m_engineDisplayableName)); } // Any spelling correction ? string correctedFreeQuery(pEngine->getSpellingCorrection()); if (correctedFreeQuery.empty() == false) { m_correctedSpelling = true; m_queryProps.setFreeQuery(correctedFreeQuery); } } delete pEngine; }
int main(int argc, char **argv) { QueryProperties::QueryType queryType = QueryProperties::XAPIAN_QP; string engineType, option, csvExport, xmlExport, stemLanguage; unsigned int maxResultsCount = 10; int longOptionIndex = 0; bool printResults = true; bool sortByDate = false; bool locationOnly = false; bool isStoredQuery = false; // Look at the options int optionChar = getopt_long(argc, argv, "c:dhlm:rs:vx:", g_longOptions, &longOptionIndex); while (optionChar != -1) { switch (optionChar) { case 'c': if (optarg != NULL) { csvExport = optarg; printResults = false; } break; case 'd': sortByDate = true; break; case 'h': printHelp(); return EXIT_SUCCESS; case 'l': locationOnly = true; break; case 'm': if (optarg != NULL) { maxResultsCount = (unsigned int )atoi(optarg); } break; case 'r': isStoredQuery = true; break; case 's': if (optarg != NULL) { stemLanguage = optarg; } break; case 'v': clog << "pinot-search - " << PACKAGE_STRING << "\n\n" << "This is free software. You may redistribute copies of it under the terms of\n" << "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n" << "There is NO WARRANTY, to the extent permitted by law." 
<< endl; return EXIT_SUCCESS; case 'x': if (optarg != NULL) { xmlExport = optarg; printResults = false; } break; default: return EXIT_FAILURE; } // Next option optionChar = getopt_long(argc, argv, "c:dhlm:rs:vx:", g_longOptions, &longOptionIndex); } #if defined(ENABLE_NLS) bindtextdomain(GETTEXT_PACKAGE, PACKAGE_LOCALE_DIR); bind_textdomain_codeset(GETTEXT_PACKAGE, "UTF-8"); textdomain(GETTEXT_PACKAGE); #endif //ENABLE_NLS if (argc == 1) { printHelp(); return EXIT_SUCCESS; } if ((argc < 4) || (argc - optind != 3)) { clog << "Wrong number of parameters" << endl; return EXIT_FAILURE; } // This will create the necessary directories on the first run PinotSettings &settings = PinotSettings::getInstance(); string confDirectory(PinotSettings::getConfigurationDirectory()); if (MIMEScanner::initialize(PinotSettings::getHomeDirectory() + "/.local", string(SHARED_MIME_INFO_PREFIX)) == false) { clog << "Couldn't load MIME settings" << endl; } DownloaderInterface::initialize(); ModuleFactory::loadModules(string(LIBDIR) + string("/pinot/backends")); ModuleFactory::loadModules(confDirectory + "/backends"); // Localize language names Languages::setIntlName(0, _("Unknown")); Languages::setIntlName(1, _("Danish")); Languages::setIntlName(2, _("Dutch")); Languages::setIntlName(3, _("English")); Languages::setIntlName(4, _("Finnish")); Languages::setIntlName(5, _("French")); Languages::setIntlName(6, _("German")); Languages::setIntlName(7, _("Hungarian")); Languages::setIntlName(8, _("Italian")); Languages::setIntlName(9, _("Norwegian")); Languages::setIntlName(10, _("Portuguese")); Languages::setIntlName(11, _("Romanian")); Languages::setIntlName(12, _("Russian")); Languages::setIntlName(13, _("Spanish")); Languages::setIntlName(14, _("Swedish")); Languages::setIntlName(15, _("Turkish")); // Load the settings settings.load(PinotSettings::LOAD_ALL); engineType = argv[optind]; option = argv[optind + 1]; char *pQueryInput = argv[optind + 2]; // Set the query QueryProperties 
queryProps("pinot-search", "", queryType); if (queryType == QueryProperties::XAPIAN_QP) { if (isStoredQuery == true) { const map<string, QueryProperties> &queries = settings.getQueries(); map<string, QueryProperties>::const_iterator queryIter = queries.find(pQueryInput); if (queryIter != queries.end()) { queryProps = queryIter->second; } else { clog << "Couldn't find stored query " << pQueryInput << endl; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } } else { queryProps.setFreeQuery(pQueryInput); } } queryProps.setStemmingLanguage(stemLanguage); queryProps.setMaximumResultsCount(maxResultsCount); if (sortByDate == true) { queryProps.setSortOrder(QueryProperties::DATE); } // Which SearchEngine ? SearchEngineInterface *pEngine = ModuleFactory::getSearchEngine(engineType, option); if (pEngine == NULL) { clog << "Couldn't obtain search engine instance" << endl; DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } // Set up the proxy WebEngine *pWebEngine = dynamic_cast<WebEngine *>(pEngine); if (pWebEngine != NULL) { DownloaderInterface *pDownloader = pWebEngine->getDownloader(); if ((pDownloader != NULL) && (settings.m_proxyEnabled == true) && (settings.m_proxyAddress.empty() == false)) { char portStr[64]; pDownloader->setSetting("proxyaddress", settings.m_proxyAddress); snprintf(portStr, 64, "%u", settings.m_proxyPort); pDownloader->setSetting("proxyport", portStr); pDownloader->setSetting("proxytype", settings.m_proxyType); } pWebEngine->setEditableValues(settings.m_editablePluginValues); } pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND); if (pEngine->runQuery(queryProps) == true) { string resultsPage; unsigned int estimatedResultsCount = pEngine->getResultsCountEstimate(); const vector<DocumentInfo> &resultsList = pEngine->getResults(); if (resultsList.empty() == false) { if (printResults == true) { unsigned int count = 0; if (locationOnly == false) { clog << "Showing " << 
resultsList.size() << " results of about " << estimatedResultsCount << endl; } vector<DocumentInfo>::const_iterator resultIter = resultsList.begin(); while (resultIter != resultsList.end()) { string rawUrl(resultIter->getLocation(true)); if (locationOnly == false) { clog << count << " Location : '" << rawUrl << "'"<< endl; clog << count << " Title : " << resultIter->getTitle() << endl; clog << count << " Type : " << resultIter->getType() << endl; clog << count << " Language : " << resultIter->getLanguage() << endl; clog << count << " Date : " << resultIter->getTimestamp() << endl; clog << count << " Size : " << resultIter->getSize() << endl; clog << count << " Extract : " << resultIter->getExtract() << endl; clog << count << " Score : " << resultIter->getScore() << endl; } else { clog << rawUrl << endl; } ++count; // Next ++resultIter; } } else { string engineName(ModuleFactory::getSearchEngineName(engineType, option)); if (csvExport.empty() == false) { CSVExporter exporter(csvExport, queryProps); exporter.exportResults(engineName, maxResultsCount, resultsList); } if (xmlExport.empty() == false) { OpenSearchExporter exporter(xmlExport, queryProps); exporter.exportResults(engineName, maxResultsCount, resultsList); } } } else { clog << "No results" << endl; } } else { clog << "Couldn't run query on search engine " << engineType << endl; } delete pEngine; ModuleFactory::unloadModules(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_SUCCESS; }
void QueryingThread::doWork(void) { PinotSettings &settings = PinotSettings::getInstance(); // Get the SearchEngine SearchEngineInterface *pEngine = SearchEngineFactory::getSearchEngine(m_engineName, m_engineOption); if (pEngine == NULL) { m_status = _("Couldn't create search engine"); m_status += " "; m_status += m_engineDisplayableName; return; } // Set up the proxy DownloaderInterface *pDownloader = pEngine->getDownloader(); if ((pDownloader != NULL) && (settings.m_proxyEnabled == true) && (settings.m_proxyAddress.empty() == false)) { char portStr[64]; pDownloader->setSetting("proxyaddress", settings.m_proxyAddress); snprintf(portStr, 64, "%u", settings.m_proxyPort); pDownloader->setSetting("proxyport", portStr); pDownloader->setSetting("proxytype", settings.m_proxyType); } // Run the query if (pEngine->runQuery(m_queryProps, m_startDoc) == false) { m_status = _("Couldn't run query on search engine"); m_status += " "; m_status += m_engineDisplayableName; } else { const vector<DocumentInfo> &resultsList = pEngine->getResults(); m_documentsList.clear(); m_documentsList.reserve(resultsList.size()); m_documentsCount = pEngine->getResultsCountEstimate(); #ifdef DEBUG cout << "QueryingThread::doWork: " << resultsList.size() << " off " << m_documentsCount << " results to process, starting at position " << m_startDoc << endl; #endif m_resultsCharset = pEngine->getResultsCharset(); if (m_listingIndex == false) { processResults(resultsList); } else { processResults(resultsList, PinotSettings::getInstance().getIndexId(m_engineDisplayableName)); } } delete pEngine; }
int main(int argc, char **argv) { string type, option; string databaseName, proxyAddress, proxyPort, proxyType; int longOptionIndex = 0; unsigned int docId = 0; bool checkDocument = false, indexDocument = false, showInfo = false, success = false; // Look at the options int optionChar = getopt_long(argc, argv, "cd:hia:p:t:sv", g_longOptions, &longOptionIndex); while (optionChar != -1) { set<string> engines; switch (optionChar) { case 'a': if (optarg != NULL) { proxyAddress = optarg; } break; case 'c': checkDocument = true; break; case 'd': if (optarg != NULL) { databaseName = optarg; } break; case 'h': printHelp(); return EXIT_SUCCESS; case 'i': indexDocument = true; break; case 'p': if (optarg != NULL) { proxyPort = optarg; } break; case 's': showInfo = true; break; case 't': if (optarg != NULL) { proxyType = optarg; } break; case 'v': cout << "pinot-index - " << PACKAGE_STRING << "\n\n" << "This is free software. You may redistribute copies of it under the terms of\n" << "the GNU General Public License <http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>.\n" << "There is NO WARRANTY, to the extent permitted by law." 
<< endl; return EXIT_SUCCESS; default: return EXIT_FAILURE; } // Next option optionChar = getopt_long(argc, argv, "cd:hia:p:t:sv", g_longOptions, &longOptionIndex); } if (argc == 1) { printHelp(); return EXIT_SUCCESS; } if ((argc < 2) || (argc - optind == 0)) { cerr << "Not enough parameters" << endl; return EXIT_FAILURE; } if (((indexDocument == false) && (checkDocument == false)) || (databaseName.empty() == true)) { cerr << "Incorrect parameters" << endl; return EXIT_FAILURE; } MIMEScanner::initialize("", ""); DownloaderInterface::initialize(); // Localize language names Languages::setIntlName(0, "Unknown"); Languages::setIntlName(1, "Danish"); Languages::setIntlName(2, "Dutch"); Languages::setIntlName(3, "English"); Languages::setIntlName(4, "Finnish"); Languages::setIntlName(5, "French"); Languages::setIntlName(6, "German"); Languages::setIntlName(7, "Hungarian"); Languages::setIntlName(8, "Italian"); Languages::setIntlName(9, "Norwegian"); Languages::setIntlName(10, "Portuguese"); Languages::setIntlName(11, "Romanian"); Languages::setIntlName(12, "Russian"); Languages::setIntlName(13, "Spanish"); Languages::setIntlName(14, "Swedish"); Languages::setIntlName(15, "Turkish"); Dijon::HtmlFilter::initialize(); Dijon::FilterFactory::loadFilters(string(LIBDIR) + string("/pinot/filters")); // Make sure the index is open in the correct mode XapianDatabase *pDb = XapianDatabaseFactory::getDatabase(databaseName, (indexDocument ? 
false : true)); if (pDb == NULL) { cerr << "Couldn't open index " << databaseName << endl; Dijon::FilterFactory::unloadFilters(); Dijon::HtmlFilter::shutdown(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } // Get a read-write index of the given type IndexInterface *pIndex = IndexFactory::getIndex("xapian", databaseName); if (pIndex == NULL) { cerr << "Couldn't obtain index for " << databaseName << endl; XapianDatabaseFactory::closeAll(); Dijon::FilterFactory::unloadFilters(); Dijon::HtmlFilter::shutdown(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } while (optind < argc) { string urlParam(argv[optind]); if (checkDocument == true) { if (pIndex->isGood() == true) { docId = pIndex->hasDocument(urlParam); if (docId > 0) { cout << urlParam << ": document ID " << docId << endl; success = true; } } } if (indexDocument == true) { Url thisUrl(urlParam); // Which Downloader ? DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol()); if (pDownloader == NULL) { cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl; success = false; continue; } // Set up the proxy if ((proxyAddress.empty() == false) && (proxyPort.empty() == false)) { pDownloader->setSetting("proxyaddress", proxyAddress); pDownloader->setSetting("proxyport", proxyPort); pDownloader->setSetting("proxytype", proxyType); } DocumentInfo docInfo("", urlParam, MIMEScanner::scanUrl(thisUrl), ""); Document *pDoc = pDownloader->retrieveUrl(docInfo); if (pDoc == NULL) { cerr << "Couldn't download " << urlParam << endl; } else { FilterWrapper wrapFilter(pIndex); set<string> labels; // Update an existing document or add to the index ? 
docId = pIndex->hasDocument(urlParam); if (docId > 0) { // Update the document if (wrapFilter.updateDocument(*pDoc, docId) == true) { success = true; } } else { // Index the document success = wrapFilter.indexDocument(*pDoc, labels, docId); } if (success == true) { // Flush the index pIndex->flush(); } delete pDoc; } delete pDownloader; } if ((showInfo == true) && (docId > 0)) { DocumentInfo docInfo; set<string> labels; if (pIndex->getDocumentInfo(docId, docInfo) == true) { cout << "Location : '" << docInfo.getLocation() << "'" << endl; cout << "Title : " << docInfo.getTitle() << endl; cout << "Type : " << docInfo.getType() << endl; cout << "Language : " << docInfo.getLanguage() << endl; cout << "Date : " << docInfo.getTimestamp() << endl; cout << "Size : " << docInfo.getSize() << endl; } if (pIndex->getDocumentLabels(docId, labels) == true) { cout << "Labels : "; for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { if (labelIter->substr(0, 2) == "X-") { continue; } cout << "[" << Url::escapeUrl(*labelIter) << "]"; } cout << endl; } } // Next ++optind; } delete pIndex; XapianDatabaseFactory::closeAll(); Dijon::FilterFactory::unloadFilters(); Dijon::HtmlFilter::shutdown(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); // Did whatever operation we carried out succeed ? if (success == true) { return EXIT_SUCCESS; } return EXIT_FAILURE; }
int main(int argc, char **argv) { string type, option; string databaseName, urlToCheck, urlToIndex; int longOptionIndex = 0; unsigned int docId = 0; bool checkDocument = false, indexDocument = false, showInfo = false, success = false; // Look at the options int optionChar = getopt_long(argc, argv, "c:d:hi:sv", g_longOptions, &longOptionIndex); while (optionChar != -1) { set<string> engines; switch (optionChar) { case 'c': if (optarg != NULL) { urlToCheck = optarg; } checkDocument = true; break; case 'd': if (optarg != NULL) { databaseName = optarg; } checkDocument = true; break; case 'h': // Help cout << "pinot-index - Index documents from the command-line\n\n" << "Usage: pinot-index [OPTIONS]\n\n" << "Options:\n" << " -c, --check check whether the given URL is in the index\n" << " -d, --db path to index to use (mandatory)\n" << " -h, --help display this help and exit\n" << " -i, --index index the given URL\n" << " -s, --showinfo show information about the document\n" << " -v, --version output version information and exit\n\n"; // Don't mention type dbus here as it doesn't support indexing and // is identical to xapian when checking for URLs cout << "Examples:\n" << "pinot-index --check file:///home/fabrice/Documents/Bozo.txt --showinfo --db ~/.pinot/daemon\n\n" << "pinot-index --index http://pinot.berlios.de/ --db ~/.pinot/index\n\n" << "Report bugs to " << PACKAGE_BUGREPORT << endl; return EXIT_SUCCESS; case 'i': if (optarg != NULL) { urlToIndex = optarg; } indexDocument = true; break; case 's': showInfo = true; break; case 'v': cout << "pinot-index - " << PACKAGE_STRING << "\n\n" << "This is free software. You may redistribute copies of it under the terms of\n" << "the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.\n" << "There is NO WARRANTY, to the extent permitted by law." 
<< endl; return EXIT_SUCCESS; default: return EXIT_FAILURE; } // Next option optionChar = getopt_long(argc, argv, "c:d:hi:sv", g_longOptions, &longOptionIndex); } if (((indexDocument == false) && (checkDocument == false)) || (databaseName.empty() == true)) { cerr << "Incorrect parameters" << endl; return EXIT_FAILURE; } MIMEScanner::initialize(); DownloaderInterface::initialize(); Dijon::FilterFactory::loadFilters(string(LIBDIR) + string("/pinot/filters")); // Make sure the index is open in the correct mode XapianDatabase *pDb = XapianDatabaseFactory::getDatabase(databaseName, (indexDocument ? false : true)); if (pDb == NULL) { cerr << "Couldn't open index " << databaseName << endl; Dijon::FilterFactory::unloadFilters(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } // Get a read-write index of the given type IndexInterface *pIndex = IndexFactory::getIndex("xapian", databaseName); if (pIndex == NULL) { cerr << "Couldn't obtain index for " << databaseName << endl; XapianDatabaseFactory::closeAll(); Dijon::FilterFactory::unloadFilters(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } if (checkDocument == true) { if (pIndex->isGood() == true) { docId = pIndex->hasDocument(urlToCheck); if (docId > 0) { cout << urlToCheck << ": document ID " << docId << endl; success = true; } } } if (indexDocument == true) { Url thisUrl(urlToIndex); // Which Downloader ? DownloaderInterface *pDownloader = DownloaderFactory::getDownloader(thisUrl.getProtocol()); if (pDownloader == NULL) { cerr << "Couldn't obtain downloader for protocol " << thisUrl.getProtocol() << endl; XapianDatabaseFactory::closeAll(); Dijon::FilterFactory::unloadFilters(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); return EXIT_FAILURE; } DocumentInfo docInfo(urlToIndex, urlToIndex, MIMEScanner::scanUrl(thisUrl), ""); Document *pDoc = pDownloader->retrieveUrl(docInfo); if (pDoc == NULL) { cerr << "Download operation failed !" 
<< endl; } else { set<string> labels; pIndex->setStemmingMode(IndexInterface::STORE_BOTH); // Update an existing document or add to the index ? docId = pIndex->hasDocument(urlToIndex); if (docId > 0) { // Update the document if (FilterWrapper::updateDocument(docId, *pIndex, *pDoc) == true) { success = true; } } else { // Index the document success = FilterWrapper::indexDocument(*pIndex, *pDoc, labels, docId); } if (success == true) { // Flush the index pIndex->flush(); } delete pDoc; } delete pDownloader; } if ((showInfo == true) && (docId > 0)) { DocumentInfo docInfo; set<string> labels; if (pIndex->getDocumentInfo(docId, docInfo) == true) { cout << "Title: " << docInfo.getTitle() << endl; cout << "Location: " << docInfo.getLocation() << endl; cout << "Type: " << docInfo.getType() << endl; cout << "Language: " << docInfo.getLanguage() << endl; cout << "Timestamp: " << docInfo.getTimestamp() << endl; cout << "Size: " << docInfo.getSize() << endl; } if (pIndex->getDocumentLabels(docId, labels) == true) { cout << "Labels:"; for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end(); ++labelIter) { cout << " '" << *labelIter << "'"; } cout << endl; } } delete pIndex; XapianDatabaseFactory::closeAll(); Dijon::FilterFactory::unloadFilters(); DownloaderInterface::shutdown(); MIMEScanner::shutdown(); // Did whatever operation we carried out succeed ? if (success == true) { return EXIT_SUCCESS; } return EXIT_FAILURE; }