//======================================================== void parseUrl(const string &href, UrlElement &starturl) { UrlElement newUrl = starturl.parseUrl(href); if (newUrl.isValid() && (extractNonHttp || newUrl.is_http()) && addEdge(starturl, newUrl, nullptr, nullptr)) addUrl(newUrl, visitOther || (newUrl.server == starturl.server)); }
//======================================================== bool addNode(const UrlElement &url, node &n) { if (nodes.find(url) == nodes.end()) { // no more added node after maxSize if (graph->numberOfNodes() == maxSize) { n = node(); return false; } n = graph->addNode(); stringstream str; str << url.server; if (url.url[0] != '/') str << "/"; str << url.getUrl(); labels->setNodeValue(n, urlDecode(str.str())); ostringstream oss; if (url.is_http()) { oss << url.http_prefix.c_str(); } oss << str.str(); urls->setNodeValue(n, oss.str()); nodes[url] = n; return true; } n = nodes[url]; return false; }
/**
 * Ordering predicate on UrlElement: elements are compared on their server
 * first and, when the servers are equal, on the value of getUrl().
 *
 * @return true when lhs sorts strictly before rhs
 */
bool operator()(const UrlElement &lhs, const UrlElement &rhs) const {
  const int serverOrder = lhs.server.compare(rhs.server);

  if (serverOrder != 0)
    return serverOrder < 0;

  // same server: the url decides
  return lhs.getUrl().compare(rhs.getUrl()) < 0;
}
//======================================================== void addUrl(const UrlElement &url, bool _toVisit) { if (visited.find(url) != visited.end()) return; if (_toVisit && url.is_http()) toVisit.push_back(url); }
//======================================================== bool start() { UrlElement url; unsigned step = 20; while (nextUrl(url)) { if (url.isHtmlPage()) { if (pluginProgress && ((nbNodes % step) == 0)) { pluginProgress->setComment(string("Visiting ") + urlDecode(url.server + url.url)); if (pluginProgress->progress(nbNodes, maxSize) !=TLP_CONTINUE) return pluginProgress->state()!= TLP_CANCEL; } #ifndef NDEBUG tlp::warning() << "Visiting: " << url.server << url.url << " ..." << std::endl << flush; #endif if (url.isRedirected()) { UrlElement redirection = url.getRedirection(); if (redirection.isValid()) { #ifndef NDEBUG tlp::warning() << endl << "redirected to " << redirection.server << redirection.url << endl; #endif if (addEdge(url, redirection, "redirection", redirectionColor)) addUrl(redirection, visitOther || redirection.server == url.server); } else tlp::warning() << endl << "invalid redirection" << endl; } else { url.load(); parseHtml(url); url.clear(); #ifndef NDEBUG tlp::warning() << " done" << endl << flush; #endif } } #ifndef NDEBUG else tlp::warning() << "Omitting : " << url.server << url.url << " ==> [not html]"<< endl; #endif } return true; }
bool importGraph() override { string server = "www.labri.fr"; string url; bool computelayout = true; Color pColor(255, 0, 0); Color lColor(0, 0, 255, 128); Color rColor(255, 255, 0, 128); maxSize = 1000; visitOther = false; extractNonHttp = true; if (dataSet != nullptr) { dataSet->get("server", server); dataSet->get("web page", url); dataSet->get("max size", maxSize); dataSet->get("non http links", extractNonHttp); dataSet->get("other server", visitOther); dataSet->get("compute layout", computelayout); dataSet->get("page color", pColor); dataSet->get("link color", lColor); dataSet->get("redirection color", rColor); } UrlElement mySite; size_t pos = server.find("http://"); if (pos == 0) // remove http:// prefix server = server.substr(7); // remove / prefix if (server[0] == '/') server = server.substr(1); if (server[server.size() - 1] == '/') // remove / suffix server = server.substr(0, server.size() - 1); mySite.server = server; if (url[0] == '/') // remove / prefix url = url.substr(1); mySite.setUrl(string("/") + url); mySite.data = ""; labels = graph->getProperty<StringProperty>("viewLabel"); labels->setAllEdgeValue(string("link")); urls = graph->getProperty<StringProperty>("url"); colors = graph->getProperty<ColorProperty>("viewColor"); colors->setAllNodeValue(pColor); colors->setAllEdgeValue(lColor); redirectionColor = &rColor; graph->getProperty<IntegerProperty>("viewShape") ->setAllNodeValue(14); // GlyphManager::getInst().glyphId("2D - Circle") if (!mySite.load()) { if (pluginProgress) { stringstream sstr; sstr << "Unable to access http://" << mySite.server << mySite.url << " (ERROR " << mySite.getCode() << ')'; pluginProgress->setError(sstr.str()); } return false; } node n; toVisit.push_back(mySite); addNode(mySite, n); if (pluginProgress) { pluginProgress->showPreview(false); pluginProgress->setComment(std::string("Visiting ") + mySite.server + mySite.url); } if (!start()) return false; if (computelayout) { pluginProgress->setComment("Layouting extracted 
graph using FM³..."); string errMsg; // apply FM³ LayoutProperty *layout = graph->getProperty<LayoutProperty>("viewLayout"); return graph->applyPropertyAlgorithm("FM^3 (OGDF)", layout, errMsg, nullptr, pluginProgress); } return true; }
/**
 * Builds the UrlElement designated by href, using this element as the base
 * for relative references.
 *
 * - If the lower-cased href contains one of the rejected_protocols entries,
 *   the result has an empty http_prefix and, except for entry 3, its server
 *   set to the raw href; no url is set.
 * - If href contains "://", what follows it up to the first '/' or ' ' is
 *   the server and the remainder is the url; http_prefix becomes "https://"
 *   when "://" is preceded by an 's'.
 * - Otherwise href is a path on this element's server: anything from the
 *   first '#' is dropped, and a path not starting with '/' is resolved
 *   against the directory part of this->url, consuming leading "./" and
 *   "../" segments.
 *
 * @param href the link text as found in the page
 * @return the resulting element; it is returned as default-constructed
 *         (no server / url set) when href is empty, reduces to "/", or when
 *         the relative reference cannot be resolved (a warning is emitted)
 */
UrlElement UrlElement::parseUrl(const std::string &href) {
  UrlElement newUrl;
  string lowercase(href);
  size_t i, len = lowercase.length();

  // tolower() requires a value representable as unsigned char (or EOF):
  // go through unsigned char so that bytes >= 0x80 are not passed negative
  for (i = 0; i < len; ++i)
    lowercase[i] = static_cast<char>(tolower(static_cast<unsigned char>(lowercase[i])));

  for (i = 0; rejected_protocols[i] != nullptr; i++) {
    if (lowercase.find(rejected_protocols[i]) != string::npos)
      break;
  }

  if (rejected_protocols[i]) {
    newUrl.http_prefix.clear();

    // presumably entry 3 of rejected_protocols is the javascript one
    if (i != 3 /* no javascript */)
      newUrl.server = href;

    return newUrl;
  }

  size_t pos = 0;
  bool host = false;
  pos = lowercase.rfind("://", len);

  if (pos == string::npos)
    pos = 0;
  else {
    host = true;

    // pos may be 0 (href starting with "://"): do not read before the string
    if (pos > 0 && lowercase[pos - 1] == 's')
      newUrl.http_prefix = "https://";

    pos += 3;
  }

  if (host) {
    size_t endhost = lowercase.find_first_of("/ ", pos);

    if (endhost == string::npos)
      endhost = len;

    string hostname = href.substr(pos, endhost - pos);
    newUrl.server = hostname;
    newUrl.setUrl(href.substr(endhost));
  } else {
    size_t querystart = lowercase.find_first_of("#", pos); /* previously ?# instead of # */

    if (querystart != string::npos)
      len = querystart;

    string theUrl = href.substr(pos, len - pos);

    if (theUrl.empty())
      return newUrl;

    // Manage relative urls
    if (theUrl[0] != '/') {
      // directory part (up to and including the last '/') of the current url
      string urlreference(this->url);
      size_t findUp = urlreference.rfind("/", urlreference.length());

      if (findUp == string::npos) {
        urlreference.clear();
        urlreference.append(1, '/');
      } else
        urlreference = urlreference.substr(0, findUp + 1);

      size_t idx;

      // remove space chars at the beginning
      for (idx = 0; idx < theUrl.size(); ++idx) {
        char c = theUrl[idx];

        if (c != ' ' && c != '\t')
          break;
      }

      if (idx > 0)
        theUrl = theUrl.substr(idx);

      // NOTE(review): the "../" case below always strips the first three
      // characters, i.e. it assumes the "../" found is at the very beginning
      // of theUrl — confirm this is the only form expected here.
      while ((idx = theUrl.find("./")) != string::npos) {
        if (idx == 0) {
          // "./" : stay in the same directory
          theUrl = theUrl.substr(2);
          continue;
        }

        if (theUrl[idx - 1] == '.') {
          // "../" : go one directory up in the reference
          theUrl = theUrl.substr(3);
          findUp = urlreference.rfind('/', findUp - 1);

          if (findUp == string::npos) {
            tlp::warning() << "bad url reference, to much ../" << endl;
            return newUrl;
          }

          urlreference = urlreference.substr(0, findUp + 1);
        } else {
          tlp::warning() << "bad url reference, to much ../" << endl;
          return newUrl;
        }
      }

      theUrl = urlreference + theUrl;
    }

    if (theUrl != "/") {
      newUrl.setUrl(theUrl);
      newUrl.server = this->server;
    }
  }

  return newUrl;
}