Пример #1
0
  //========================================================
  void parseUrl(const string &href, UrlElement &starturl) {
    UrlElement newUrl = starturl.parseUrl(href);

    if (newUrl.isValid() && (extractNonHttp || newUrl.is_http()) &&
        addEdge(starturl, newUrl, nullptr, nullptr))
      addUrl(newUrl, visitOther || (newUrl.server == starturl.server));
  }
Пример #2
0
  //========================================================
  bool addNode(const UrlElement &url, node &n) {
    if (nodes.find(url) == nodes.end()) {
      // no more added node after maxSize
      if (graph->numberOfNodes() == maxSize) {
        n = node();
        return false;
      }

      n = graph->addNode();
      stringstream str;
      str << url.server;

      if (url.url[0] != '/')
        str << "/";

      str << url.getUrl();
      labels->setNodeValue(n, urlDecode(str.str()));
      ostringstream oss;

      if (url.is_http()) {
        oss << url.http_prefix.c_str();
      }

      oss << str.str();
      urls->setNodeValue(n, oss.str());
      nodes[url] = n;
      return true;
    }

    n = nodes[url];
    return false;
  }
  // Strict ordering of urls: by server first, then by the value of getUrl().
  bool operator()(const UrlElement &lhs, const UrlElement &rhs) const {
    const int serverOrder = lhs.server.compare(rhs.server);

    // different servers: the server name alone decides
    if (serverOrder != 0)
      return serverOrder < 0;

    // same server: fall back on the url
    return lhs.getUrl().compare(rhs.getUrl()) < 0;
  }
Пример #4
0
  //========================================================
  // Appends `url` to the toVisit list when it is requested (_toVisit), when
  // the url is an http one and when it is not already in `visited`.
  void addUrl(const UrlElement &url, bool _toVisit) {
    const bool alreadyVisited = (visited.find(url) != visited.end());

    if (!alreadyVisited && _toVisit && url.is_http())
      toVisit.push_back(url);
  }
  //========================================================
  // Main crawling loop: processes every url handed out by nextUrl().
  // Html pages are either followed through their redirection or loaded and
  // given to parseHtml(); other urls are skipped.
  // Returns true when the loop runs to its end. When the progress handler asks
  // to stop, returns false only if its state is TLP_CANCEL.
  bool start() {
    UrlElement url;
    // the progress handler is refreshed each time nbNodes is a multiple of step
    unsigned step = 20;

    // presumably nextUrl() takes the next element of toVisit -- verify against its definition
    while (nextUrl(url)) {
      if (url.isHtmlPage()) {
        if (pluginProgress && ((nbNodes % step) == 0)) {
          pluginProgress->setComment(string("Visiting ") +
                                     urlDecode(url.server + url.url));

          // anything but TLP_CONTINUE ends the crawl; only a cancel is reported as a failure
          if (pluginProgress->progress(nbNodes, maxSize) !=TLP_CONTINUE)
            return pluginProgress->state()!= TLP_CANCEL;
        }

#ifndef NDEBUG
        tlp::warning() << "Visiting: " << url.server << url.url << " ..."  << std::endl << flush;
#endif

        if (url.isRedirected()) {
          // a redirected page is not parsed: only the redirection edge is recorded
          UrlElement redirection = url.getRedirection();

          if (redirection.isValid()) {
#ifndef NDEBUG
            tlp::warning() << endl << "redirected to " << redirection.server << redirection.url << endl;
#endif

            // the target is queued only for the same server, unless visitOther is set
            if (addEdge(url, redirection,  "redirection", redirectionColor))
              addUrl(redirection,
                     visitOther || redirection.server == url.server);
          }
          else
            tlp::warning() << endl << "invalid redirection" << endl;
        }
        else {
          // regular page: load it, parse it, then clear the element
          // NOTE(review): the result of load() is not checked before parseHtml()
          url.load();
          parseHtml(url);
          url.clear();
#ifndef NDEBUG
          tlp::warning() << " done" << endl << flush;
#endif
        }
      }

      // debug builds only: trace the urls skipped because they are not html pages
#ifndef NDEBUG
      else
        tlp::warning() << "Omitting : " << url.server << url.url << " ==> [not html]"<< endl;

#endif
    }

    return true;
  }
Пример #6
0
  bool importGraph() override {
    string server = "www.labri.fr";
    string url;
    bool computelayout = true;
    Color pColor(255, 0, 0);
    Color lColor(0, 0, 255, 128);
    Color rColor(255, 255, 0, 128);
    maxSize = 1000;
    visitOther = false;
    extractNonHttp = true;

    if (dataSet != nullptr) {
      dataSet->get("server", server);
      dataSet->get("web page", url);
      dataSet->get("max size", maxSize);
      dataSet->get("non http links", extractNonHttp);
      dataSet->get("other server", visitOther);
      dataSet->get("compute layout", computelayout);
      dataSet->get("page color", pColor);
      dataSet->get("link color", lColor);
      dataSet->get("redirection color", rColor);
    }

    UrlElement mySite;
    size_t pos = server.find("http://");

    if (pos == 0)
      // remove http:// prefix
      server = server.substr(7);

    // remove / prefix
    if (server[0] == '/')
      server = server.substr(1);

    if (server[server.size() - 1] == '/')
      // remove / suffix
      server = server.substr(0, server.size() - 1);

    mySite.server = server;

    if (url[0] == '/')
      // remove / prefix
      url = url.substr(1);

    mySite.setUrl(string("/") + url);
    mySite.data = "";

    labels = graph->getProperty<StringProperty>("viewLabel");
    labels->setAllEdgeValue(string("link"));
    urls = graph->getProperty<StringProperty>("url");
    colors = graph->getProperty<ColorProperty>("viewColor");
    colors->setAllNodeValue(pColor);
    colors->setAllEdgeValue(lColor);
    redirectionColor = &rColor;

    graph->getProperty<IntegerProperty>("viewShape")
        ->setAllNodeValue(14); // GlyphManager::getInst().glyphId("2D - Circle")

    if (!mySite.load()) {
      if (pluginProgress) {
        stringstream sstr;
        sstr << "Unable to access http://" << mySite.server << mySite.url << " (ERROR "
             << mySite.getCode() << ')';
        pluginProgress->setError(sstr.str());
      }

      return false;
    }

    node n;
    toVisit.push_back(mySite);
    addNode(mySite, n);

    if (pluginProgress) {
      pluginProgress->showPreview(false);
      pluginProgress->setComment(std::string("Visiting ") + mySite.server + mySite.url);
    }

    if (!start())
      return false;

    if (computelayout) {
      pluginProgress->setComment("Layouting extracted graph using FM³...");
      string errMsg;
      // apply FM³
      LayoutProperty *layout = graph->getProperty<LayoutProperty>("viewLayout");
      return graph->applyPropertyAlgorithm("FM^3 (OGDF)", layout, errMsg, nullptr, pluginProgress);
    }

    return true;
  }
Пример #7
0
/**
 * Builds the UrlElement designated by the link `href`, this element being the
 * page in which the link has been found.
 *
 *  - When href contains one of the rejected_protocols entries, the result has
 *    an empty http_prefix and, except for entry 3 (javascript according to the
 *    original comment), its server is set to href itself.
 *  - When href contains "://", the host and the path are extracted from href.
 *  - Otherwise href is a path on the server of this element; a path not
 *    starting with '/' is resolved against the directory of this->url.
 *
 * A default constructed element is returned when the link is empty, when it
 * resolves to "/" or when the relative path cannot be handled.
 */
UrlElement UrlElement::parseUrl(const std::string &href) {
  UrlElement newUrl;
  string lowercase(href);
  size_t i, len = lowercase.length();

  // tolower() is undefined for negative values other than EOF, which a plain
  // char holding a non ASCII byte may produce: convert through unsigned char
  for (i = 0; i < len; ++i)
    lowercase[i] = static_cast<char>(tolower(static_cast<unsigned char>(lowercase[i])));

  // rejected_protocols is a table ended by a null pointer;
  // i stops on the first entry found in the link
  for (i = 0; rejected_protocols[i] != nullptr; i++) {
    if (lowercase.find(rejected_protocols[i]) != string::npos)
      break;
  }

  if (rejected_protocols[i]) {
    newUrl.http_prefix.clear();

    if (i != 3 /* no javascript */)
      newUrl.server = href;

    return newUrl;
  }

  // NOTE(review): the last "://" is used, so an absolute url embedded in the
  // query of another url takes precedence -- confirm this is intended
  size_t pos = lowercase.rfind("://");
  bool host = false;

  if (pos == string::npos)
    pos = 0;
  else {
    host = true;

    // a scheme ending with 's' is handled as https;
    // pos is 0 when href starts with "://", there is then no char to look at
    if (pos > 0 && lowercase[pos - 1] == 's')
      newUrl.http_prefix = "https://";

    pos += 3;
  }

  if (host) {
    // the host name ends at the first '/' or space following the scheme
    size_t endhost = lowercase.find_first_of("/ ", pos);

    if (endhost == string::npos)
      endhost = len;

    string hostname = href.substr(pos, endhost - pos);
    newUrl.server = hostname;
    newUrl.setUrl(href.substr(endhost));
  } else {
    // drop the fragment part of the link
    size_t querystart = lowercase.find_first_of("#", pos); /* previously ?#  instead of # */

    if (querystart != string::npos)
      len = querystart;

    string theUrl = href.substr(pos, len - pos);

    if (theUrl.empty())
      return newUrl;

    // Manage relative urls
    if (theUrl[0] != '/') {
      // urlreference is the directory part of the current url, ending with '/'
      string urlreference(this->url);
      size_t findUp = urlreference.rfind("/", urlreference.length());

      if (findUp == string::npos) {
        urlreference.clear();
        urlreference.append(1, '/');
      } else
        urlreference = urlreference.substr(0, findUp + 1);

      size_t cur;

      // remove space chars at the beginning
      for (cur = 0; cur < theUrl.size(); ++cur) {
        char c = theUrl[cur];

        if (c != ' ' && c != '\t')
          break;
      }

      if (cur > 0)
        theUrl = theUrl.substr(cur);

      // consume the "./" and "../" found in the relative path
      while ((cur = theUrl.find("./")) != string::npos) {
        if (cur == 0) {
          // leading "./": same directory
          theUrl = theUrl.substr(2);
          continue;
        }

        if (theUrl[cur - 1] == '.') {
          // "../": move the reference one directory up
          // NOTE(review): substr(3) assumes the "../" is at the very
          // beginning of theUrl -- a "../" found later is not handled properly
          theUrl = theUrl.substr(3);
          findUp = urlreference.rfind('/', findUp - 1);

          if (findUp == string::npos) {
            tlp::warning() << "bad url reference, to much ../" << endl;
            return newUrl;
          }

          urlreference = urlreference.substr(0, findUp + 1);
        } else {
          // any other "./" (neither leading nor part of "../") is rejected
          tlp::warning() << "bad url reference, to much ../" << endl;
          return newUrl;
        }
      }

      theUrl = urlreference + theUrl;
    }

    if (theUrl != "/") {
      newUrl.setUrl(theUrl);
      newUrl.server = this->server;
    }
  }

  return newUrl;
}