コード例 #1
0
ファイル: NavProcessor.cpp プロジェクト: PippaCarron/Sigil
/**
 * Returns the entries of the nav document's landmarks section.
 *
 * The nav resource text is parsed with Gumbo while holding the resource's
 * read lock. Only the first <nav> element whose epub:type attribute equals
 * "landmarks" is examined; one entry is produced per <a> found inside it.
 * An empty list is returned when there is no nav resource or no such <nav>.
 */
QList<NavLandmarkEntry> NavProcessor::GetLandmarks()
{
    QList<NavLandmarkEntry> landmarks;
    if (!m_NavResource) {
        return landmarks;
    }

    QReadLocker locker(&m_NavResource->GetLock());

    GumboInterface gi = GumboInterface(m_NavResource->GetText(), "3.0");
    gi.parse();

    const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
    for (int n = 0; n < nav_nodes.size(); ++n) {
        GumboNode* nav = nav_nodes.at(n);
        GumboAttribute* nav_type = gumbo_get_attribute(&nav->v.element.attributes, "epub:type");
        const bool is_landmarks = nav_type && (QString::fromUtf8(nav_type->value) == "landmarks");
        if (!is_landmarks) {
            continue;
        }

        QList<GumboTag> anchor_tags;
        anchor_tags << GUMBO_TAG_A;
        const QList<GumboNode*> anchors = gi.get_nodes_with_tags(nav, anchor_tags);

        for (int a = 0; a < anchors.size(); ++a) {
            GumboNode* anchor = anchors.at(a);
            GumboAttribute* type_attr = gumbo_get_attribute(&anchor->v.element.attributes, "epub:type");
            GumboAttribute* href_attr = gumbo_get_attribute(&anchor->v.element.attributes, "href");

            NavLandmarkEntry entry;
            if (type_attr) {
                entry.etype = QString::fromUtf8(type_attr->value);
            }
            if (href_attr) {
                entry.href = Utility::URLDecodePath(QString::fromUtf8(href_attr->value));
            }
            entry.title = Utility::DecodeXML(gi.get_local_text_of_node(anchor));
            landmarks.append(entry);
        }

        // Only the first landmarks nav is used.
        break;
    }
    return landmarks;
}
コード例 #2
0
	/**
	 * Parses a <div class="forum-post-text"> element and prints its text.
	 *
	 * The element's id attribute is expected to contain "message_text_"
	 * followed by the message id. Direct text children are converted from
	 * UTF-8 to CP1251 and concatenated; element children are only logged
	 * (quote parsing is still a FIXME) and other node types are reported as
	 * ignored.
	 *
	 * @param node  the div element to parse; must be an element node.
	 */
	void parseForumPostText(GumboNode* node)
	{
		assert(node->type == GUMBO_NODE_ELEMENT);
		assert(node->v.element.tag == GUMBO_TAG_DIV);

		GumboAttribute* classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
		assert(classAttr != NULL);
		assert(strCiCmp(classAttr->value, "forum-post-text"));

		GumboAttribute* idAttr = gumbo_get_attribute(&node->v.element.attributes, "id");
		assert(idAttr != NULL);
		// Asserts vanish under NDEBUG; constructing a std::string from NULL is undefined.
		if (idAttr == NULL) return;

		const std::string idPrefix("message_text_");
		std::string idStr(idAttr->value);
		std::size_t messageTextStrIndex = idStr.find(idPrefix);
		assert(messageTextStrIndex != std::string::npos);
		if (messageTextStrIndex == std::string::npos) return;

		// Take what follows the prefix wherever it was found, instead of
		// assuming the prefix starts at offset 0.
		std::string messageIdStr = idStr.substr(messageTextStrIndex + idPrefix.size());
		std::cout << "Forum post found, id: " << messageIdStr << std::endl;

		// FIXME: remove after debug completion
		if (!strCiCmp(messageIdStr, "4453758"))return;

		std::string forumPostText = "";
		GumboVector* nodeChildren = &node->v.element.children;
		for (std::size_t i = 0; i < nodeChildren->length; i++)
		{
			GumboNode* childNode = static_cast<GumboNode*>(nodeChildren->data[i]);
			assert(childNode != NULL);
			if (childNode->type == GUMBO_NODE_ELEMENT)
			{
				// FIXME: parse message quotes (<table> tags)

				std::cout << "Tag found: " << gumbo_normalized_tagname(childNode->v.element.tag);
				std::cout << ", children: " << childNode->v.element.children.length << std::endl;
			}
			else if (childNode->type == GUMBO_NODE_TEXT)
			{
				// Text arrives as UTF-8 from the parser; accumulate it as CP1251.
				std::string elementTextUtf8(childNode->v.text.text);
				std::string elementTextCp1251 = "";
				utf8ToCp1251(elementTextCp1251, elementTextUtf8);
				forumPostText += elementTextCp1251;
			}
			else std::cout << "Ignoring \"" << gumboElementTypeToString(childNode->type) << "\" node..." << std::endl;
		}

		std::cout << "Forum post: " << std::endl << forumPostText << std::endl;
	}
コード例 #3
0
ファイル: NavProcessor.cpp プロジェクト: PippaCarron/Sigil
/**
 * Flattens one <ol> of the nav TOC into a list of entries.
 *
 * For every <li> directly under the given <ol>, each direct <a> child yields
 * an entry at level lvl, and each direct nested <ol> child is processed
 * recursively at lvl + 1 with its entries appended in document order.
 * Returns an empty list when node is not an <ol> element.
 */
QList<NavTOCEntry> NavProcessor::GetNodeTOC(GumboInterface & gi, const GumboNode * node, int lvl)
{
    QList<NavTOCEntry> entries;

    const bool is_ordered_list = (node->type == GUMBO_NODE_ELEMENT) &&
                                 (node->v.element.tag == GUMBO_TAG_OL);
    if (!is_ordered_list) {
        return entries;
    }

    const GumboVector* items = &node->v.element.children;
    for (unsigned int item_idx = 0; item_idx < items->length; ++item_idx) {
        GumboNode* item = static_cast<GumboNode*>(items->data[item_idx]);
        if (item->type != GUMBO_NODE_ELEMENT || item->v.element.tag != GUMBO_TAG_LI) {
            continue;
        }

        const GumboVector* parts = &item->v.element.children;
        for (unsigned int part_idx = 0; part_idx < parts->length; ++part_idx) {
            GumboNode* part = static_cast<GumboNode*>(parts->data[part_idx]);
            if (part->type != GUMBO_NODE_ELEMENT) {
                continue;
            }

            switch (part->v.element.tag) {
                case GUMBO_TAG_A: {
                    NavTOCEntry entry;
                    entry.lvl = lvl;
                    GumboAttribute* href = gumbo_get_attribute(&part->v.element.attributes, "href");
                    if (href) {
                        entry.href = Utility::URLDecodePath(QString::fromUtf8(href->value));
                    }
                    entry.title = Utility::DecodeXML(gi.get_local_text_of_node(part));
                    entries.append(entry);
                    break;
                }
                case GUMBO_TAG_OL:
                    // Nested list: its entries sit one level deeper.
                    entries.append(GetNodeTOC(gi, part, lvl + 1));
                    break;
                default:
                    break;
            }
        }
    }
    return entries;
}
コード例 #4
0
ファイル: NavProcessor.cpp プロジェクト: PippaCarron/Sigil
/**
 * Returns the flattened table of contents stored in the nav document.
 *
 * The nav resource text is parsed with Gumbo while holding the resource's
 * read lock. Only the first <nav> whose epub:type attribute equals "toc" is
 * used; every <ol> reported for it by get_nodes_with_tags is passed to
 * GetNodeTOC at level 1. An empty list is returned when there is no nav
 * resource or no toc nav.
 */
QList<NavTOCEntry> NavProcessor::GetTOC()
{
    QList<NavTOCEntry> entries;
    if (!m_NavResource) {
        return entries;
    }

    QReadLocker locker(&m_NavResource->GetLock());

    GumboInterface gi = GumboInterface(m_NavResource->GetText(), "3.0");
    gi.parse();

    const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
    for (int n = 0; n < nav_nodes.size(); ++n) {
        GumboNode* nav = nav_nodes.at(n);
        GumboAttribute* nav_type = gumbo_get_attribute(&nav->v.element.attributes, "epub:type");
        const bool is_toc = nav_type && (QString::fromUtf8(nav_type->value) == "toc");
        if (!is_toc) {
            continue;
        }

        QList<GumboTag> list_tags;
        list_tags << GUMBO_TAG_OL;
        const QList<GumboNode*> lists = gi.get_nodes_with_tags(nav, list_tags);
        for (int k = 0; k < lists.size(); ++k) {
            entries.append(GetNodeTOC(gi, lists.at(k), 1));
        }

        // Only the first toc nav is used.
        break;
    }
    return entries;
}
コード例 #5
0
ファイル: es_gumbo.c プロジェクト: Cy-4AH/showtime
/**
 * Recursively collects every element/template node whose class attribute
 * contains all of the requested class names.
 *
 * classes is a NULL-terminated array of names. Each matching node is pushed
 * with push_gumbo_node() and stored at index *idxp (which is then
 * incremented) of the object found at stack index -2. Children are always
 * visited, whether or not the node itself matched.
 */
static void
es_gumbo_find_by_class_r(GumboNode *node, char **classes, duk_context *ctx,
                         int *idxp, es_gumbo_output_t *ego)
{
  if(!(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE))
    return;

  const GumboElement *elem = &node->v.element;
  GumboAttribute *cls = gumbo_get_attribute(&elem->attributes, "class");

  if(cls != NULL) {
    char **have = strvec_split(cls->value, ' ');
    int all_present = 1;

    /* Stop at the first requested class the node does not carry. */
    for(int w = 0; all_present && classes[w] != NULL; w++) {
      int k = 0;
      while(have[k] != NULL && strcmp(have[k], classes[w]) != 0)
        k++;
      if(have[k] == NULL)
        all_present = 0;
    }

    if(all_present) {
      push_gumbo_node(ctx, node, ego);
      duk_put_prop_index(ctx, -2, (*idxp)++);
    }

    strvec_free(have);
  }

  for(int c = 0; c < elem->children.length; c++)
    es_gumbo_find_by_class_r(elem->children.data[c], classes, ctx, idxp, ego);
}
コード例 #6
0
ファイル: HTMLParser.cpp プロジェクト: lizardoluis/irSearch
/**
 * Recursively collects (url, anchor text) pairs for every <a href> element
 * in the subtree rooted at node.
 *
 * The URL is the href passed through treatLink() together with docURL;
 * links for which treatLink() returns an empty string are dropped. The
 * anchor text is the text of the element's first child when that child is a
 * text-like node, and empty otherwise.
 *
 * @param node    subtree root; non-element nodes are ignored.
 * @param links   output list that receives the pairs.
 * @param docURL  URL of the document being parsed, forwarded to treatLink().
 */
void HTMLParser::extractLinks(GumboNode *node,
		list<pair<string, string> > &links, string &docURL) {

	if (node->type != GUMBO_NODE_ELEMENT) {
		return;
	}

	GumboAttribute* href;
	if (node->v.element.tag == GUMBO_TAG_A
			&& (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {

		string anchor;

		if (node->v.element.children.length > 0) {
			const GumboNode* first_child =
					static_cast<GumboNode*>(node->v.element.children.data[0]);
			// v.text is only the active union member for text-like nodes; an
			// element child (e.g. <img>) must not be read through it.
			const bool is_text = first_child->type == GUMBO_NODE_TEXT
					|| first_child->type == GUMBO_NODE_WHITESPACE
					|| first_child->type == GUMBO_NODE_CDATA;
			if (is_text) {
				anchor = first_child->v.text.text;
			}
		}

		string url = treatLink(docURL, href->value);
		if (!url.empty()) {
			links.push_back(make_pair(url, anchor));
		}
	}

	GumboVector* children = &node->v.element.children;
	for (size_t i = 0; i < children->length; ++i) {
		extractLinks(static_cast<GumboNode*>(children->data[i]), links, docURL);
	}
}
コード例 #7
0
ファイル: Client.cpp プロジェクト: ZhengYuanQing/NetSpider
// Recursively appends to vec the non-empty href value of every <a> and
// <link> element in the subtree rooted at node.
void Client::getlinks(GumboNode *node, std::vector<std::string>& vec)
{
	// Nodes that are not elements are skipped outright.
	if(node->type != GUMBO_NODE_ELEMENT) return;

	const GumboTag tag = node->v.element.tag;
	if(tag == GUMBO_TAG_A || tag == GUMBO_TAG_LINK)
	{
		GumboAttribute* href = gumbo_get_attribute(&node->v.element.attributes, "href");

		// Keep the link only when the attribute exists and is not "".
		if(href && href->value[0] != '\0')
		{
			vec.push_back(href->value);
		}
	}

	// Recurse into every child of this element.
	GumboVector *kids = &node->v.element.children;
	for(unsigned int k = 0; k < kids->length; ++k)
	{
		getlinks(static_cast<GumboNode*>(kids->data[k]), vec);
	}
}
コード例 #8
0
ファイル: htmlparser.cpp プロジェクト: damarbo33/httpCurl
/**
 * Recursively records every element in the subtree whose tag equals
 * tag->htmlTag.
 *
 * For each match an attrValue is appended to tag->tagElement. If the element
 * carries the attribute named tag->attr, only that attribute is stored;
 * otherwise all of the element's attributes are stored. The element's
 * content is filled via search_text() when isObtainContentTag() is true and
 * left empty otherwise.
 *
 * @param node  subtree root; non-element nodes are ignored.
 * @param tag   search criteria and output container.
 */
void HtmlParser::search_for_links(GumboNode* node, Tags *tag) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    if (node->v.element.tag == tag->htmlTag){
        attrValue attrElement;
        const GumboVector* attributes = &node->v.element.attributes;

        GumboAttribute* wanted = gumbo_get_attribute(attributes, tag->attr.c_str());
        if (wanted) {
            attrElement.attrList.insert(make_pair(wanted->name, wanted->value));
        } else {
            // The requested attribute is absent: collect every attribute instead.
            for (unsigned int i = 0; i < attributes->length; i++){
                GumboAttribute* other = static_cast<GumboAttribute*>(attributes->data[i]);
                attrElement.attrList.insert(make_pair(other->name, other->value));
            }
        }
        attrElement.content = isObtainContentTag() ? search_text(node, tag) : "";
        tag->tagElement.push_back(attrElement);
    }

    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        search_for_links(static_cast<GumboNode*>(children->data[i]), tag);
    }
}
コード例 #9
0
ファイル: wiki-registry.c プロジェクト: hkjels/clib-search
/**
 * Builds a package_t from a wiki <a> element that sits directly in an <li>.
 *
 * Returns NULL when the anchor has no href, when its parent is not an <li>
 * element, or when allocation fails. On success the package owns a copy of
 * the href, the repo derived from it by package_get_repo(), and, if the
 * <li> has a text child, a description taken from the first such child with
 * its first three characters skipped. All other fields are zeroed.
 */
static package_t *package_from_wiki_anchor(GumboNode *anchor) {
  GumboAttribute *href = gumbo_get_attribute(&anchor->v.element.attributes, "href");
  if (NULL == href) return NULL;

  // Validate the parent before allocating so the reject path has nothing to free.
  GumboNode *parent = anchor->parent;
  if (NULL == parent
      || GUMBO_NODE_ELEMENT != parent->type
      || GUMBO_TAG_LI != parent->v.element.tag) {
    return NULL;
  }

  // calloc leaves description/category (and any other field) as NULL/zero.
  package_t *pkg = calloc(1, sizeof(package_t));
  if (NULL == pkg) return NULL;

  char *url = strdup(href->value);
  if (NULL == url) {
    free(pkg);
    return NULL;
  }
  pkg->href = url;
  pkg->repo = package_get_repo(url);

  GumboVector *children = &parent->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    GumboNode *child = children->data[i];
    if (GUMBO_NODE_TEXT == child->type) {
      // TODO support nested elements (<code>, <em>, etc.)
      char *description = strdup(child->v.text.text);
      if (description) {
        // NOTE(review): if substr() returns a fresh allocation, `description`
        // leaks here; confirm substr()'s ownership contract before freeing it.
        pkg->description = substr(description, 3, strlen(description) + 1);
      }
      break;
    }
  }
  return pkg;
}
コード例 #10
0
ファイル: htmlparser.cpp プロジェクト: damarbo33/httpCurl
/**
 * Depth-first search for the first element with tag tag->htmlTag that has an
 * attribute named `atributo` whose value equals `valAtributo`; returns
 * cleantext() of that element.
 *
 * If the element carries the attribute named tag->attr, only that attribute
 * is compared; otherwise every attribute of the element is checked.
 *
 * @return the cleaned text of the first match, or "" when nothing matches
 *         (a match whose cleaned text is empty is treated as no match by the
 *         recursion and the search continues).
 */
string HtmlParser::search_for_elem(GumboNode* node, Tags *tag, string atributo, string valAtributo) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return "";
    }

    if (node->v.element.tag == tag->htmlTag){
        GumboAttribute* attribute = gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());
        if (attribute) {
            if (valAtributo == attribute->value && atributo == attribute->name){
                return cleantext(node);
            }
        } else {
            // The attribute named by tag->attr is absent: check every attribute.
            for (unsigned int i = 0; i < node->v.element.attributes.length; i++){
                attribute = static_cast<GumboAttribute*>(node->v.element.attributes.data[i]);
                if (valAtributo == attribute->value && atributo == attribute->name){
                    return cleantext(node);
                }
            }
        }
    }

    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        string ret = search_for_elem(static_cast<GumboNode*>(children->data[i]), tag, atributo, valAtributo);
        if (!ret.empty()){
            return ret;
        }
    }
    return "";
}
コード例 #11
0
ファイル: wiki-registry.c プロジェクト: hkjels/clib-search
/*
 * Recursive worker for wiki_registry_iterate_nodes().
 *
 * `initial` is the category to use until the first <a name="..."> is seen.
 * `*current` is the most recently seen category (heap-allocated, or NULL if
 * none yet). It is shared by pointer across the whole recursion, so an
 * update made deep in the tree is seen by later siblings and ancestors.
 */
static void wiki_registry_iterate_nodes_r(GumboNode *node, list_t *packages,
                                          const char *initial, char **current) {
  if (node->type != GUMBO_NODE_ELEMENT) return;

  if (node->v.element.tag == GUMBO_TAG_A) {
    GumboAttribute *name = gumbo_get_attribute(&node->v.element.attributes, "name");
    if (name) {
      // set the current category for the packages that follow
      size_t len = strlen(name->value);
      char *grown = realloc(*current, len + 1);
      if (NULL == grown) return; // out of memory: keep the previous category
      memcpy(grown, name->value, len + 1);
      *current = grown;
    } else {
      package_t *pkg = package_from_wiki_anchor(node);
      if (pkg) {
        const char *cat = *current ? *current : initial;
        pkg->category = cat ? strdup(cat) : NULL;
        list_rpush(packages, list_node_new(pkg));
      }
    }
    return;
  }

  GumboVector *children = &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    wiki_registry_iterate_nodes_r(children->data[i], packages, initial, current);
  }
}

/*
 * Walks the subtree rooted at `node` and appends one package to `packages`
 * for every <a> without a name attribute that package_from_wiki_anchor()
 * accepts. Each <a name="..."> sets the category assigned to the packages
 * that follow it; `category` supplies the category used before the first
 * such anchor.
 *
 * The caller's `category` buffer is only read, never reallocated or freed.
 * The previous version called realloc() on its by-value copy of the pointer,
 * which left every caller frame with a dangling pointer whenever the block
 * moved.
 */
static void wiki_registry_iterate_nodes(GumboNode *node, list_t *packages, char *category) {
  char *current = NULL;
  wiki_registry_iterate_nodes_r(node, packages, category, &current);
  free(current);
}
コード例 #12
0
ファイル: facebook.cpp プロジェクト: imclab/facebook-tunnel
void FacebookClient::extractLinks(GumboNode* node) {

  if (node->type != GUMBO_NODE_ELEMENT) {
    return;
  }

  if (node->v.element.tag == GUMBO_TAG_A ) {
    GumboAttribute* hrefAttr;
    hrefAttr = gumbo_get_attribute( &node->v.element.attributes, "href" );

    std::string valueStr = std::string( hrefAttr->value );
    std::size_t match = valueStr.find( "messages/thread" );

    if( match == 1 ) {

      std::stringstream ss( valueStr );

      int index = 0;

//double xx = 100001930423169;

      while( std::getline( ss, valueStr, '/' ) ) {
        std::istringstream iss( valueStr );
        if( index == 3 ) {
          std::istringstream iss( valueStr );
          iss >> this->friendID;
          break;
        };
        index++;
      };
コード例 #13
0
	/**
	 * Parses a <div class="forum-post-entry"> element by dispatching each of
	 * its element children to the parser that matches the child's class:
	 * forum-post-text, forum-post-attachments, forum-post-lastedit or
	 * forum-user-signature. Non-element children are skipped.
	 *
	 * Structural expectations (element children are divs with a known class,
	 * at most three of them) are checked with assert only.
	 *
	 * @param node  the div element to parse; must be an element node.
	 */
	void parseForumPostEntry(GumboNode* node)
	{
		assert(node->type == GUMBO_NODE_ELEMENT);
		assert(node->v.element.tag == GUMBO_TAG_DIV);

		GumboAttribute* classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
		assert(classAttr != NULL);
		assert(strCiCmp(classAttr->value, "forum-post-entry"));

		GumboVector* nodeChildren = &node->v.element.children;
		int fpeElementCount = 0;
		for (std::size_t i = 0; i < nodeChildren->length;i++)
		{
			GumboNode* divChildNode = static_cast<GumboNode*>(nodeChildren->data[i]);
			assert(divChildNode != NULL);
			if (divChildNode->type != GUMBO_NODE_ELEMENT) continue;

			fpeElementCount++;
			assert(divChildNode->v.element.tag == GUMBO_TAG_DIV);

			GumboAttribute* childClassAttr = gumbo_get_attribute(&divChildNode->v.element.attributes, "class");
			assert(childClassAttr != NULL);
			// A child without a class attribute cannot be dispatched; skip it
			// rather than dereference NULL when asserts are compiled out.
			if (childClassAttr == NULL) continue;

			const char* childClass = childClassAttr->value;
			if (strCiCmp(childClass, "forum-post-text"))
			{
				parseForumPostText(divChildNode);
			}
			else if (strCiCmp(childClass, "forum-post-attachments"))
			{
				parseForumPostAttachments(divChildNode);
			}
			else if (strCiCmp(childClass, "forum-post-lastedit"))
			{
				parseForumPostLastEdit(divChildNode);
			}
			else if (strCiCmp(childClass, "forum-user-signature"))
			{
				parseForumUserSignature(divChildNode);
			}
			else
			{
				assert(0);
			}
		}
		assert(fpeElementCount <= 3);
	}
コード例 #14
0
    /**
     * Looks up an attribute on the given node's element data.
     *
     * @param node           node to inspect; must not be null (asserted).
     * @param attributeName  name passed to gumbo_get_attribute().
     * @return the attribute's value, or an empty string when it is absent.
     */
    string ParserUtils::getAttribute(const GumboNode* node, const string& attributeName)
    {
        assert(node != nullptr);

        const GumboVector* attributes = &node->v.element.attributes;
        const GumboAttribute* found = gumbo_get_attribute(attributes, attributeName.c_str());
        if (found == nullptr)
        {
            return string();
        }
        return string(found->value);
    }
コード例 #15
0
ファイル: NavProcessor.cpp プロジェクト: PippaCarron/Sigil
/**
 * Binds the processor to a nav resource and makes sure that resource holds a
 * usable nav document.
 *
 * The resource is considered valid when it contains at least one <nav>
 * element whose epub:type attribute equals "toc". Otherwise its text is
 * replaced with a minimal skeleton (empty toc and landmarks navs) using the
 * default metadata language from SettingsStore.
 *
 * A null nav_resource is accepted and left untouched, matching GetTOC() and
 * GetLandmarks(), which both treat m_NavResource as nullable.
 */
NavProcessor::NavProcessor(HTMLResource * nav_resource)
  : m_NavResource(nav_resource)
{
    if (!m_NavResource) {
        return;
    }

    bool valid = false;
    {
        // The read lock is scoped so it is released before the write lock below.
        QReadLocker locker(&m_NavResource->GetLock());
        QString source = m_NavResource->GetText();
        GumboInterface gi = GumboInterface(source, "3.0");
        gi.parse();
        const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
        for (int i = 0; i < nav_nodes.length(); ++i) {
            GumboNode* node = nav_nodes.at(i);
            GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, "epub:type");
            if (attr && (QString::fromUtf8(attr->value) == "toc")) {
                // A toc nav exists, which also implies nav_nodes is not empty.
                valid = true;
                break;
            }
        }
    }

    if (!valid) {
        SettingsStore ss;
        QString lang = ss.defaultMetadataLang();
        QString newsource = 
            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            "<!DOCTYPE html>\n"
            "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" "
            "lang=\"%1\" xml:lang=\"%2\">\n"
            "<head>\n"
            "  <meta charset=\"utf-8\" />\n"
            "  <style type=\"text/css\">\n"
            "    nav#landmarks, nav#page-list { display:none; }\n"
            "    ol { list-style-type: none; }\n"
            "  </style>\n"
            "</head>\n"
            "<body epub:type=\"frontmatter\">\n"
            "  <nav epub:type=\"toc\" id=\"toc\">\n"
            "  </nav>\n"
            "  <nav epub:type=\"landmarks\" id=\"landmarks\" hidden=\"\">\n"
            "  </nav>\n"
            "</body>\n"
            "</html>";
        newsource = newsource.arg(lang).arg(lang);
        QWriteLocker locker(&m_NavResource->GetLock());
        m_NavResource->SetText(newsource);
    }
}
コード例 #16
0
ファイル: wiki-registry.c プロジェクト: hkjels/clib-search
/*
 * Depth-first search for the element with id="wiki-body". When found, its
 * subtree is handed to wiki_registry_iterate_nodes() to fill `packages`, and
 * the search stops for that branch. Non-element nodes are ignored.
 */
static void wiki_registry_find_body(GumboNode* node, list_t *packages) {
  if (node->type != GUMBO_NODE_ELEMENT) return;

  GumboAttribute *id = gumbo_get_attribute(&node->v.element.attributes, "id");
  if (id && 0 == strcmp("wiki-body", id->value)) {
    // Temp category buffer, populated later by the iterator. It must hold a
    // valid (empty) C string from the start: a package anchor that appears
    // before any category anchor causes the buffer to be read with strdup().
    char *category = malloc(1);
    if (NULL == category) return;
    category[0] = '\0';
    // NOTE(review): the buffer is not freed here because the iterator has
    // historically reallocated it through its own copy of the pointer;
    // settle the ownership contract before adding a free().
    wiki_registry_iterate_nodes(node, packages, category);
    return;
  }

  GumboVector* children = &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    wiki_registry_find_body(children->data[i], packages);
  }
}
コード例 #17
0
ファイル: GumboInterface.cpp プロジェクト: CedarLogic/Sigil
/**
 * Returns, in document (pre-order) order, every element in the subtree
 * rooted at node that carries an attribute named attname. Non-element nodes
 * contribute nothing.
 */
QList<GumboNode*> GumboInterface::get_nodes_with_attribute(GumboNode* node, const char * attname)
{
  QList<GumboNode*> matches;
  if (node->type == GUMBO_NODE_ELEMENT) {
    if (gumbo_get_attribute(&node->v.element.attributes, attname)) {
      matches.append(node);
    }

    GumboVector* kids = &node->v.element.children;
    for (unsigned int k = 0; k < kids->length; ++k) {
      GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
      matches.append(get_nodes_with_attribute(kid, attname));
    }
  }
  return matches;
}
コード例 #18
0
ファイル: GumboInterface.cpp プロジェクト: CedarLogic/Sigil
/**
 * Returns, in document (pre-order) order, the value of the attribute
 * attr_name for every element in the subtree rooted at node that has it.
 * Values are decoded from UTF-8; non-element nodes contribute nothing.
 */
QStringList  GumboInterface::get_values_for_attr(GumboNode* node, const char* attr_name) 
{
    QStringList values;
    if (node->type == GUMBO_NODE_ELEMENT) {
        GumboAttribute* own = gumbo_get_attribute(&node->v.element.attributes, attr_name);
        if (own) {
            values.append(QString::fromUtf8(own->value));
        }

        GumboVector* kids = &node->v.element.children;
        for (unsigned int k = 0; k < kids->length; ++k) {
            GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
            values.append(get_values_for_attr(kid, attr_name));
        }
    }
    return values;
}
コード例 #19
0
ファイル: es_gumbo.c プロジェクト: Cy-4AH/showtime
/**
 * Depth-first search for the first element/template node whose id attribute
 * equals `id` (compared with strcmp). Returns the node itself when it
 * matches, otherwise the first match among its descendants, or NULL.
 */
static GumboNode *
es_gumbo_find_by_id_r(GumboNode *node, const char *id)
{
  if(!(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE))
    return NULL;

  const GumboElement *elem = &node->v.element;
  GumboAttribute *id_attr = gumbo_get_attribute(&elem->attributes, "id");

  if(id_attr != NULL && strcmp(id_attr->value, id) == 0)
    return node;

  /* Scan the children in order and stop at the first hit. */
  GumboNode *hit = NULL;
  for(int c = 0; hit == NULL && c < elem->children.length; c++)
    hit = es_gumbo_find_by_id_r(elem->children.data[c], id);

  return hit;
}
コード例 #20
0
ファイル: GumboInterface.cpp プロジェクト: CedarLogic/Sigil
/**
 * Collects property names for the subtree rooted at node, in pre-order.
 *
 * An element contributes its own tag name when that name is a member of
 * manifest_properties, and "remote-resources" when it has a src attribute
 * whose URL is not relative. Non-element nodes contribute nothing. The
 * result may contain duplicates.
 */
QStringList GumboInterface::get_properties(GumboNode* node)
{
    QStringList found;
    if (node->type != GUMBO_NODE_ELEMENT) {
        return found;
    }

    std::string tag = get_tag_name(node);
    if (in_set(manifest_properties, tag)) {
        found.append(QString::fromStdString(tag));
    }

    GumboAttribute* src = gumbo_get_attribute(&node->v.element.attributes, "src");
    if (src) {
        const QUrl target(QString::fromUtf8(src->value));
        if (!target.isRelative()) {
            found.append(QString("remote-resources"));
        }
    }

    GumboVector* kids = &node->v.element.children;
    for (unsigned int k = 0; k < kids->length; ++k) {
        GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
        found.append(get_properties(kid));
    }
    return found;
}
コード例 #21
0
ファイル: t2HTMLParser.cpp プロジェクト: 91yuan/TattyUI
    /**
     * Processes the direct children of the <head> element.
     *
     * A <title> sets the window title; it must have exactly one child, which
     * must be a text or whitespace node, otherwise an error is printed and
     * processing stops. A <link> contributes its href to cssFilePaths; a
     * <link> without href is reported and skipped.
     *
     * @param h  the <head> element node.
     */
    void t2HTMLParser::t2LabelParser::head(GumboNode *h)
    {
        GumboVector *headChildren = &h->v.element.children;

        for(unsigned int i = 0; i < headChildren->length; i++)
        {
            GumboNode *child = static_cast<GumboNode *>(headChildren->data[i]);
            if(child->type != GUMBO_NODE_ELEMENT)
                continue;

            // find title
            if(child->v.element.tag == GUMBO_TAG_TITLE)
            {
                if(child->v.element.children.length != 1)
                {
                    t2PrintError("<empty title>");
                    return;
                }

                // The title's only child must be plain text (or whitespace).
                GumboNode *title = static_cast<GumboNode *>(child->v.element.children.data[0]);
                if(title->type != GUMBO_NODE_TEXT && title->type != GUMBO_NODE_WHITESPACE)
                {
                    t2PrintError("标题只可为空或文字");
                    return;
                }

                t2Window::getInstance()->window->setTitle(title->v.text.text);
            }
            // find css file
            else if(child->v.element.tag == GUMBO_TAG_LINK)
            {
                GumboAttribute *href = gumbo_get_attribute(&child->v.element.attributes, "href");
                if(!href)
                {
                    // Report and skip: there is no value to record, and
                    // dereferencing href here would be a null access.
                    t2PrintError("<link>缺失href属性");
                    continue;
                }

                cssFilePaths.push_back(href->value);
            }
        }
    }
コード例 #22
0
ファイル: get-element-by-id.c プロジェクト: 0x00A/clib
/**
 * Depth-first search for the first element whose id attribute equals `id`.
 *
 * `document` may be a document node or an element node; any other node type
 * (and a NULL `id` or `document`) yields NULL. Returns the matching node, or
 * NULL when no element in the subtree has that id.
 */
GumboNode *
gumbo_get_element_by_id(const char *id, GumboNode *document) {

  if (NULL == id || NULL == document) {
    return NULL;
  }

  GumboVector *children = NULL;

  if (GUMBO_NODE_DOCUMENT == document->type) {
    // A document node has no attributes; reading v.element on it would
    // access the wrong union member. Only its children are searched.
    children = &document->v.document.children;
  } else if (GUMBO_NODE_ELEMENT == document->type) {
    GumboAttribute *node_id =
      gumbo_get_attribute(&document->v.element.attributes, "id");
    if (node_id && 0 == strcmp(id, node_id->value)) {
      return document;
    }
    children = &document->v.element.children;
  } else {
    return NULL;
  }

  // iterate all children
  for (unsigned int i = 0; i < children->length; i++) {
    GumboNode *node = gumbo_get_element_by_id(id, children->data[i]);
    if (node) return node;
  }

  return NULL;
}
コード例 #23
0
bool SteamUserCrawler::run() {
	printf("Started Running User Crawler\n");
	
	gettimeofday(&start, NULL);
	// Connect To DB
	dbConn->connect();
	gettimeofday(&end, NULL);
	printf("DB Connected (time consumed : %ldms)\n", this->calTime());

	sql::Statement *stmt;
	sql::ResultSet *res;
	sql::PreparedStatement *pstmt;
	pstmt = dbConn->con->prepareStatement("INSERT INTO user(url, name, steamlv) VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE name=VALUES(name), steamlv=VALUES(steamlv);");   

	string page;
	
	string userName;
	int userLevel;

    string url = this->seedURL;
    if(url == "r") {
	    gettimeofday(&start, NULL);
		stmt = dbConn->con->createStatement();
		res = stmt->executeQuery("SELECT url FROM user WHERE name IS NULL ORDER BY RAND() LIMIT 1;");
		gettimeofday(&end, NULL);
		if(res->next()) {
			url = res->getString(1);
			printf("Getting Random Seed URL from DB Done (time consumed : %ldms)\n", this->calTime());
		}
		delete stmt;
		delete res;
    }
    
	while(url != "") {
		gettimeofday(&start, NULL);
		string page = curl->getPage(url);
		gettimeofday(&end, NULL);
		printf("Getting User Profile Page Done (time consumed : %ldms)\n", this->calTime());
		
		userName = "";
		userLevel = -1;

		// Getting User Name & User Steam Level
		if(page != "") {
			gettimeofday(&start, NULL);
			GumboOutput *output = gumbo_parse(page.c_str());
			gettimeofday(&end, NULL);
			printf("User Profile Page Parsing Done (time consumed : %ldms)\n", this->calTime());
			printf("Current URL [%s]\n", url.c_str());
			
			queue<GumboNode *> nodes;
			nodes.push(output->root);
			
			gettimeofday(&start, NULL);
			while(!nodes.empty() && (userName == "" || userLevel == -1)) {
				GumboNode *node = nodes.front();
				nodes.pop();
				
				if(node->type != GUMBO_NODE_ELEMENT) {
					continue;
				}
				
				GumboAttribute *attr;
	
				// User Name
				if((node->v.element.tag == GUMBO_TAG_DIV) &&
				(attr = gumbo_get_attribute(&node->v.element.attributes, "class")) &&
				(strcmp(attr->value, "persona_name") == 0)) {
					GumboVector *aChild = &node->v.element.children;
					for(size_t i = 0; i < aChild->length; i++) {
						GumboNode *aNode = static_cast<GumboNode *>(aChild->data[i]);
						if(aNode->type == GUMBO_NODE_TEXT) {
							userName = aNode->v.text.text;
							trim(userName);
						}
					}
					continue;
				}
	
				// User Steam Level
				if((node->v.element.tag == GUMBO_TAG_DIV) &&
				(attr = gumbo_get_attribute(&node->v.element.attributes, "class")) &&
				(strcmp(attr->value, "persona_name persona_level") == 0)) {
					GumboVector *aChild = &node->v.element.children;
					for(size_t i = 0; i < aChild->length; i++) {
						GumboNode *aNode = static_cast<GumboNode *>(aChild->data[i]);
						if((aNode->type == GUMBO_NODE_ELEMENT) &&
						(aNode->v.element.tag == GUMBO_TAG_DIV)) {
							GumboVector *bChild = &aNode->v.element.children;
							for(size_t j = 0; j < bChild->length; j++) {
								GumboNode *bNode = static_cast<GumboNode *>(bChild->data[j]);
								if((bNode->type == GUMBO_NODE_ELEMENT) &&
								(bNode->v.element.tag == GUMBO_TAG_SPAN)) {
									GumboVector *cChild = &bNode->v.element.children;
									for(size_t k = 0; k < cChild->length; k++) {
										GumboNode *cNode = static_cast<GumboNode *>(cChild->data[k]);
										if(cNode->type == GUMBO_NODE_TEXT) {
											stringstream ss;
											ss << cNode->v.text.text;
											ss >> userLevel;
										}
									}
								}
							}
						}
					}
コード例 #24
0
ファイル: Index.cpp プロジェクト: CedarLogic/Sigil
void Index::AddIndexIDsOneFile(HTMLResource *html_resource)
{
    QWriteLocker locker(&html_resource->GetLock());
    QString source = html_resource->GetText();
    QString version = html_resource->GetEpubVersion();
    GumboInterface gi = GumboInterface(source, version);
    QList<GumboNode*> nodes = XhtmlDoc::GetIDNodes(gi, gi.get_root_node());
    bool resource_updated = false;
    int index_id_number = 1;
    foreach(GumboNode * node, nodes) {
        QString index_id_value;

        // Get the text of all sub-nodes.
        QString text_node_text = XhtmlDoc::GetIDElementText(gi, node);
        // Convert &nbsp; to space since Index Editor unfortunately does the same.
        text_node_text.replace(QChar(160), " ");

        GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, "id");
        if (attr) {
            index_id_value = QString::fromUtf8(attr->value);
            if (index_id_value.startsWith(SIGIL_INDEX_ID_PREFIX)) {
                GumboElement* element = &node->v.element;
                gumbo_element_remove_attribute(element, attr);
                resource_updated = true;
            }
        }

        // If this node is a custom index entry make sure it gets included
        bool is_custom_index_entry = false;
        QString custom_index_value = text_node_text;

        attr = gumbo_get_attribute(&node->v.element.attributes, "class");
        if (attr) {
            QString class_names = QString::fromUtf8(attr->value);

            if (class_names.split(" ").contains(SIGIL_INDEX_CLASS)) {
                is_custom_index_entry = true;
                
                GumboAttribute* titleattr = gumbo_get_attribute(&node->v.element.attributes, "title");
                if (titleattr) {
                    QString title = QString::fromUtf8(titleattr->value);
                    if (!title.isEmpty()) {
                        custom_index_value = title;
                    }
                }
            }

        }

        // Use the existing id if there is one, else add one if node contains index item
        attr = gumbo_get_attribute(&node->v.element.attributes, "id");
        if (attr) {
            CreateIndexEntry(text_node_text, html_resource, index_id_value, is_custom_index_entry, custom_index_value);
        } else {
            index_id_value = SIGIL_INDEX_ID_PREFIX + QString::number(index_id_number);

            if (CreateIndexEntry(text_node_text, html_resource, index_id_value, is_custom_index_entry, custom_index_value)) {
                GumboElement* element = &node->v.element;
                gumbo_element_set_attribute(element, "id", index_id_value.toUtf8().constData()); 
                resource_updated = true;
                index_id_number++;
            }
        }
    }
コード例 #25
0
ファイル: Headings.cpp プロジェクト: CedarLogic/Sigil
/**
 * Builds the list of headings found in one HTML resource.
 *
 * The resource text is parsed with Gumbo and every node matching
 * GHEADING_TAGS becomes a Heading. The title attribute, when non-empty,
 * supplies the heading text; otherwise the node's local text is used. The
 * level is the digit at index 1 of the tag name. A heading is excluded from
 * the TOC when its class attribute contains either not-in-TOC class name,
 * and such headings are only returned when include_unwanted_headings is
 * true. at_file_start is set for the first heading node when it starts
 * fewer than ALLOWED_HEADING_DISTANCE lines after the <body> start line.
 */
QList<Headings::Heading> Headings::GetHeadingListForOneFile(HTMLResource *html_resource,
        bool include_unwanted_headings)
{
    Q_ASSERT(html_resource);
    QString source = html_resource->GetText();
    QString version = html_resource->GetEpubVersion();
    GumboInterface gi = GumboInterface(source, version);
    gi.parse();

    // Source line on which <body> starts (0 when there is no body element).
    unsigned int body_line = 0;
    QList<GumboNode*> body_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_BODY);
    if (!body_nodes.isEmpty()) {
        body_line = body_nodes.at(0)->v.element.start_pos.line;
    }

    QList<GumboNode*> heading_nodes = gi.get_all_nodes_with_tags(GHEADING_TAGS);
    const int heading_count = heading_nodes.count();
    QList<Headings::Heading> result;

    for (int idx = 0; idx < heading_count; ++idx) {
        GumboNode* node = heading_nodes.at(idx);

        Heading heading;
        heading.resource_file = html_resource;
        heading.path_to_node = gi.get_path_to_node(node);

        GumboAttribute* title_attr = gumbo_get_attribute(&node->v.element.attributes, "title");
        heading.title = title_attr ? QString::fromUtf8(title_attr->value) : QString();
        heading.orig_title = heading.title;

        if (heading.title.isEmpty()) {
            heading.text = gi.get_local_text_of_node(node);
        } else {
            heading.text = heading.title;
        }

        // The character at index 1 of the tag name is read as the level number.
        const QString tag_name = QString::fromStdString(gi.get_tag_name(node));
        heading.level = QString(tag_name.at(1)).toInt();
        heading.orig_level = heading.level;

        QString classes;
        GumboAttribute* class_attr = gumbo_get_attribute(&node->v.element.attributes, "class");
        if (class_attr) {
            classes = QString::fromUtf8(class_attr->value);
        }
        const bool excluded = classes.contains(SIGIL_NOT_IN_TOC_CLASS) ||
                              classes.contains(OLD_SIGIL_NOT_IN_TOC_CLASS);
        heading.include_in_toc = !excluded;

        unsigned int node_line = node->v.element.start_pos.line;
        heading.at_file_start = (idx == 0) && ((node_line - body_line) < ALLOWED_HEADING_DISTANCE);
        heading.is_changed = false;

        if (heading.include_in_toc || include_unwanted_headings) {
            result.append(heading);
        }
    }

    return result;
}
コード例 #26
0
ファイル: fetch_data.cpp プロジェクト: Jordi1990/NHLRooster
/**
 * Downloads the timetable page for class `t`, turns every lesson row of the
 * returned HTML table into a Lesson attached to `t`, and pushes the filled-in
 * class onto `classList`.
 *
 * The request is an HTTP POST whose body lists the weeks 1..weekAmount (looked
 * up in `weekAmountList` for the class' department/path) followed by the class
 * id, path and department.
 *
 * @param t            Class to fetch; taken by value, so lessons are added to
 *                     this copy and the copy is what gets pushed.
 * @param classList    Queue receiving `t` after a successful parse
 *                     (locked/unlocked around the push).
 * @param totalLessons Counter incremented once per lesson that is added.
 * @return false when the HTTP transfer fails (or no curl handle could be
 *         created); true otherwise. When the response holds no table body the
 *         function prints a FAIL line and returns true without pushing `t`
 *         for path "ttstud", and terminates the process with exit(1) for any
 *         other path.
 */
bool doFetchLesson(Class t, ThreadSafeQueue<Class> *classList, std::atomic<int> *totalLessons)
{
	std::string buffer;

	// Look up how many weeks must be requested for this department/path.
	weekAmountList.lock();
	int weekAmount = -1;
	for (int i = 0; i < weekAmountList.size(); i++)
	{
		if (weekAmountList.at(i).departmentStringId == t.departmentString() && weekAmountList.at(i).cpath == t.cpath())
		{
			weekAmount = weekAmountList.at(i).amountOfWeeks;
			break;
		}
	}
	if(weekAmount == -1)
		printf("ERROR: NOT FOUND: %s - %s\n", t.departmentString().c_str(), t.cpath().c_str());
	weekAmountList.unlock();

	// Build the POST body in a std::string. The previous code appended with
	// sprintf(buf, "%s...", buf, ...), which passes overlapping source and
	// destination (undefined behaviour) and could overrun its 1024-byte buffer.
	std::string postFields;
	for (int i = 1; i < weekAmount+1; i++)
	{
		postFields += "weken[]=";
		postFields += std::to_string(i);
		postFields += "&";
	}
	postFields += "sleutelveld=";
	postFields += t.classIdString().c_str();
	postFields += "&object=";
	postFields += t.cpath().c_str();
	postFields += "&filter=";
	postFields += t.departmentString().c_str();
	//printf("%s\n", postFields.c_str());

	CURL *curl = curl_easy_init();
	if (curl == NULL)
		return false; // no handle, nothing to clean up

	curl_easy_setopt(curl, CURLOPT_URL, "https://rooster.nhl.nl/1516/rooster.php");
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
	// libcurl does not copy POSTFIELDS data; postFields stays alive and
	// unmodified until curl_easy_perform() has returned.
	curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postFields.c_str());
	//curl_easy_setopt(curl, CURLOPT_REFERER, NHL_REFERER);
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT); // limit for the whole request
	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, CURL_CONNECT_TIMEOUT); // limit for the connect phase
	CURLcode res = curl_easy_perform(curl);
	//printf("------\n%s------\n\n", buffer.c_str());
	if (res != CURLE_OK)
	{
		//printf("[%s] Fail: %s, aborting program", currentDateTime().c_str(), curl_easy_strerror(res));
		curl_easy_cleanup(curl);
		//exit(1);
		return false;
	}

	GumboOutput* output = gumbo_parse(buffer.c_str());
	GumboNode* node = GetTBodyNode(output->root);
	if (node == NULL)
	{
		// Release the parse tree on this path too (it used to be leaked).
		gumbo_destroy_output(&kGumboDefaultOptions, output);
		if (t.cpath() == "ttstud"){
			printf("[%s] FAIL(%s-%s-%s)\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str());
			curl_easy_cleanup(curl);
		}
		else{
			printf("[%s] FAIL(%s-%s-%s), aborting program\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str());
			curl_easy_cleanup(curl);
			exit(1);
		}
		return true;
	}

	// Text of the cell at `offset` within `row`, or "" when the column is
	// unknown (-1) or the row has fewer cells than the header promised.
	auto cellText = [](GumboNode *row, int offset) -> std::string {
		const GumboVector &cells = row->v.element.children;
		if (offset < 0 || static_cast<unsigned int>(offset) >= cells.length)
			return std::string();
		GumboNode *cell = static_cast<GumboNode*>(cells.data[offset]);
		if (cell == NULL)
			return std::string();
		return GetTextFromElement(cell);
	};

	GumboVector* children = &node->v.element.children;
	int lessonAmount = 0;
	// Column positions, learned from the most recent "weekheader" row.
	int yearOffset = -1;
	int titleOffset = -1;
	int locationOffset = -1;
	int teacherOffset = -1;
	int typeOffset = -1;
	int commentOffset = -1;
	int endOffset = -1;
	// Date of the most recent "dagrij" row; lesson rows only carry times.
	bool haveDate = false;
	int day = 0;
	int month = 0;
	int year = 0;
	for (unsigned int i = 0; i < children->length; ++i) {
		GumboNode *node1 = static_cast<GumboNode*>(children->data[i]);
		// Only element nodes have a valid v.element; skip text/whitespace children.
		if (node1->type != GUMBO_NODE_ELEMENT || node1->v.element.tag != GUMBO_TAG_TR)
			continue;

		GumboAttribute *att = gumbo_get_attribute(&node1->v.element.attributes, "class");
		if (att)
		{
			//printf("TR CLASS: %s\n", att->value);
			std::string value = att->value;
			if (value == "datarij")
			{
				// A lesson row without a preceding date row cannot be dated.
				if (!haveDate)
					continue;

				std::string startDate = cellText(node1, 0);
				std::string endDate = cellText(node1, endOffset);
				std::string yearType = cellText(node1, yearOffset); // optional
				std::string teacher = cellText(node1, teacherOffset);
				std::string typeStr = cellText(node1, typeOffset);

				int startHour = 0;
				int startMinute = 0;
				sscanf(startDate.c_str(), "%02d:%02d", &startHour, &startMinute);
				char newStartDate[128];
				//YYYY-MM-DDTHH:MM:SS
				snprintf(newStartDate, sizeof(newStartDate), "%04d-%02d-%02dT%02d:%02d:00", year, month, day, startHour, startMinute);

				char newEndDate[128];
				if (endDate != "") {
					int endHour = 0;
					int endMinute = 0;
					sscanf(endDate.c_str(), "%02d:%02d", &endHour, &endMinute);
					//YYYY-MM-DDTHH:MM:SS
					snprintf(newEndDate, sizeof(newEndDate), "%04d-%02d-%02dT%02d:%02d:00", year, month, day, endHour, endMinute);
				}else
					newEndDate[0] = '\0';

				int weekNr = getWeekNrFromDate(newStartDate);

				// Calculate time difference to remove old lessons
				time_t timeStampThisWeek = getTimeStampFromDate(newStartDate);
				time_t timeStampCurrentWeek;
				time(&timeStampCurrentWeek);
				//printf("Current week: %d - Lesson week: %d\n", getCurrentWeekNumber(), weekNr);
				double diff = difftime(timeStampThisWeek, timeStampCurrentWeek);
				double weeks = diff / 604800; // seconds per week
				//printf("Time difference: %.f\n", weeks);
				if (weeks > -2)
				{ // ignore old lessons
					t.addLesson(shared_ptr<Lesson>(new Lesson(cellText(node1, titleOffset), cellText(node1, commentOffset), teacher, replaceAll(cellText(node1, locationOffset), "    ", ", "), newStartDate, newEndDate, weekNr, yearType, typeStr)));
					++*totalLessons;
				}
				lessonAmount++;
			}
			else if (value == "weekheader")
			{
				// A new header redefines every column, so forget all old
				// positions (endOffset used to be left stale here).
				yearOffset = -1;
				titleOffset = -1;
				locationOffset = -1;
				teacherOffset = -1;
				typeOffset = -1;
				commentOffset = -1;
				endOffset = -1;
				GumboVector* headerCells = &node1->v.element.children;
				for (unsigned int col = 0; col < headerCells->length; ++col)
				{
					GumboNode *child = static_cast<GumboNode*>(headerCells->data[col]);
					std::string text = GetTextFromElement(child);
					if (text == "jaar")
						yearOffset = col;
					else if (text == "activiteit")
						titleOffset = col;
					else if (text == "lokaal")
						locationOffset = col;
					else if (text == "docent(en)" || text == "klas(en)")
						teacherOffset = col;
					else if (text == "werkvorm")
						typeOffset = col;
					else if (text == "opmerkingen")
						commentOffset = col;
					else if (text == "eind")
						endOffset = col;
				}
			}
		}
		else
		{
			// Row without a class: a date row is recognised by its first cell
			// being <td class="dagrij"> whose first child is the date text.
			if (node1->v.element.children.length == 0)
				continue;
			GumboNode *td = static_cast<GumboNode*>(node1->v.element.children.data[0]);
			if (td->type != GUMBO_NODE_ELEMENT || td->v.element.tag != GUMBO_TAG_TD)
				continue;
			GumboAttribute *classAtt = gumbo_get_attribute(&td->v.element.attributes, "class");
			if (!classAtt)
				continue;
			std::string dayRow = classAtt->value;
			if (dayRow != "dagrij" || td->v.element.children.length == 0)
				continue;
			GumboNode *dateNode = static_cast<GumboNode*>(td->v.element.children.data[0]);
			if (dateNode->type != GUMBO_NODE_TEXT)
				continue; // v.text is only valid for text nodes
			std::string date = dateNode->v.text.text;

			// "<dayname> DD-MM-YYYY"; the day name is bounded to the buffer
			// size and the date is only adopted when every field parsed.
			char dayName[128] = "";
			int parsedDay = 0;
			int parsedMonth = 0;
			int parsedYear = 0;
			if (sscanf(date.c_str(), "%127s %02d-%02d-%04d", dayName, &parsedDay, &parsedMonth, &parsedYear) == 4)
			{
				day = parsedDay;
				month = parsedMonth;
				year = parsedYear;
				haveDate = true;
			}
			//printf("New day @ %s\n", date.c_str());
		}
	}
	//printf("Lessons: %d\n", lessonAmount);
	gumbo_destroy_output(&kGumboDefaultOptions, output);
	curl_easy_cleanup(curl);
	classList->lock();
	classList->push(t);
	classList->unlock();
	return true;

		/* New ICAL style as of semester starting at 01-09-2015 */
		/*
		icalcomponent *rootNode = icalparser_parse_string(cstr);
		icalcomponent *comp = icalcomponent_get_first_component(rootNode, ICAL_VEVENT_COMPONENT);
		//icalcomponent *zoneComp = icalcomponent_get_first_component(rootNode, ICAL_VTIMEZONE_COMPONENT);
		//icaltimezone *zone = icaltimezone_get_builtin_timezone("Europe/Amsterdam");
	
		//icalcomponent *next = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT);
		while (comp != NULL){
			//printf("%s\n", icalcomponent_as_ical_string(comp));
			std::string summary = icalcomponent_get_summary(comp);
			icaltimetype dtstart = icalcomponent_get_dtstart(comp);
			
			std::string starttime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtstart)));
			icaltimetype dtend = icalcomponent_get_dtend(comp);

			std::string endtime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtend)));
			const char *locationStr = icalcomponent_get_location(comp); // can be null
			std::string location = "";
			if (locationStr){
				location = locationStr;
				location = replaceAll(location, "    ", ", ");
			}
			std::string commentStr = (char *)icalcomponent_get_comment(comp);
			printf("Comment: %s\n", commentStr.c_str());
			char *comment = (char *)commentStr.c_str();
			char *line = strtok(comment, "\n");
			std::string docenten;
			while (line != NULL){
				if (strstr(line, "Docent(en): ")){
					line += 12; // 12 is length of Docent(en)
					int length = strlen(line);
					docenten = line;
				}
				line = strtok(NULL, "\n");
			}
			int weekNr = getWeekNrFromDate(starttime);
			//printf("%s - %s\n", summary.c_str(), starttime.c_str());
			if (weekNr > getCurrentWeekNumber() - 2) { // ignore old lessons
				t.addLesson(shared_ptr<Lesson>(new Lesson(summary, summary, docenten, location, starttime, endtime, weekNr)));
				printf("1Add Lesson(%s): %s - %s - %s - %s(%s)\n", t.className().c_str(), summary.c_str(), starttime.c_str(), endtime.c_str(), docenten.c_str(), commentStr.c_str());
			}
			icalcomponent_free(comp);
			comp = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT);
		}
		icalcomponent_free(comp);
		icalcomponent_free(rootNode);
		classList->lock();
		classList->push(t);
		classList->unlock();
	*/
		// delete our garbage
		//delete[] refererUrlBuffer;
		//delete[] urlBuffer;
		//delete[] cstr;
		//curl_free(classIdStringEscaped);
		//curl_easy_cleanup(curl);

		//return true;
		/* Old XML Style
		xml_document<> doc;
		doc.parse<0>(cstr);
		xml_node<> *pRoot = doc.first_node();
		if (pRoot == 0)
		{
			std::cout << "doFetchLesson() ERROR: Invalid rootnode" << std::endl;
			exit(1); // Immediately abort program as the document is unreadable
		}
		else if (pRoot != NULL)
		{
			pRoot = pRoot->first_node();
			if (pRoot == 0)
			{
				std::cout << "doFetchLesson() ERROR: Rootnode has an invalid first node" << std::endl;
				exit(1); // Immediately abort program as the document is unreadable
			}
		}
		for (xml_node<> *pNode = pRoot->first_node("item"); pNode; pNode = pNode->next_sibling())
		{
			std::string title = pNode->first_node("title") ? pNode->first_node("title")->value() : "";
			if (title.length() > 2)
				title = title.substr(title.find(": ") + 2, title.length());
			std::string description = pNode->first_node("description") ? pNode->first_node("description")->value() : "";
			std::string teacher = getStringBetween(" - ", " -", description, "([a-zA-Z,. ]+)");

			std::string location = pNode->first_node("ev:location") ? pNode->first_node("ev:location")->value() : "";
			location = trim(location);
			char *locationDecoded = curl_easy_unescape(curl, location.c_str(), 0, NULL);
			location = std::string(locationDecoded);
			curl_free(locationDecoded);
			std::string startdate = pNode->first_node("ev:startdate") ? pNode->first_node("ev:startdate")->value() : "";
			std::string enddate = pNode->first_node("ev:enddate") ? pNode->first_node("ev:enddate")->value() : "";
			int weekNr = getWeekNrFromDate(startdate);
			if (weekNr > getCurrentWeekNumber() - 2) // ignore old lessons
				t.addLesson(shared_ptr<Lesson>(new Lesson(title, description, teacher, location, startdate, enddate, weekNr)));
		}
		// ready to push back the class
		classList->lock();
		classList->push_back(t);
		classList->unlock();

		// delete our garbage
		delete[] refererUrlBuffer;
		delete[] urlBuffer;
		delete[] cstr;
		curl_free(classIdStringEscaped);
		curl_easy_cleanup(curl);
		
		return true;*/
}
コード例 #27
0
 bool ParserUtils::hasAttribute(const GumboNode* node, const string& attributeName)
 {
     assert(node != nullptr);
     return gumbo_get_attribute(&node->v.element.attributes, attributeName.c_str()) != nullptr;
 }
コード例 #28
0
ファイル: t2HTMLParser.cpp プロジェクト: 91yuan/TattyUI
    /**
     * Builds the t2Div hierarchy for the <body> node `b`.
     *
     * The body itself is registered as the root div (sized to the window).
     * Every <div> reachable through nested <div> children is then wrapped in a
     * t2Element, linked to its parent / first child / next sibling, and
     * registered with the t2DivController.
     *
     * NOTE(review): the t2Element wrappers created here are never deleted in
     * this function; confirm who owns them (and their `div`) before adding a
     * delete.
     */
    void t2HTMLParser::t2LabelParser::body(GumboNode *b)
    {
        static t3Queue<t2Element*> queue;
        t2Element* element = new t2Element(b);

        // Note: <body> is backed by a hidden div whose size matches the window
        string rootID = getRootDivGlobalID();
        t2DivController::getInstance()->addDiv(rootID, element->div);
        t2DivController::getInstance()->setRoot(rootID);
        element->div->normal.width = t2GetWindowWidth();
        element->div->normal.height = t2GetWindowHeight();
        element->div->hover = element->div->normal;
        element->div->active = element->div->normal;

        queue.push(element);
        for(;;)
        {
            t2Element* e;
            if(queue.isEmpty()) e = NULL;
            else e = queue.pop();

            if(!e)
                break;

            GumboNode* child = e->node;
            // find div
            if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_DIV)
            {
                t2Div *div = e->div;

                // find className; a <div> without a class attribute yields NULL,
                // which used to be dereferenced unconditionally
                GumboAttribute *className = gumbo_get_attribute(&child->v.element.attributes, "class");
                if(className)
                    div->className = className->value;

                // text: taken from the first child, if the div has any children
                if(child->v.element.children.length > 0)
                {
                    GumboNode *content = (GumboNode *) child->v.element.children.data[0];
                    if(content->type == GUMBO_NODE_TEXT || content->type == GUMBO_NODE_WHITESPACE)
                    {
                        div->normal.text = content->v.text.text;
                        div->hover.text = content->v.text.text;
                        div->active.text = content->v.text.text;

                        div->normalCondition.text = content->v.text.text;
                        div->hoverCondition.text = content->v.text.text;
                        div->activeCondition.text = content->v.text.text;
                    }
                }

                // find id
                // not supported
                // NOTE(review): when an id attribute exists its value replaces
                // className and the div is registered under an empty id. That
                // looks unintended, but it is kept as-is because existing
                // content may depend on it -- confirm before changing.
                string id;
                GumboAttribute *idName = gumbo_get_attribute(&child->v.element.attributes, "id");
                if(idName)
                    div->className = idName->value;
                else
                    id = getDivGlobalID();

                // register the new div
                t2DivController::getInstance()->addDiv(id, div);
            }

            t2Div *brother = NULL;
            // enqueue every child <div>; these are siblings of one another
            for(unsigned int i = 0; i < child->v.element.children.length; i++)
            {
                GumboNode *c = (GumboNode *) child->v.element.children.data[i];
                if(c->type == GUMBO_NODE_ELEMENT &&
                    c->v.element.tag == GUMBO_TAG_DIV)
                {
                    // mirror the n-ary DOM tree into the div tree
                    t2Element* childElement = new t2Element(c);
                    // first sibling
                    if(!brother)
                    {
                        // becomes the parent's child link
                        e->div->child = childElement->div;
                        childElement->div->parent = e->div;
                    }
                    else
                    {
                        // later siblings: set the parent and chain after the previous one
                        childElement->div->parent = e->div;
                        brother->next = childElement->div;
                    }

                    // remember the most recent sibling
                    brother = childElement->div;
                    queue.push(childElement);
                }
            }
        }
    }
コード例 #29
0
// TODO: case-insensitive comparison
// Walks the element tree below `node` depth-first. For every
// <div class="forum-post-entry"> it hands the node to
// ForumPageParser::parseForumPostEntry and prints a separator line; the walk
// then continues into the children of every element, matching or not.
static void searchForDivBlocks(GumboNode* node)
{
	if (node->type != GUMBO_NODE_ELEMENT) return;

	// The class attribute is only looked up on <div> elements.
	GumboAttribute* classAttr = NULL;
	if (node->v.element.tag == GUMBO_TAG_DIV)
	{
		classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
	}

	// <div class="forum-post-entry">
	const bool isForumPostEntry = (classAttr != NULL) && (std::string(classAttr->value) == "forum-post-entry");
	if (isForumPostEntry)
	{
		// <div class="forum-post-text" id="message_text_4453758">
		GumboNode* fpeNode = node;

		ForumPageParser fpp;
		fpp.parseForumPostEntry(fpeNode);

		// The loop body is compiled out; it only holds disabled debugging code.
		for (unsigned int i = 0; i < fpeNode->v.element.children.length; ++i)
		{
#if 0
				GumboNode* fpeChildNode = static_cast< GumboNode* >(fpeNode->v.element.children.data[i]);
				if (fpeChildNode->type != GUMBO_NODE_ELEMENT) continue;
				assert(fpeChildNode->v.element.tag == GUMBO_TAG_DIV);
				if (fpeChildNode->v.element.tag != GUMBO_TAG_DIV) continue;

				std::cout << "    FPE children tags:" << gumbo_normalized_tagname(fpeChildNode->v.element.tag) << std::endl;
				for (int iAttr = 0; iAttr < fpeChildNode->v.element.attributes.length;++iAttr)
				{
					GumboAttribute* attr = static_cast< GumboAttribute* >( fpeChildNode->v.element.attributes.data[iAttr] );
					assert(attr != NULL);
					std::cout << "ATTR: " << attr->name << " = " << attr->value << std::endl;
				}

				if (fpeChildNode->v.element.tag == GUMBO_TAG_DIV)
				{
					GumboAttribute* fpeChildNodeClassAttr = gumbo_get_attribute(&fpeChildNode->v.element.attributes, "class");
					assert(fpeChildNodeClassAttr != NULL);
					std::string fpeChildNodeClassAttrStr(fpeChildNodeClassAttr->value);
					if (fpeChildNodeClassAttrStr == "forum-post-text")
					{
						GumboNode* fptNode = fpeChildNode;
						assert(fptNode->type == GUMBO_NODE_ELEMENT);
//                       qDebug() << "  forum-post-text children count: " << fptNode->v.element.children.length;

						GumboAttribute* fptNodeIdAttr = gumbo_get_attribute(&fptNode->v.element.attributes, "id");
						assert(fptNodeIdAttr != NULL);

						//std::cout << "    Forum post found: " << fptNodeIdAttr->value << std::endl;

						// FIXME: remove after debugging
						if (std::strcmp(fptNodeIdAttr->value, "message_text_4453758") != 0) continue;

						GumboVector* fptNodeChildren = &fptNode->v.element.children;
						for (unsigned int j = 0; j < fptNodeChildren->length; ++j)
						{
							GumboNode* fptNodeChild = static_cast< GumboNode* >(fptNodeChildren->data[j]);
							std::cout << "Element type: " << gumboElementTypeToString( fptNodeChild->type ) << std::endl;

							if (fptNodeChild->type == GUMBO_NODE_TEXT)
							{
								/*std::locale loc(std::locale(), new std::codecvt_utf8<char>);
								std::cout.imbue(loc);*/
								std::string TEST(fptNodeChild->v.text.text);

								WCHAR wBuf[1024] = { 0 };
								int res2 = MultiByteToWideChar(CP_UTF8, 0, fptNodeChild->v.text.text, -1, wBuf, 1024);
								char mbcsBuf[1024] = { 0 };
								int res3 = WideCharToMultiByte(1251, 0, wBuf, 1024, mbcsBuf, 1024, NULL, NULL);
								std::string TEST_2(mbcsBuf);
								std::cout << "Element text: " << TEST_2 << std::endl;

								break;
							}
							else if (fptNodeChild->type == GUMBO_NODE_ELEMENT)
							{
								std::cout << "    " << gumbo_normalized_tagname(fptNodeChild->v.element.tag) << std::endl;
								//qDebug() << "     " << gumbo_tag_from_original_text( &fptNodeChild->v.element.original_tag );
							}
						}
					}
				}
#endif
		}

		std::cout << "----------------------------" << std::endl;
	}

	// Recurse into every child; non-element children return immediately.
	const GumboVector& kids = node->v.element.children;
	for (unsigned int k = 0; k != kids.length; ++k)
	{
		GumboNode* kid = static_cast< GumboNode* >(kids.data[k]);
		searchForDivBlocks(kid);
	}
}
コード例 #30
0
ファイル: htmlparser.cpp プロジェクト: damarbo33/httpCurl
// Recursively scans the tree under `node` and appends attribute values to
// listUrlInfoSong.
//
// An element whose tag equals tag->htmlTag is examined in one of two ways:
//  * if it has the attribute named tag->attr, that attribute must have the
//    value `valAtributo` and (be named `atributo` or `valAtributo` be empty);
//  * otherwise each of its attributes is tried in order, matching on name
//    `atributo` and (value `valAtributo` or `valAtributo` empty).
// On a match the attribute value is appended to listUrlInfoSong when the
// attribute is named `atributoElem2`, and the scan continues into the FIRST
// child only, now searching for tag2 / atributoElem2 with any value.
//
// NOTE(review): every return statement yields NULL or the result of a
// recursive call, so the function appears to return NULL always; its visible
// effect is the appends to listUrlInfoSong. The "first child only" descent
// mirrors the original `for (...) { return ...; }` loops -- confirm whether
// that was intended.
GumboNode* HtmlParser::search_for_elem2(GumboNode* node, Tags *tag, string atributo, string valAtributo,
                                        Tags *tag2, string atributoElem2)
{
    if (node->type != GUMBO_NODE_ELEMENT) {
        return NULL;
    }

    GumboVector* const kids = &node->v.element.children;
    const bool anyValue = valAtributo.empty();

    if (node->v.element.tag == tag->htmlTag) {
        GumboAttribute* named = gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());

        if (named != NULL) {
            // The element carries the attribute configured on `tag`.
            const string attrName(named->name);
            const string attrValue(named->value);

            if (attrValue == valAtributo && (attrName == atributo || anyValue)) {
                if (attrName == atributoElem2) {
                    listUrlInfoSong.push_back(named->value);
                }
                if (kids->length > 0) {
                    GumboNode* firstKid = static_cast<GumboNode*>(kids->data[0]);
                    return search_for_elem2(firstKid, tag2, atributoElem2, "", tag2, atributoElem2);
                }
            }
        } else {
            // No such attribute: walk every attribute of the element instead.
            const GumboVector& attrs = node->v.element.attributes;
            for (unsigned int a = 0; a < attrs.length; ++a) {
                GumboAttribute* candidate = static_cast<GumboAttribute*>(attrs.data[a]);
                const string attrName(candidate->name);
                const string attrValue(candidate->value);

                if (attrName != atributo) {
                    continue;
                }
                if (attrValue != valAtributo && !anyValue) {
                    continue;
                }

                if (attrName == atributoElem2) {
                    listUrlInfoSong.push_back(candidate->value);
                }
                if (kids->length > 0) {
                    GumboNode* firstKid = static_cast<GumboNode*>(kids->data[0]);
                    return search_for_elem2(firstKid, tag2, atributoElem2, "", tag2, atributoElem2);
                }
            }
        }
    }

    // Nothing returned above: keep searching every child with the same criteria.
    for (unsigned int k = 0; k < kids->length; ++k) {
        GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
        GumboNode* found = search_for_elem2(kid, tag, atributo, valAtributo, tag2, atributoElem2);
        if (found != NULL) {
            return found;
        }
    }

    return NULL;
}