/*
 * Store the tag name of `element` under the key "tag" in the Lua table that
 * is on top of the stack when this function is called.
 *
 * Name selection:
 *   - SVG elements: the original tag text is run through
 *     gumbo_normalize_svg_tagname(); when that yields a replacement it is
 *     handed to add_string() and the function returns immediately.
 *   - Unknown tags: the original tag text is copied with ASCII 'A'..'Z'
 *     folded to lower case.
 *   - Everything else: the name reported by gumbo_normalized_tagname().
 */
static void add_tag(lua_State *L, const GumboElement *element) {
    if (element->tag_namespace == GUMBO_NAMESPACE_SVG) {
        GumboStringPiece svg_piece = element->original_tag;
        gumbo_tag_from_original_text(&svg_piece);
        const char *svg_name = gumbo_normalize_svg_tagname(&svg_piece);
        if (svg_name) {
            add_string(L, "tag", svg_name);
            return;
        }
        /* No SVG replacement found: fall through to the generic handling. */
    }

    if (element->tag != GUMBO_TAG_UNKNOWN) {
        lua_pushstring(L, gumbo_normalized_tagname(element->tag));
    } else {
        GumboStringPiece piece = element->original_tag;
        gumbo_tag_from_original_text(&piece);

        luaL_Buffer buf;
        luaL_buffinit(L, &buf);
        for (size_t pos = 0; pos < piece.length; ++pos) {
            char ch = piece.data[pos];
            if (ch >= 'A' && ch <= 'Z') {
                /* ASCII-only lower-casing: 'a' - 'A' == 32. */
                ch = (char)(ch + 32);
            }
            luaL_addchar(&buf, ch);
        }
        luaL_pushresult(&buf);
    }

    /* The name was pushed above, so the target table now sits at index -2. */
    lua_setfield(L, -2, "tag");
}
/// Returns the normalized tag name of the wrapped node, or an empty string
/// when the node is not an element.
std::string CNode::tag()
{
    const bool isElement = (mpNode->type == GUMBO_NODE_ELEMENT);
    if (isElement)
    {
        const char* tagName = gumbo_normalized_tagname(mpNode->v.element.tag);
        return std::string(tagName);
    }
    return std::string();
}
static void print_tag_stack(const GumboParserError* error, GumboStringBuffer* output) { print_message(output, " Currently open tags: "); for (int i = 0; i < error->tag_stack.length; ++i) { if (i) { print_message(output, ", "); } GumboTag tag = (GumboTag) error->tag_stack.data[i]; print_message(output, gumbo_normalized_tagname(tag)); } gumbo_string_buffer_append_codepoint('.', output); }
static void print_tag_stack(GumboParser* parser, const GumboParserError* error, GumboStringBuffer* output) { print_message(parser, output, " Currently open tags: "); for (unsigned int i = 0; i < error->tag_stack.length; ++i) { if (i) { print_message(parser, output, ", "); } unsigned int tmp = (unsigned int) (error->tag_stack.data[i]); GumboTag tag = (GumboTag)tmp; print_message(parser, output, gumbo_normalized_tagname(tag)); } gumbo_string_buffer_append_codepoint(parser, '.', output); }
// Prints the contents of one forum post held in a
// <div class="forum-post-text" id="message_text_NNN"> element.
//
// The numeric part of the id is extracted and logged. Direct text children
// are converted with utf8ToCp1251() and concatenated; element children are
// only reported (tag name and child count); every other node type is reported
// as ignored. The accumulated text is written to std::cout at the end.
//
// node: must be a <div> element with the class and id described above.
void parseForumPostText(GumboNode* node)
{
    assert(node->type == GUMBO_NODE_ELEMENT);
    assert(node->v.element.tag == GUMBO_TAG_DIV);

    GumboAttribute* classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
    assert(classAttr != NULL);
    assert(strCiCmp(classAttr->value, "forum-post-text"));

    GumboAttribute* idAttr = gumbo_get_attribute(&node->v.element.attributes, "id");
    assert(idAttr != NULL);
    // assert() compiles away under NDEBUG; do not dereference a missing attribute.
    if (idAttr == NULL) return;

    const std::string idPrefix = "message_text_";
    std::string idStr(idAttr->value);
    std::size_t messageTextStrIndex = idStr.find(idPrefix);
    assert(messageTextStrIndex != std::string::npos);
    if (messageTextStrIndex == std::string::npos) return;

    // Skip the prefix starting from where it was actually found, rather than
    // assuming it always sits at offset 0.
    std::string messageIdStr = idStr.substr(messageTextStrIndex + idPrefix.size());
    std::cout << "Forum post found, id: " << messageIdStr << std::endl;

    // FIXME: remove after debug completion
    if (!strCiCmp(messageIdStr, "4453758"))return;

    std::string forumPostText = "";

    GumboVector* nodeChildren = &node->v.element.children;
    for (std::size_t i = 0; i < nodeChildren->length; i++)
    {
        GumboNode* childNode = static_cast<GumboNode*>(nodeChildren->data[i]);
        assert(childNode != NULL);

        if (childNode->type == GUMBO_NODE_ELEMENT)
        {
            // FIXME: parse message quotes (<table> tags)
            std::cout << "Tag found: " << gumbo_normalized_tagname(childNode->v.element.tag);
            std::cout << ", children: " << childNode->v.element.children.length << std::endl;
        }
        else if (childNode->type == GUMBO_NODE_TEXT)
        {
            std::string elementTextUtf8(childNode->v.text.text);
            std::string elementTextCp1251 = "";
            utf8ToCp1251(elementTextCp1251, elementTextUtf8);
            forumPostText += elementTextCp1251;
            // std::cout << "Forum message text: " << elementTextCp1251 << std::endl;
        }
        else std::cout << "Ignoring \"" << gumboElementTypeToString(childNode->type) << "\" node..." << std::endl;

        // classAttr = gumbo_get_attribute(&childNode->v.element.attributes, "class");
    }

    std::cout << "Forum post: " << std::endl << forumPostText << std::endl;
}
// Builds a newly allocated GString holding the tag name of `node`.
//
// Document nodes have no proper name, so the literal "document" is used.
// For any other node the normalized tag name is used; when that name is
// empty the result comes from handle_unknown_tag() applied to the element's
// original tag text.
static GString *get_tag_name(GumboNode *node) {
    const gchar *normalized;

    if (node->type == GUMBO_NODE_DOCUMENT) {
        return g_string_new("document");
    }

    normalized = gumbo_normalized_tagname(node->v.element.tag);

    // g_string_new() turns both NULL and "" into a zero-length string, so both
    // are treated as "no known name" here.
    if (!normalized || normalized[0] == '\0') {
        return handle_unknown_tag(&node->v.element.original_tag);
    }

    return g_string_new(normalized);
}
/*
 * Duktape binding: pushes a name string for the native Gumbo node passed as
 * argument 0 and returns 1 (one value left on the stack).
 *
 *   - document node: the document's name
 *   - element node:  the normalized tag name
 *   - anything else: the node's text
 */
static int es_gumbo_node_name(duk_context *ctx) {
  es_gumbo_node_t *egn = es_get_native_obj(ctx, 0, &es_native_gumbo_node);
  const GumboNode *node = egn->node;
  const char *name;

  if (node->type == GUMBO_NODE_DOCUMENT) {
    name = node->v.document.name;
  } else if (node->type == GUMBO_NODE_ELEMENT) {
    name = gumbo_normalized_tagname(node->v.element.tag);
  } else {
    /* NOTE(review): every remaining node type is read through v.text;
       confirm no non-text node types can reach this branch. */
    name = node->v.text.text;
  }

  duk_push_string(ctx, name);
  return 1;
}
static const void findTag(const GumboNode* root, int layer){ //layer代表层数 //用于递归结束条件 if(root->type != GUMBO_NODE_ELEMENT){ return; } //while的功能是缩进格式 int indent = layer; while(indent--){ printf(" "); } //获取标签名 const char* tag_name = gumbo_normalized_tagname(root->v.element.tag); printf("%d--------%s\n", layer, tag_name); // const GumboVector* rchildren = &root->v.element.children; for(unsigned int i = 0; i < rchildren->length; ++i){ GumboNode* child = rchildren->data[i]; findTag(child, layer+1); } }
/*
 * Fill in the naming fields for `element` on the Lua table that is on top of
 * the stack when this function is called.
 *
 *   - "namespace" is set to "svg" or "math" for SVG / MathML elements and
 *     left untouched otherwise.
 *   - "localName" is, in order of preference: the SVG-normalized original tag
 *     (SVG elements only), the lower-cased original tag text (unknown tags),
 *     or the name from gumbo_normalized_tagname().
 */
static void set_tag(lua_State *L, const GumboElement *element) {
    switch (element->tag_namespace) {
    case GUMBO_NAMESPACE_SVG: {
        set_literal(L, "namespace", "svg");
        GumboStringPiece svg_piece = element->original_tag;
        gumbo_tag_from_original_text(&svg_piece);
        const char *svg_name = gumbo_normalize_svg_tagname(&svg_piece);
        if (svg_name) {
            set_string(L, "localName", svg_name);
            return;
        }
        /* No SVG replacement found: use the generic handling below. */
        break;
    }
    case GUMBO_NAMESPACE_MATHML:
        set_literal(L, "namespace", "math");
        break;
    default:
        break;
    }

    if (element->tag != GUMBO_TAG_UNKNOWN) {
        lua_pushstring(L, gumbo_normalized_tagname(element->tag));
    } else {
        GumboStringPiece piece = element->original_tag;
        gumbo_tag_from_original_text(&piece);
        pushstring_lower(L, piece.data, piece.length);
    }

    /* The name was pushed above, so the target table now sits at index -2. */
    lua_setfield(L, -2, "localName");
}
// Returns a display name for `node`.
//
// Non-element nodes get a fixed pseudo-name: "#document", "#text" (text and
// whitespace), "#cdata" or "#comment". These node types do not populate
// node->v.element, so they must be answered before the element union member
// is read.
//
// For elements the normalized tag name is returned, except that:
//   - SVG elements prefer the case-corrected name from
//     gumbo_normalize_svg_tagname() when one exists;
//   - tags with no normalized name fall back to the original tag text.
std::string GumboInterface::get_tag_name(GumboNode *node)
{
    if (node->type == GUMBO_NODE_DOCUMENT) {
        return "#document";
    }
    if ((node->type == GUMBO_NODE_TEXT) || (node->type == GUMBO_NODE_WHITESPACE)) {
        return "#text";
    }
    if (node->type == GUMBO_NODE_CDATA) {
        return "#cdata";
    }
    if (node->type == GUMBO_NODE_COMMENT) {
        // Comments store their data in v.text; reading v.element.tag for them
        // would interpret unrelated bytes as a tag.
        return "#comment";
    }

    std::string tagname = gumbo_normalized_tagname(node->v.element.tag);
    const bool is_svg = (node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG);

    if (tagname.empty() || is_svg) {
        // set up to examine original text of tag.
        GumboStringPiece gsp = node->v.element.original_tag;
        gumbo_tag_from_original_text(&gsp);

        // special handling for some svg tag names.
        if (is_svg) {
            const char *data = gumbo_normalize_svg_tagname(&gsp);
            // NOTE: data may not be null-terminated!
            // since case change only - length must be same as original
            // if no replacement found returns null, not original tag!
            if (data != NULL) {
                return std::string(data, gsp.length);
            }
        }
        if (tagname.empty()) {
            return std::string(gsp.data, gsp.length);
        }
    }
    return tagname;
}
// TODO: case-insensitive comparison static void searchForDivBlocks(GumboNode* node) { if (node->type != GUMBO_NODE_ELEMENT) return; // <div class="forum-post-entry"> GumboAttribute* classAttr = NULL; if ((node->v.element.tag == GUMBO_TAG_DIV) && (classAttr = gumbo_get_attribute(&node->v.element.attributes, "class"))) { std::string classAttrStr(classAttr->value); if (classAttrStr == "forum-post-entry" ) { // <div class="forum-post-text" id="message_text_4453758"> GumboNode* fpeNode = node; ForumPageParser fpp; fpp.parseForumPostEntry(fpeNode); for (unsigned int i = 0; i < fpeNode->v.element.children.length; ++i) { #if 0 GumboNode* fpeChildNode = static_cast< GumboNode* >(fpeNode->v.element.children.data[i]); if (fpeChildNode->type != GUMBO_NODE_ELEMENT) continue; assert(fpeChildNode->v.element.tag == GUMBO_TAG_DIV); if (fpeChildNode->v.element.tag != GUMBO_TAG_DIV) continue; std::cout << " FPE children tags:" << gumbo_normalized_tagname(fpeChildNode->v.element.tag) << std::endl; for (int iAttr = 0; iAttr < fpeChildNode->v.element.attributes.length;++iAttr) { GumboAttribute* attr = static_cast< GumboAttribute* >( fpeChildNode->v.element.attributes.data[iAttr] ); assert(attr != NULL); std::cout << "ATTR: " << attr->name << " = " << attr->value << std::endl; } if (fpeChildNode->v.element.tag == GUMBO_TAG_DIV) { GumboAttribute* fpeChildNodeClassAttr = gumbo_get_attribute(&fpeChildNode->v.element.attributes, "class"); assert(fpeChildNodeClassAttr != NULL); std::string fpeChildNodeClassAttrStr(fpeChildNodeClassAttr->value); if (fpeChildNodeClassAttrStr == "forum-post-text") { GumboNode* fptNode = fpeChildNode; assert(fptNode->type == GUMBO_NODE_ELEMENT); // qDebug() << " forum-post-text children count: " << fptNode->v.element.children.length; GumboAttribute* fptNodeIdAttr = gumbo_get_attribute(&fptNode->v.element.attributes, "id"); assert(fptNodeIdAttr != NULL); //std::cout << " Forum post found: " << fptNodeIdAttr->value << std::endl; // FIXME: remove after debugging if 
(std::strcmp(fptNodeIdAttr->value, "message_text_4453758") != 0) continue; GumboVector* fptNodeChildren = &fptNode->v.element.children; for (unsigned int j = 0; j < fptNodeChildren->length; ++j) { GumboNode* fptNodeChild = static_cast< GumboNode* >(fptNodeChildren->data[j]); std::cout << "Element type: " << gumboElementTypeToString( fptNodeChild->type ) << std::endl; if (fptNodeChild->type == GUMBO_NODE_TEXT) { /*std::locale loc(std::locale(), new std::codecvt_utf8<char>); std::cout.imbue(loc);*/ std::string TEST(fptNodeChild->v.text.text); WCHAR wBuf[1024] = { 0 }; int res2 = MultiByteToWideChar(CP_UTF8, 0, fptNodeChild->v.text.text, -1, wBuf, 1024); char mbcsBuf[1024] = { 0 }; int res3 = WideCharToMultiByte(1251, 0, wBuf, 1024, mbcsBuf, 1024, NULL, NULL); std::string TEST_2(mbcsBuf); std::cout << "Element text: " << TEST_2 << std::endl; break; } else if (fptNodeChild->type == GUMBO_NODE_ELEMENT) { std::cout << " " << gumbo_normalized_tagname(fptNodeChild->v.element.tag) << std::endl; //qDebug() << " " << gumbo_tag_from_original_text( &fptNodeChild->v.element.original_tag ); } } } } #endif } std::cout << "----------------------------" << std::endl; } } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { searchForDivBlocks(static_cast< GumboNode* >(children->data[i])); } }
/// Returns the normalized tag name of the underlying node.
///
/// @throws NotAnElementException when the node is not an element.
string_view Node::GetElement() const
{
    if (impl->data->type != GUMBO_NODE_ELEMENT)
    {
        throw NotAnElementException();
    }

    const char* tagName = gumbo_normalized_tagname(impl->data->v.element.tag);
    return string_view { tagName };
}