/**
 * Builds the landmark list from the nav document.
 *
 * Only the first <nav> whose epub:type attribute equals "landmarks" is read.
 * Every <a> found beneath it contributes one entry: its epub:type (if any),
 * its URL-decoded href (if any) and its XML-decoded local text as the title.
 *
 * @return the collected entries; empty when no nav resource is set or no
 *         landmarks nav exists.
 */
QList<NavLandmarkEntry> NavProcessor::GetLandmarks()
{
    QList<NavLandmarkEntry> landmarks;
    if (!m_NavResource) {
        return landmarks;
    }

    QReadLocker locker(&m_NavResource->GetLock());
    GumboInterface gi = GumboInterface(m_NavResource->GetText(), "3.0");
    gi.parse();

    const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
    for (QList<GumboNode*>::const_iterator nav_it = nav_nodes.constBegin();
         nav_it != nav_nodes.constEnd(); ++nav_it) {
        GumboNode* nav = *nav_it;
        GumboAttribute* nav_type = gumbo_get_attribute(&nav->v.element.attributes, "epub:type");
        const bool is_landmarks = nav_type && (QString::fromUtf8(nav_type->value) == "landmarks");
        if (!is_landmarks) {
            continue;
        }

        QList<GumboTag> anchor_tag;
        anchor_tag << GUMBO_TAG_A;
        const QList<GumboNode*> anchors = gi.get_nodes_with_tags(nav, anchor_tag);

        for (QList<GumboNode*>::const_iterator a_it = anchors.constBegin();
             a_it != anchors.constEnd(); ++a_it) {
            GumboNode* anchor = *a_it;
            GumboAttribute* type_attr = gumbo_get_attribute(&anchor->v.element.attributes, "epub:type");
            GumboAttribute* href_attr = gumbo_get_attribute(&anchor->v.element.attributes, "href");

            NavLandmarkEntry entry;
            if (type_attr) {
                entry.etype = QString::fromUtf8(type_attr->value);
            }
            if (href_attr) {
                entry.href = Utility::URLDecodePath(QString::fromUtf8(href_attr->value));
            }
            entry.title = Utility::DecodeXML(gi.get_local_text_of_node(anchor));
            landmarks.append(entry);
        }

        // Only the first landmarks nav is considered.
        break;
    }

    return landmarks;
}
/**
 * Parses one <div class="forum-post-text" id="message_text_NNN"> element.
 *
 * Prints the message id, then walks the direct children of the div:
 * text nodes are converted from UTF-8 to CP1251 and concatenated, element
 * children are only reported (quotes are not parsed yet), and every other
 * node kind is reported as ignored. The accumulated text is printed at the end.
 *
 * @param node the forum-post-text div; its tag, class and id are asserted.
 */
void parseForumPostText(GumboNode* node)
{
    assert(node->type == GUMBO_NODE_ELEMENT);
    assert(node->v.element.tag == GUMBO_TAG_DIV);

    GumboAttribute* classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
    assert(classAttr != NULL);
    assert(strCiCmp(classAttr->value, "forum-post-text"));

    GumboAttribute* idAttr = gumbo_get_attribute(&node->v.element.attributes, "id");
    assert(idAttr != NULL);

    static const char kIdPrefix[] = "message_text_";
    std::string idStr(idAttr->value);
    std::size_t messageTextStrIndex = idStr.find(kIdPrefix);
    assert(messageTextStrIndex != std::string::npos);
    if (messageTextStrIndex == std::string::npos) {
        // Release builds compile the assert away; do not slice a malformed id.
        return;
    }

    // The id is whatever follows the prefix, wherever the prefix was found
    // (previously a fixed offset of 13 was used regardless of the match position).
    std::string messageIdStr = idStr.substr(messageTextStrIndex + (sizeof(kIdPrefix) - 1));
    std::cout << "Forum post found, id: " << messageIdStr << std::endl;

    // FIXME: remove after debug completion
    if (!strCiCmp(messageIdStr, "4453758")) return;

    std::string forumPostText = "";
    GumboVector* nodeChildren = &node->v.element.children;
    for (std::size_t i = 0; i < nodeChildren->length; i++)
    {
        GumboNode* childNode = static_cast<GumboNode*>(nodeChildren->data[i]);
        assert(childNode != NULL);

        if (childNode->type == GUMBO_NODE_ELEMENT)
        {
            // FIXME: parse message quotes (<table> tags)
            std::cout << "Tag found: " << gumbo_normalized_tagname(childNode->v.element.tag);
            std::cout << ", children: " << childNode->v.element.children.length << std::endl;
        }
        else if (childNode->type == GUMBO_NODE_TEXT)
        {
            std::string elementTextUtf8(childNode->v.text.text);
            std::string elementTextCp1251 = "";
            utf8ToCp1251(elementTextCp1251, elementTextUtf8);
            forumPostText += elementTextCp1251;
        }
        else
        {
            std::cout << "Ignoring \"" << gumboElementTypeToString(childNode->type) << "\" node..." << std::endl;
        }
    }

    std::cout << "Forum post: " << std::endl << forumPostText << std::endl;
}
/**
 * Recursively converts an <ol> element into a flat, depth-annotated TOC list.
 *
 * For every <li> directly under the list, each direct <a> child yields one
 * entry at level `lvl` (URL-decoded href if present, XML-decoded local text
 * as title), and each direct <ol> child is expanded in place at `lvl + 1`.
 *
 * @param gi   interface used to extract the text of anchor nodes
 * @param node candidate list node; anything that is not an <ol> element yields an empty list
 * @param lvl  nesting level assigned to entries found directly in this list
 * @return entries in document order
 */
QList<NavTOCEntry> NavProcessor::GetNodeTOC(GumboInterface & gi, const GumboNode * node, int lvl)
{
    QList<NavTOCEntry> entries;

    const bool is_ordered_list = (node->type == GUMBO_NODE_ELEMENT) &&
                                 (node->v.element.tag == GUMBO_TAG_OL);
    if (!is_ordered_list) {
        return entries;
    }

    const GumboVector& items = node->v.element.children;
    for (unsigned int item_idx = 0; item_idx < items.length; ++item_idx) {
        GumboNode* item = static_cast<GumboNode*>(items.data[item_idx]);
        if (item->type != GUMBO_NODE_ELEMENT || item->v.element.tag != GUMBO_TAG_LI) {
            continue;
        }

        const GumboVector& parts = item->v.element.children;
        for (unsigned int part_idx = 0; part_idx < parts.length; ++part_idx) {
            GumboNode* part = static_cast<GumboNode*>(parts.data[part_idx]);
            if (part->type != GUMBO_NODE_ELEMENT) {
                continue;
            }

            switch (part->v.element.tag) {
                case GUMBO_TAG_A: {
                    NavTOCEntry entry;
                    entry.lvl = lvl;
                    GumboAttribute* href_attr = gumbo_get_attribute(&part->v.element.attributes, "href");
                    if (href_attr) {
                        entry.href = Utility::URLDecodePath(QString::fromUtf8(href_attr->value));
                    }
                    entry.title = Utility::DecodeXML(gi.get_local_text_of_node(part));
                    entries.append(entry);
                    break;
                }
                case GUMBO_TAG_OL:
                    // Nested list: its entries sit one level deeper.
                    entries.append(GetNodeTOC(gi, part, lvl + 1));
                    break;
                default:
                    break;
            }
        }
    }

    return entries;
}
/**
 * Builds the table-of-contents list from the nav document.
 *
 * Only the first <nav> whose epub:type attribute equals "toc" is read. Every
 * <ol> node that get_nodes_with_tags() reports for that nav is converted with
 * GetNodeTOC() starting at level 1, and the results are concatenated.
 *
 * @return the collected entries; empty when no nav resource is set or no
 *         toc nav exists.
 */
QList<NavTOCEntry> NavProcessor::GetTOC()
{
    QList<NavTOCEntry> toc_entries;
    if (!m_NavResource) {
        return toc_entries;
    }

    QReadLocker locker(&m_NavResource->GetLock());
    GumboInterface gi = GumboInterface(m_NavResource->GetText(), "3.0");
    gi.parse();

    const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
    for (QList<GumboNode*>::const_iterator nav_it = nav_nodes.constBegin();
         nav_it != nav_nodes.constEnd(); ++nav_it) {
        GumboNode* nav = *nav_it;
        GumboAttribute* nav_type = gumbo_get_attribute(&nav->v.element.attributes, "epub:type");
        const bool is_toc = nav_type && (QString::fromUtf8(nav_type->value) == "toc");
        if (!is_toc) {
            continue;
        }

        QList<GumboTag> list_tag;
        list_tag << GUMBO_TAG_OL;
        const QList<GumboNode*> lists = gi.get_nodes_with_tags(nav, list_tag);
        for (QList<GumboNode*>::const_iterator ol_it = lists.constBegin();
             ol_it != lists.constEnd(); ++ol_it) {
            toc_entries.append(GetNodeTOC(gi, *ol_it, 1));
        }

        // Only the first toc nav is considered.
        break;
    }

    return toc_entries;
}
/**
 * Recursively collects every element (or template) node whose class attribute
 * contains all of the names in the NULL-terminated `classes` vector.
 *
 * Each match is pushed with push_gumbo_node() and stored at index *idxp of the
 * object at stack position -2, after which *idxp is incremented. Children are
 * always visited, whether or not the node itself matched.
 */
static void
es_gumbo_find_by_class_r(GumboNode *node, char **classes, duk_context *ctx,
                         int *idxp, es_gumbo_output_t *ego)
{
  if(node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
    return;

  const GumboElement *elem = &node->v.element;
  GumboAttribute *class_attr = gumbo_get_attribute(&elem->attributes, "class");

  if(class_attr != NULL) {
    char **have = strvec_split(class_attr->value, ' ');

    /* Stays 1 only while every wanted class has been located in `have`. */
    int all_present = 1;
    for(int w = 0; all_present && classes[w] != NULL; w++) {
      int hit = 0;
      for(int h = 0; !hit && have[h] != NULL; h++)
        hit = !strcmp(have[h], classes[w]);
      all_present = hit;
    }

    if(all_present) {
      push_gumbo_node(ctx, node, ego);
      duk_put_prop_index(ctx, -2, (*idxp)++);
    }
    strvec_free(have);
  }

  for(int c = 0; c < elem->children.length; c++)
    es_gumbo_find_by_class_r(elem->children.data[c], classes, ctx, idxp, ego);
}
void HTMLParser::extractLinks(GumboNode *node, list<pair<string, string> > &links, string &docURL) { if (node->type != GUMBO_NODE_ELEMENT) { return; } GumboAttribute* href; if (node->v.element.tag == GUMBO_TAG_A && (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { string anchor; if (node->v.element.children.length > 0) { GumboNode* title_text = (GumboNode*) node->v.element.children.data[0]; anchor = title_text->v.text.text; } else { anchor = ""; } string url = treatLink(docURL, href->value); if (!url.empty()) { links.push_back(make_pair(url, anchor)); } } GumboVector* children = &node->v.element.children; for (size_t i = 0; i < children->length; ++i) { extractLinks(static_cast<GumboNode*>(children->data[i]), links, docURL); } }
/**
 * Recursively gathers the non-empty href values of all <a> and <link>
 * elements below `node` into `vec`, in document order.
 */
void Client::getlinks(GumboNode *node, std::vector<std::string>& vec)
{
    // Nothing to do unless the current node is an element.
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    const GumboTag tag = node->v.element.tag;
    if (tag == GUMBO_TAG_A || tag == GUMBO_TAG_LINK) {
        GumboAttribute* href = gumbo_get_attribute(&node->v.element.attributes, "href");
        if (href) {
            char const* target = href->value;
            if (target[0] != '\0') {
                vec.push_back(target);
            }
        }
    }

    // Recurse into every child of this element.
    GumboVector *kids = &node->v.element.children;
    for (unsigned int k = 0; k < kids->length; ++k) {
        getlinks(static_cast<GumboNode*>(kids->data[k]), vec);
    }
}
/**
 * Recursively records every element below `node` whose tag equals tag->htmlTag.
 *
 * For each match one attrValue is appended to tag->tagElement. Its attribute
 * list holds the attribute named tag->attr when the element has it, otherwise
 * all of the element's attributes. Its content is search_text(node, tag) when
 * isObtainContentTag() is true and empty otherwise.
 *
 * @param node subtree root; non-element nodes are ignored
 * @param tag  describes what to look for and receives the results
 */
void HtmlParser::search_for_links(GumboNode* node, Tags *tag)
{
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    if (node->v.element.tag == tag->htmlTag) {
        attrValue attrElement;

        GumboAttribute* attribute =
            gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());
        if (attribute) {
            attrElement.attrList.insert(make_pair(attribute->name, attribute->value));
        } else {
            // The requested attribute is absent: collect every attribute instead.
            for (unsigned int i = 0; i < node->v.element.attributes.length; i++) {
                attribute = static_cast<GumboAttribute*>(node->v.element.attributes.data[i]);
                attrElement.attrList.insert(make_pair(attribute->name, attribute->value));
            }
        }

        attrElement.content = isObtainContentTag() ? search_text(node, tag) : "";
        tag->tagElement.push_back(attrElement);
    }

    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        search_for_links(static_cast<GumboNode*>(children->data[i]), tag);
    }
}
/**
 * Builds a package record from a wiki <a> element that sits inside an <li>.
 *
 * The package href is a copy of the anchor's href, the repo is derived from it
 * with package_get_repo(), and the description is taken from the first text
 * child of the parent <li> via substr(text, 3, strlen(text) + 1).
 *
 * @param anchor the <a> element node
 * @return a newly allocated package, or NULL when the anchor has no href, its
 *         parent is not an <li> element, or an allocation fails. Fields other
 *         than href, repo and description are left for the caller to set.
 */
static package_t *
package_from_wiki_anchor(GumboNode *anchor) {
  /* Validate before allocating so the early exits cannot leak anything. */
  GumboNode *parent = anchor->parent;
  if (NULL == parent
      || GUMBO_NODE_ELEMENT != parent->type
      || GUMBO_TAG_LI != parent->v.element.tag) {
    return NULL;
  }

  GumboAttribute *href = gumbo_get_attribute(&anchor->v.element.attributes, "href");
  if (NULL == href) return NULL;

  package_t *pkg = malloc(sizeof(package_t));
  if (NULL == pkg) return NULL;

  char *url = strdup(href->value);
  if (NULL == url) {
    free(pkg);
    return NULL;
  }

  pkg->href = url;
  pkg->repo = package_get_repo(url);
  /* Stays NULL when the <li> has no text child. */
  pkg->description = NULL;

  GumboVector *children = &parent->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    GumboNode *child = children->data[i];
    if (GUMBO_NODE_TEXT == child->type) {
      // TODO support nested elements (<code>, <em>, etc.)
      char *description = strdup(child->v.text.text);
      if (NULL != description) {
        /* NOTE(review): if substr() returns a fresh allocation, `description`
         * leaks here - confirm substr()'s ownership rules before freeing. */
        pkg->description = substr(description, 3, strlen(description) + 1);
      }
      break;
    }
  }

  return pkg;
}
/**
 * Depth-first search for the first element with tag tag->htmlTag that carries
 * an attribute named `atributo` whose value equals `valAtributo`.
 *
 * When the element has the attribute named tag->attr, only that attribute is
 * tested; otherwise every attribute of the element is tested.
 *
 * @return cleantext() of the first matching element, or "" when nothing in the
 *         subtree matches (a match whose cleaned text is empty is
 *         indistinguishable from no match and the search continues past it).
 */
string HtmlParser::search_for_elem(GumboNode* node, Tags *tag, string atributo, string valAtributo)
{
    if (node->type != GUMBO_NODE_ELEMENT) {
        return "";
    }

    if (node->v.element.tag == tag->htmlTag) {
        GumboAttribute* attribute =
            gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());
        if (attribute) {
            if (valAtributo == attribute->value && atributo == attribute->name) {
                return cleantext(node);
            }
        } else {
            // The attribute named by the tag is absent: test every attribute.
            for (unsigned int i = 0; i < node->v.element.attributes.length; i++) {
                attribute = static_cast<GumboAttribute*>(node->v.element.attributes.data[i]);
                if (valAtributo == attribute->value && atributo == attribute->name) {
                    return cleantext(node);
                }
            }
        }
    }

    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        string ret = search_for_elem(static_cast<GumboNode*>(children->data[i]), tag, atributo, valAtributo);
        if (!ret.empty()) {
            return ret;
        }
    }

    return "";
}
/* Category name carried along while walking the wiki body. */
typedef struct {
  char *name; /* current category string (may be the caller's buffer) */
  int owned;  /* non-zero once `name` was allocated by the walker itself */
} wiki_category_state_t;

/**
 * Recursive worker for wiki_registry_iterate_nodes().
 *
 * An <a name="..."> switches the current category; any other <a> is turned
 * into a package tagged with the current category and appended to `packages`.
 * Anchors are not descended into; every other element is.
 */
static void
wiki_registry_iterate_nodes_r(GumboNode *node, list_t *packages, wiki_category_state_t *state) {
  if (node->type != GUMBO_NODE_ELEMENT) return;

  if (node->v.element.tag == GUMBO_TAG_A) {
    GumboAttribute *name = gumbo_get_attribute(&node->v.element.attributes, "name");
    if (name) {
      // set the current category for the anchors that follow
      char *copy = strdup(name->value);
      if (copy) {
        if (state->owned) free(state->name);
        state->name = copy;
        state->owned = 1;
      }
      /* On allocation failure the previous category is kept. */
    } else {
      package_t *pkg = package_from_wiki_anchor(node);
      if (pkg) {
        pkg->category = strdup(state->name ? state->name : "");
        list_rpush(packages, list_node_new(pkg));
      }
    }
    return;
  }

  GumboVector *children = &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    wiki_registry_iterate_nodes_r(children->data[i], packages, state);
  }
}

/**
 * Walks the subtree under `node` and appends one package per package anchor.
 *
 * The category state is shared by the whole walk through a pointer, so a
 * category set by one anchor is seen by its following siblings. Previously the
 * buffer was realloc'ed through a by-value parameter, which left the caller
 * and sibling calls holding a stale pointer whenever the block moved.
 *
 * @param node     subtree root
 * @param packages list receiving the packages
 * @param category initial category string; it is only read, never resized or
 *                 freed here, so it remains owned by the caller
 */
static void
wiki_registry_iterate_nodes(GumboNode *node, list_t *packages, char *category) {
  wiki_category_state_t state = { category, 0 };
  wiki_registry_iterate_nodes_r(node, packages, &state);
  if (state.owned) free(state.name);
}
void FacebookClient::extractLinks(GumboNode* node) { if (node->type != GUMBO_NODE_ELEMENT) { return; } if (node->v.element.tag == GUMBO_TAG_A ) { GumboAttribute* hrefAttr; hrefAttr = gumbo_get_attribute( &node->v.element.attributes, "href" ); std::string valueStr = std::string( hrefAttr->value ); std::size_t match = valueStr.find( "messages/thread" ); if( match == 1 ) { std::stringstream ss( valueStr ); int index = 0; //double xx = 100001930423169; while( std::getline( ss, valueStr, '/' ) ) { std::istringstream iss( valueStr ); if( index == 3 ) { std::istringstream iss( valueStr ); iss >> this->friendID; break; }; index++; };
/**
 * Parses one <div class="forum-post-entry"> element.
 *
 * Every element child must be a <div>; it is dispatched on its class to the
 * matching parser (post text, attachments, last-edit note, user signature).
 * An unknown class trips an assert. Non-element children are skipped.
 *
 * @param node the forum-post-entry div; its tag and class are asserted.
 */
void parseForumPostEntry(GumboNode* node)
{
    assert(node->type == GUMBO_NODE_ELEMENT);
    assert(node->v.element.tag == GUMBO_TAG_DIV);

    GumboAttribute* classAttr = gumbo_get_attribute(&node->v.element.attributes, "class");
    assert(classAttr != NULL);
    assert(strCiCmp(classAttr->value, "forum-post-entry"));

    GumboVector* nodeChildren = &node->v.element.children;
    int fpeElementCount = 0;
    for (std::size_t i = 0; i < nodeChildren->length; i++)
    {
        GumboNode* divChildNode = static_cast<GumboNode*>(nodeChildren->data[i]);
        assert(divChildNode != NULL);
        if (divChildNode->type != GUMBO_NODE_ELEMENT) continue;

        fpeElementCount++;
        assert(divChildNode->v.element.tag == GUMBO_TAG_DIV);

        GumboAttribute* childClassAttr = gumbo_get_attribute(&divChildNode->v.element.attributes, "class");
        // A child div without a class attribute used to be dereferenced unchecked.
        assert(childClassAttr != NULL);
        if (childClassAttr == NULL) continue;

        if (strCiCmp(childClassAttr->value, "forum-post-text"))
        {
            parseForumPostText(divChildNode);
        }
        else if (strCiCmp(childClassAttr->value, "forum-post-attachments"))
        {
            parseForumPostAttachments(divChildNode);
        }
        else if (strCiCmp(childClassAttr->value, "forum-post-lastedit"))
        {
            parseForumPostLastEdit(divChildNode);
        }
        else if (strCiCmp(childClassAttr->value, "forum-user-signature"))
        {
            parseForumUserSignature(divChildNode);
        }
        else
        {
            assert(0);
        }
    }

    assert(fpeElementCount <= 3);
}
/**
 * Returns the value of the attribute `attributeName` on `node`.
 *
 * @param node          node to inspect; must not be null
 * @param attributeName name of the attribute to look up
 * @return the attribute value, or "" when the attribute is absent or the node
 *         is of a kind that carries no attributes
 */
string ParserUtils::getAttribute(const GumboNode* node, const string& attributeName)
{
    assert(node != nullptr);

    // Only element-like nodes store their data in v.element; for the kinds
    // below a different union member is active and there are no attributes.
    switch (node->type) {
        case GUMBO_NODE_DOCUMENT:
        case GUMBO_NODE_TEXT:
        case GUMBO_NODE_CDATA:
        case GUMBO_NODE_COMMENT:
        case GUMBO_NODE_WHITESPACE:
            return "";
        default:
            break;
    }

    const GumboAttribute* nodeAttribute =
        gumbo_get_attribute(&node->v.element.attributes, attributeName.c_str());
    if (nodeAttribute != nullptr) {
        return nodeAttribute->value;
    }
    return "";
}
/**
 * Wraps a nav resource and makes sure it holds a usable nav document.
 *
 * The resource text is parsed; unless it contains a <nav> with an epub:type
 * attribute equal to "toc", the text is replaced by a minimal template with
 * empty toc and landmarks navs, using the default metadata language from the
 * settings store.
 *
 * @param nav_resource resource to manage; a null pointer is stored as-is and
 *                     skips the validation, matching the null checks in the
 *                     accessor methods.
 */
NavProcessor::NavProcessor(HTMLResource * nav_resource)
  : m_NavResource(nav_resource)
{
    if (!m_NavResource) {
        return;
    }

    bool has_toc = false;
    {
        // The read lock is released at the end of this scope, before the
        // write lock below is taken.
        QReadLocker locker(&m_NavResource->GetLock());
        QString source = m_NavResource->GetText();
        GumboInterface gi = GumboInterface(source, "3.0");
        gi.parse();
        const QList<GumboNode*> nav_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_NAV);
        for (int i = 0; i < nav_nodes.length() && !has_toc; ++i) {
            GumboNode* node = nav_nodes.at(i);
            GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, "epub:type");
            if (attr && QString::fromUtf8(attr->value) == "toc") {
                has_toc = true;
            }
        }
    }

    // A toc nav can only exist when at least one nav exists, so this single
    // flag covers both of the validity conditions.
    if (has_toc) {
        return;
    }

    SettingsStore ss;
    QString lang = ss.defaultMetadataLang();
    QString newsource =
        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        "<!DOCTYPE html>\n"
        "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" "
        "lang=\"%1\" xml:lang=\"%2\">\n"
        "<head>\n"
        " <meta charset=\"utf-8\" />\n"
        " <style type=\"text/css\">\n"
        " nav#landmarks, nav#page-list { display:none; }\n"
        " ol { list-style-type: none; }\n"
        " </style>\n"
        "</head>\n"
        "<body epub:type=\"frontmatter\">\n"
        " <nav epub:type=\"toc\" id=\"toc\">\n"
        " </nav>\n"
        " <nav epub:type=\"landmarks\" id=\"landmarks\" hidden=\"\">\n"
        " </nav>\n"
        "</body>\n"
        "</html>";
    newsource = newsource.arg(lang).arg(lang);

    QWriteLocker locker(&m_NavResource->GetLock());
    m_NavResource->SetText(newsource);
}
/**
 * Finds the first element with id="wiki-body" (depth first) and hands it to
 * wiki_registry_iterate_nodes() to collect packages. Subtrees below the
 * wiki body are not searched for further wiki bodies.
 *
 * @param node     subtree root; non-element nodes are ignored
 * @param packages list receiving the packages
 */
static void
wiki_registry_find_body(GumboNode* node, list_t *packages) {
  if (node->type != GUMBO_NODE_ELEMENT) return;

  GumboAttribute *id = gumbo_get_attribute(&node->v.element.attributes, "id");
  if (id && 0 == strcmp("wiki-body", id->value)) {
    /* Temp category buffer, populated while iterating. It must start out as a
     * valid empty string because it is duplicated as-is for packages that
     * appear before the first category anchor. */
    char *category = calloc(1, 1);
    if (NULL == category) return;
    wiki_registry_iterate_nodes(node, packages, category);
    /* NOTE(review): `category` is not freed here because the callee may have
     * resized it; confirm ownership before adding a free(). */
    return;
  }

  GumboVector *children = &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    wiki_registry_find_body(children->data[i], packages);
  }
}
/**
 * Collects, in pre-order, every element in the subtree rooted at `node`
 * (the root included) that has an attribute named `attname`.
 *
 * @return the matching nodes; empty when `node` is not an element.
 */
QList<GumboNode*> GumboInterface::get_nodes_with_attribute(GumboNode* node, const char * attname)
{
    QList<GumboNode*> matches;
    if (node->type != GUMBO_NODE_ELEMENT) {
        return matches;
    }

    if (gumbo_get_attribute(&node->v.element.attributes, attname) != NULL) {
        matches.append(node);
    }

    const GumboVector& kids = node->v.element.children;
    for (unsigned int k = 0; k < kids.length; ++k) {
        GumboNode* kid = static_cast<GumboNode*>(kids.data[k]);
        matches.append(get_nodes_with_attribute(kid, attname));
    }

    return matches;
}
/**
 * Collects, in pre-order, the values of the attribute `attr_name` from every
 * element in the subtree rooted at `node` (the root included).
 *
 * @return the values decoded from UTF-8; empty when `node` is not an element.
 */
QStringList GumboInterface::get_values_for_attr(GumboNode* node, const char* attr_name)
{
    QStringList values;
    if (node->type != GUMBO_NODE_ELEMENT) {
        return values;
    }

    GumboAttribute* found = gumbo_get_attribute(&node->v.element.attributes, attr_name);
    if (found) {
        values.append(QString::fromUtf8(found->value));
    }

    const GumboVector& kids = node->v.element.children;
    for (unsigned int k = 0; k < kids.length; ++k) {
        GumboNode* kid = static_cast<GumboNode*>(kids.data[k]);
        values.append(get_values_for_attr(kid, attr_name));
    }

    return values;
}
/**
 * Depth-first search for the first element (or template) node whose id
 * attribute equals `id`.
 *
 * @return the matching node, or NULL when the subtree contains none.
 */
static GumboNode *
es_gumbo_find_by_id_r(GumboNode *node, const char *id)
{
  if(node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
    return NULL;

  const GumboElement *elem = &node->v.element;
  GumboAttribute *id_attr = gumbo_get_attribute(&elem->attributes, "id");
  if(id_attr != NULL && strcmp(id_attr->value, id) == 0)
    return node;

  GumboNode *hit = NULL;
  for(int c = 0; hit == NULL && c < elem->children.length; c++)
    hit = es_gumbo_find_by_id_r(elem->children.data[c], id);

  return hit;
}
/**
 * Collects, in pre-order, the manifest properties implied by the subtree
 * rooted at `node`.
 *
 * An element contributes its tag name when that name is in
 * manifest_properties, and "remote-resources" when it has a src attribute
 * whose URL is not relative. Duplicates are not removed.
 *
 * @return the collected properties; empty when `node` is not an element.
 */
QStringList GumboInterface::get_properties(GumboNode* node)
{
    QStringList found;
    if (node->type != GUMBO_NODE_ELEMENT) {
        return found;
    }

    const std::string tag_name = get_tag_name(node);
    if (in_set(manifest_properties, tag_name)) {
        found.append(QString::fromStdString(tag_name));
    }

    GumboAttribute* src = gumbo_get_attribute(&node->v.element.attributes, "src");
    if (src) {
        const QUrl target(QString::fromUtf8(src->value));
        if (!target.isRelative()) {
            found.append(QString("remote-resources"));
        }
    }

    const GumboVector& kids = node->v.element.children;
    for (unsigned int k = 0; k < kids.length; ++k) {
        found.append(get_properties(static_cast<GumboNode*>(kids.data[k])));
    }

    return found;
}
/**
 * Processes the direct children of the <head> element.
 *
 * A <title> sets the window title from its single text/whitespace child; a
 * title with any other shape reports an error and stops processing. A <link>
 * appends its href to cssFilePaths; a <link> without href reports an error
 * and is skipped.
 *
 * @param h the head element node
 */
void t2HTMLParser::t2LabelParser::head(GumboNode *h)
{
    GumboVector *headChildren = &h->v.element.children;

    for (unsigned int i = 0; i < headChildren->length; i++)
    {
        GumboNode *child = static_cast<GumboNode *>(headChildren->data[i]);
        if (child->type != GUMBO_NODE_ELEMENT)
            continue;

        // find title
        if (child->v.element.tag == GUMBO_TAG_TITLE)
        {
            // The title must have exactly one child node.
            if (child->v.element.children.length != 1)
            {
                t2PrintError("<empty title>");
                return;
            }

            GumboNode *title = static_cast<GumboNode *>(child->v.element.children.data[0]);
            if (title->type != GUMBO_NODE_TEXT && title->type != GUMBO_NODE_WHITESPACE)
            {
                t2PrintError("标题只可为空或文字");
                return;
            }

            t2Window::getInstance()->window->setTitle(title->v.text.text);
        }
        // find css file
        else if (child->v.element.tag == GUMBO_TAG_LINK)
        {
            GumboAttribute *href = gumbo_get_attribute(&child->v.element.attributes, "href");
            if (!href)
            {
                // Previously execution fell through and dereferenced the null attribute.
                t2PrintError("<link>缺失href属性");
                continue;
            }

            cssFilePaths.push_back(href->value);
        }
    }
}
/**
 * Depth-first search for the first element whose id attribute equals `id`.
 *
 * @param id       id value to look for
 * @param document root of the search; a document node or an element node
 * @return the matching element, or NULL when none exists or `document` is
 *         neither a document nor an element
 */
GumboNode *
gumbo_get_element_by_id(const char *id, GumboNode *document) {
  const GumboVector *children;

  if (GUMBO_NODE_ELEMENT == document->type) {
    GumboAttribute *node_id =
      gumbo_get_attribute(&document->v.element.attributes, "id");
    if (node_id && 0 == strcmp(id, node_id->value)) {
      return document;
    }
    children = &document->v.element.children;
  } else if (GUMBO_NODE_DOCUMENT == document->type) {
    /* A document node stores its data in v.document and has no attributes;
     * reading v.element.attributes on it is an invalid union access. */
    children = &document->v.document.children;
  } else {
    return NULL;
  }

  // iterate all children
  for (unsigned int i = 0; i < children->length; i++) {
    GumboNode *node = gumbo_get_element_by_id(id, (GumboNode *) children->data[i]);
    if (node) return node;
  }

  return NULL;
}
bool SteamUserCrawler::run() { printf("Started Running User Crawler\n"); gettimeofday(&start, NULL); // Connect To DB dbConn->connect(); gettimeofday(&end, NULL); printf("DB Connected (time consumed : %ldms)\n", this->calTime()); sql::Statement *stmt; sql::ResultSet *res; sql::PreparedStatement *pstmt; pstmt = dbConn->con->prepareStatement("INSERT INTO user(url, name, steamlv) VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE name=VALUES(name), steamlv=VALUES(steamlv);"); string page; string userName; int userLevel; string url = this->seedURL; if(url == "r") { gettimeofday(&start, NULL); stmt = dbConn->con->createStatement(); res = stmt->executeQuery("SELECT url FROM user WHERE name IS NULL ORDER BY RAND() LIMIT 1;"); gettimeofday(&end, NULL); if(res->next()) { url = res->getString(1); printf("Getting Random Seed URL from DB Done (time consumed : %ldms)\n", this->calTime()); } delete stmt; delete res; } while(url != "") { gettimeofday(&start, NULL); string page = curl->getPage(url); gettimeofday(&end, NULL); printf("Getting User Profile Page Done (time consumed : %ldms)\n", this->calTime()); userName = ""; userLevel = -1; // Getting User Name & User Steam Level if(page != "") { gettimeofday(&start, NULL); GumboOutput *output = gumbo_parse(page.c_str()); gettimeofday(&end, NULL); printf("User Profile Page Parsing Done (time consumed : %ldms)\n", this->calTime()); printf("Current URL [%s]\n", url.c_str()); queue<GumboNode *> nodes; nodes.push(output->root); gettimeofday(&start, NULL); while(!nodes.empty() && (userName == "" || userLevel == -1)) { GumboNode *node = nodes.front(); nodes.pop(); if(node->type != GUMBO_NODE_ELEMENT) { continue; } GumboAttribute *attr; // User Name if((node->v.element.tag == GUMBO_TAG_DIV) && (attr = gumbo_get_attribute(&node->v.element.attributes, "class")) && (strcmp(attr->value, "persona_name") == 0)) { GumboVector *aChild = &node->v.element.children; for(size_t i = 0; i < aChild->length; i++) { GumboNode *aNode = static_cast<GumboNode 
*>(aChild->data[i]); if(aNode->type == GUMBO_NODE_TEXT) { userName = aNode->v.text.text; trim(userName); } } continue; } // User Steam Level if((node->v.element.tag == GUMBO_TAG_DIV) && (attr = gumbo_get_attribute(&node->v.element.attributes, "class")) && (strcmp(attr->value, "persona_name persona_level") == 0)) { GumboVector *aChild = &node->v.element.children; for(size_t i = 0; i < aChild->length; i++) { GumboNode *aNode = static_cast<GumboNode *>(aChild->data[i]); if((aNode->type == GUMBO_NODE_ELEMENT) && (aNode->v.element.tag == GUMBO_TAG_DIV)) { GumboVector *bChild = &aNode->v.element.children; for(size_t j = 0; j < bChild->length; j++) { GumboNode *bNode = static_cast<GumboNode *>(bChild->data[j]); if((bNode->type == GUMBO_NODE_ELEMENT) && (bNode->v.element.tag == GUMBO_TAG_SPAN)) { GumboVector *cChild = &bNode->v.element.children; for(size_t k = 0; k < cChild->length; k++) { GumboNode *cNode = static_cast<GumboNode *>(cChild->data[k]); if(cNode->type == GUMBO_NODE_TEXT) { stringstream ss; ss << cNode->v.text.text; ss >> userLevel; } } } } } }
void Index::AddIndexIDsOneFile(HTMLResource *html_resource) { QWriteLocker locker(&html_resource->GetLock()); QString source = html_resource->GetText(); QString version = html_resource->GetEpubVersion(); GumboInterface gi = GumboInterface(source, version); QList<GumboNode*> nodes = XhtmlDoc::GetIDNodes(gi, gi.get_root_node()); bool resource_updated = false; int index_id_number = 1; foreach(GumboNode * node, nodes) { QString index_id_value; // Get the text of all sub-nodes. QString text_node_text = XhtmlDoc::GetIDElementText(gi, node); // Convert to space since Index Editor unfortunately does the same. text_node_text.replace(QChar(160), " "); GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, "id"); if (attr) { index_id_value = QString::fromUtf8(attr->value); if (index_id_value.startsWith(SIGIL_INDEX_ID_PREFIX)) { GumboElement* element = &node->v.element; gumbo_element_remove_attribute(element, attr); resource_updated = true; } } // If this node is a custom index entry make sure it gets included bool is_custom_index_entry = false; QString custom_index_value = text_node_text; attr = gumbo_get_attribute(&node->v.element.attributes, "class"); if (attr) { QString class_names = QString::fromUtf8(attr->value); if (class_names.split(" ").contains(SIGIL_INDEX_CLASS)) { is_custom_index_entry = true; GumboAttribute* titleattr = gumbo_get_attribute(&node->v.element.attributes, "title"); if (titleattr) { QString title = QString::fromUtf8(titleattr->value); if (!title.isEmpty()) { custom_index_value = title; } } } } // Use the existing id if there is one, else add one if node contains index item attr = gumbo_get_attribute(&node->v.element.attributes, "id"); if (attr) { CreateIndexEntry(text_node_text, html_resource, index_id_value, is_custom_index_entry, custom_index_value); } else { index_id_value = SIGIL_INDEX_ID_PREFIX + QString::number(index_id_number); if (CreateIndexEntry(text_node_text, html_resource, index_id_value, is_custom_index_entry, 
custom_index_value)) { GumboElement* element = &node->v.element; gumbo_element_set_attribute(element, "id", index_id_value.toUtf8().constData()); resource_updated = true; index_id_number++; } } }
/**
 * Extracts the headings of one HTML resource.
 *
 * For every node matching GHEADING_TAGS a Heading is built: its title comes
 * from the title attribute, its text is the title when non-empty and the
 * node's local text otherwise, its level is the digit at index 1 of the tag
 * name, and it counts as "in TOC" unless its class attribute contains one of
 * the not-in-TOC class names. The first heading is flagged as being at the
 * file start when it begins fewer than ALLOWED_HEADING_DISTANCE lines after
 * the body element.
 *
 * @param html_resource            resource to scan; must not be null
 * @param include_unwanted_headings when true, headings excluded from the TOC
 *                                  are returned as well
 * @return the headings in the order reported by the parser
 */
QList<Headings::Heading> Headings::GetHeadingListForOneFile(HTMLResource *html_resource, bool include_unwanted_headings)
{
    Q_ASSERT(html_resource);

    QString source = html_resource->GetText();
    QString version = html_resource->GetEpubVersion();
    GumboInterface gi = GumboInterface(source, version);
    gi.parse();

    // Source line on which the body element starts (0 when there is no body).
    unsigned int body_line = 0;
    QList<GumboNode*> body_nodes = gi.get_all_nodes_with_tag(GUMBO_TAG_BODY);
    if (!body_nodes.isEmpty()) {
        body_line = body_nodes.at(0)->v.element.start_pos.line;
    }

    QList<GumboNode*> heading_nodes = gi.get_all_nodes_with_tags(GHEADING_TAGS);
    const int heading_count = heading_nodes.count();

    QList<Headings::Heading> result;
    for (int idx = 0; idx < heading_count; ++idx) {
        GumboNode* hnode = heading_nodes.at(idx);
        const GumboVector* attributes = &hnode->v.element.attributes;

        Heading entry;
        entry.resource_file = html_resource;
        entry.path_to_node = gi.get_path_to_node(hnode);

        entry.title = QString();
        GumboAttribute* title_attr = gumbo_get_attribute(attributes, "title");
        if (title_attr) {
            entry.title = QString::fromUtf8(title_attr->value);
        }
        entry.orig_title = entry.title;

        if (entry.title.isEmpty()) {
            entry.text = gi.get_local_text_of_node(hnode);
        } else {
            entry.text = entry.title;
        }

        const QString tag_name = QString::fromStdString(gi.get_tag_name(hnode));
        entry.level = QString(tag_name.at(1)).toInt();
        entry.orig_level = entry.level;

        QString class_names = QString();
        GumboAttribute* class_attr = gumbo_get_attribute(attributes, "class");
        if (class_attr) {
            class_names = QString::fromUtf8(class_attr->value);
        }
        const bool excluded = class_names.contains(SIGIL_NOT_IN_TOC_CLASS) ||
                              class_names.contains(OLD_SIGIL_NOT_IN_TOC_CLASS);
        entry.include_in_toc = !excluded;

        const unsigned int heading_line = hnode->v.element.start_pos.line;
        entry.at_file_start = (idx == 0) && ((heading_line - body_line) < ALLOWED_HEADING_DISTANCE);
        entry.is_changed = false;

        if (entry.include_in_toc || include_unwanted_headings) {
            result.append(entry);
        }
    }

    return result;
}
bool doFetchLesson(Class t, ThreadSafeQueue<Class> *classList, std::atomic<int> *totalLessons) { std::string buffer; CURL *curl; CURLcode res; weekAmountList.lock(); int weekAmount = -1; for (int i = 0; i < weekAmountList.size(); i++) { if (weekAmountList.at(i).departmentStringId == t.departmentString() && weekAmountList.at(i).cpath == t.cpath()) { weekAmount = weekAmountList.at(i).amountOfWeeks; break; } } if(weekAmount == -1) printf("ERROR: NOT FOUND: %s - %s\n", t.departmentString().c_str(), t.cpath().c_str()); weekAmountList.unlock(); char postFields[1024] = ""; for (int i = 1; i < weekAmount+1; i++) { int n = sprintf(postFields, "%sweken[]=%d&", postFields, i); postFields[n] = '\0'; } int n = sprintf(postFields, "%ssleutelveld=%s&object=%s&filter=%s", postFields, t.classIdString().c_str(), t.cpath().c_str(), t.departmentString().c_str()); postFields[n] = '\0'; //printf("%s\n", postFields); curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_URL, "https://rooster.nhl.nl/1516/rooster.php"); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer); curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postFields); //curl_easy_setopt(curl, CURLOPT_REFERER, NHL_REFERER); curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT); // 5 sec time out on whole request curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, CURL_CONNECT_TIMEOUT); // 10 sec connect time out res = curl_easy_perform(curl); //printf("------\n%s------\n\n", buffer.c_str()); if (res == CURLE_OK) { GumboOutput* output = gumbo_parse(buffer.c_str()); GumboNode* node = GetTBodyNode(output->root); if (node == NULL) { if (t.cpath() == "ttstud"){ printf("[%s] FAIL(%s-%s-%s)\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str()); curl_easy_cleanup(curl); } else{ printf("[%s] FAIL(%s-%s-%s), aborting program\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str()); 
curl_easy_cleanup(curl); exit(1); } return true; } GumboVector* children = &node->v.element.children; bool newDay = false; int lessonAmount = 0; int yearOffset = -1; int titleOffset = -1; int locationOffset = -1; int teacherOffset = -1; int typeOffset = -1; int commentOffset = -1; int endOffset = -1; std::string date; char dayName[128]; int day; int month; int year; for (unsigned int i = 0; i < children->length; ++i) { GumboNode *node1 = static_cast<GumboNode*>(children->data[i]); if (node1->v.element.tag == GUMBO_TAG_TR){ GumboAttribute *att = gumbo_get_attribute(&node1->v.element.attributes, "class"); if (att) { //printf("TR CLASS: %s\n", att->value); std::string value = att->value; if (value == "datarij") { std::string yearType = ""; GumboNode *startTimeNode = static_cast<GumboNode*>(node1->v.element.children.data[0]); std::string endDate = ""; if (endOffset != -1) { GumboNode *endTimeNode = static_cast<GumboNode*>(node1->v.element.children.data[endOffset]); endDate = GetTextFromElement(endTimeNode); } if (yearOffset != -1) { GumboNode *yearNode = static_cast<GumboNode*>(node1->v.element.children.data[yearOffset]); // optional if (yearNode) yearType = GetTextFromElement(yearNode); } std::string typeStr = ""; GumboNode *titleNode = static_cast<GumboNode*>(node1->v.element.children.data[titleOffset]); GumboNode *locationNode = static_cast<GumboNode*>(node1->v.element.children.data[locationOffset]); std::string teacher = ""; if (teacherOffset != -1) { GumboNode *teacherNode = static_cast<GumboNode*>(node1->v.element.children.data[teacherOffset]); teacher = GetTextFromElement(teacherNode); } //if(teacher == "" && t.departmentString() == "TEE" && t.cpath() == "stud"){ //printf("Teacher empty: %s - %s\n", GetTextFromElement(titleNode).c_str(), GetTextFromElement(startTimeNode).c_str()); //printf("Buffer:\n%s\n----\n", buffer.c_str()); //} //printf("%s\n", teacher.c_str()); if (typeOffset != -1) { GumboNode *typeNode = 
static_cast<GumboNode*>(node1->v.element.children.data[typeOffset]); typeStr = GetTextFromElement(typeNode); } GumboNode *commentsNode = static_cast<GumboNode*>(node1->v.element.children.data[commentOffset]); std::string startDate = GetTextFromElement(startTimeNode); int startHour; int startMinute; sscanf(startDate.c_str(), "%02d:%02d", &startHour, &startMinute); char newStartDate[128]; //YYYY-MM-DDTHH:MM:SS int n = sprintf(newStartDate, "%04d-%02d-%02dT%02d:%02d:00", year, month, day, startHour, startMinute); newStartDate[n] = '\0'; char newEndDate[128]; if (endDate != "") { int endHour; int endMinute; sscanf(endDate.c_str(), "%02d:%02d", &endHour, &endMinute); //YYYY-MM-DDTHH:MM:SS int ne = sprintf(newEndDate, "%04d-%02d-%02dT%02d:%02d:00", year, month, day, endHour, endMinute); newEndDate[ne] = '\0'; }else newEndDate[0] = '\0'; //printf("%s - %s - %s - %s - %s (YEAR: %s)\n", GetTextFromElement(titleNode).c_str(), newStartDate, newEndDate, GetTextFromElement(teacherNode).c_str(), GetTextFromElement(locationNode).c_str(), yearType.c_str()); int weekNr = getWeekNrFromDate(newStartDate); // Calculate time difference to remove old lessons time_t timeStampThisWeek = getTimeStampFromDate(newStartDate); time_t timeStampCurrentWeek; time(&timeStampCurrentWeek); //printf("Current week: %d - Lesson week: %d\n", getCurrentWeekNumber(), weekNr); double diff = difftime(timeStampThisWeek, timeStampCurrentWeek); double weeks = diff / 604800; //printf("Time difference: %.f\n", weeks); if (weeks > -2) { // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(GetTextFromElement(titleNode), GetTextFromElement(commentsNode), teacher, replaceAll(GetTextFromElement(locationNode), " ", ", "), newStartDate, newEndDate, weekNr, yearType, typeStr))); ++*totalLessons; } lessonAmount++; } else if (value == "weekheader") { yearOffset = -1; titleOffset = -1; locationOffset = -1; teacherOffset = -1; typeOffset = -1; commentOffset = -1; GumboVector* children = 
&node1->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { GumboNode *child = static_cast<GumboNode*>(children->data[i]); std::string text = GetTextFromElement(child); if (text == "jaar") yearOffset = i; else if (text == "activiteit") titleOffset = i; else if (text == "lokaal") locationOffset = i; else if (text == "docent(en)" || text == "klas(en)") teacherOffset = i; else if (text == "werkvorm") typeOffset = i; else if (text == "opmerkingen") commentOffset = i; else if (text == "eind") endOffset = i; } } } else { // no class GumboNode *td = static_cast<GumboNode*>(node1->v.element.children.data[0]); if (td->v.element.tag == GUMBO_TAG_TD) { GumboAttribute *classAtt = gumbo_get_attribute(&td->v.element.attributes, "class"); if (classAtt) { std::string dayRow = classAtt->value; if (dayRow == "dagrij") { GumboNode *dateNode = static_cast<GumboNode*>(td->v.element.children.data[0]); date = dateNode->v.text.text; // get date with sscanf sscanf(date.c_str(), "%s %02d-%02d-%04d", &dayName, &day, &month, &year); //printf("New day @ %s\n", date.c_str()); } } } } } } //printf("Lessons: %d\n", lessonAmount); gumbo_destroy_output(&kGumboDefaultOptions, output); curl_easy_cleanup(curl); classList->lock(); classList->push(t); classList->unlock(); return true; } else{ //printf("[%s] Fail: %s, aborting program", currentDateTime().c_str(), curl_easy_strerror(res)); curl_easy_cleanup(curl); //exit(1); return false; } /* New ICAL style as of semester starting at 01-09-2015 */ /* icalcomponent *rootNode = icalparser_parse_string(cstr); icalcomponent *comp = icalcomponent_get_first_component(rootNode, ICAL_VEVENT_COMPONENT); //icalcomponent *zoneComp = icalcomponent_get_first_component(rootNode, ICAL_VTIMEZONE_COMPONENT); //icaltimezone *zone = icaltimezone_get_builtin_timezone("Europe/Amsterdam"); //icalcomponent *next = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT); while (comp != NULL){ //printf("%s\n", icalcomponent_as_ical_string(comp)); 
std::string summary = icalcomponent_get_summary(comp); icaltimetype dtstart = icalcomponent_get_dtstart(comp); std::string starttime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtstart))); icaltimetype dtend = icalcomponent_get_dtend(comp); std::string endtime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtend))); const char *locationStr = icalcomponent_get_location(comp); // can be null std::string location = ""; if (locationStr){ location = locationStr; location = replaceAll(location, " ", ", "); } std::string commentStr = (char *)icalcomponent_get_comment(comp); printf("Comment: %s\n", commentStr.c_str()); char *comment = (char *)commentStr.c_str(); char *line = strtok(comment, "\n"); std::string docenten; while (line != NULL){ if (strstr(line, "Docent(en): ")){ line += 12; // 12 is length of Docent(en) int length = strlen(line); docenten = line; } line = strtok(NULL, "\n"); } int weekNr = getWeekNrFromDate(starttime); //printf("%s - %s\n", summary.c_str(), starttime.c_str()); if (weekNr > getCurrentWeekNumber() - 2) { // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(summary, summary, docenten, location, starttime, endtime, weekNr))); printf("1Add Lesson(%s): %s - %s - %s - %s(%s)\n", t.className().c_str(), summary.c_str(), starttime.c_str(), endtime.c_str(), docenten.c_str(), commentStr.c_str()); } icalcomponent_free(comp); comp = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT); } icalcomponent_free(comp); icalcomponent_free(rootNode); classList->lock(); classList->push(t); classList->unlock(); */ // delete our garbage //delete[] refererUrlBuffer; //delete[] urlBuffer; //delete[] cstr; //curl_free(classIdStringEscaped); //curl_easy_cleanup(curl); //return true; /* Old XML Style xml_document<> doc; doc.parse<0>(cstr); xml_node<> *pRoot = doc.first_node(); if (pRoot == 0) { std::cout << "doFetchLesson() ERROR: Invalid rootnode" << std::endl; exit(1); // Immediately abort program as the 
document is unreadable } else if (pRoot != NULL) { pRoot = pRoot->first_node(); if (pRoot == 0) { std::cout << "doFetchLesson() ERROR: Rootnode has an invalid first node" << std::endl; exit(1); // Immediately abort program as the document is unreadable } } for (xml_node<> *pNode = pRoot->first_node("item"); pNode; pNode = pNode->next_sibling()) { std::string title = pNode->first_node("title") ? pNode->first_node("title")->value() : ""; if (title.length() > 2) title = title.substr(title.find(": ") + 2, title.length()); std::string description = pNode->first_node("description") ? pNode->first_node("description")->value() : ""; std::string teacher = getStringBetween(" - ", " -", description, "([a-zA-Z,. ]+)"); std::string location = pNode->first_node("ev:location") ? pNode->first_node("ev:location")->value() : ""; location = trim(location); char *locationDecoded = curl_easy_unescape(curl, location.c_str(), 0, NULL); location = std::string(locationDecoded); curl_free(locationDecoded); std::string startdate = pNode->first_node("ev:startdate") ? pNode->first_node("ev:startdate")->value() : ""; std::string enddate = pNode->first_node("ev:enddate") ? pNode->first_node("ev:enddate")->value() : ""; int weekNr = getWeekNrFromDate(startdate); if (weekNr > getCurrentWeekNumber() - 2) // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(title, description, teacher, location, startdate, enddate, weekNr))); } // ready to push back the class classList->lock(); classList->push_back(t); classList->unlock(); // delete our garbage delete[] refererUrlBuffer; delete[] urlBuffer; delete[] cstr; curl_free(classIdStringEscaped); curl_easy_cleanup(curl); return true;*/ }
bool ParserUtils::hasAttribute(const GumboNode* node, const string& attributeName) { assert(node != nullptr); return gumbo_get_attribute(&node->v.element.attributes, attributeName.c_str()) != nullptr; }
/// Builds the t2Div hierarchy for the node `b` and registers it with
/// t2DivController.
///
/// A t2Element is created for `b`; its div is registered under the id returned
/// by getRootDivGlobalID(), made the controller's root and sized to the window.
/// Elements are then processed in queue order: for every <div> element the
/// class name and leading text are copied into its t2Div and the div is
/// registered; every direct <div> child gets its own t2Element, is linked to
/// its parent (`child` / `parent`) and to its previous sibling (`next`), and is
/// queued for processing.
///
/// @param b Node to start from; its children vector is read, so it is expected
///          to be an element (presumably <body> — confirm against the caller).
void t2HTMLParser::t2LabelParser::body(GumboNode *b)
{
    static t3Queue<t2Element*> queue;

    // NOTE(review): t2Element objects are allocated with new and never deleted
    // in this function; whether anything else owns them is not visible here.
    t2Element* element = new t2Element(b);

    // Note: body gets a hidden div underneath it whose width and height match
    // the window.
    string rootID = getRootDivGlobalID();
    t2DivController::getInstance()->addDiv(rootID, element->div);
    t2DivController::getInstance()->setRoot(rootID);

    element->div->normal.width = t2GetWindowWidth();
    element->div->normal.height = t2GetWindowHeight();

    element->div->hover = element->div->normal;
    element->div->active = element->div->normal;

    queue.push(element);

    while(!queue.isEmpty())
    {
        t2Element* e = queue.pop();
        if(!e)
            break;

        GumboNode* child = e->node;

        // find div
        if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_DIV)
        {
            t2Div *div = e->div;

            // find className; gumbo_get_attribute() returns NULL for a <div>
            // without a class attribute, in which case className is left as is
            GumboAttribute *className = gumbo_get_attribute(&child->v.element.attributes, "class");
            if(className)
                div->className = className->value;

            // text: only inspect the first child when the <div> has one
            if(child->v.element.children.length > 0)
            {
                GumboNode *content = static_cast<GumboNode*>(child->v.element.children.data[0]);
                if(content->type == GUMBO_NODE_TEXT || content->type == GUMBO_NODE_WHITESPACE)
                {
                    div->normal.text = content->v.text.text;
                    div->hover.text = content->v.text.text;
                    div->active.text = content->v.text.text;

                    div->normalCondition.text = content->v.text.text;
                    div->hoverCondition.text = content->v.text.text;
                    div->activeCondition.text = content->v.text.text;
                }
            }

            // find id
            // not supported
            // NOTE(review): when an id attribute exists its value overwrites
            // className and the div is registered under an empty id string;
            // behaviour kept as is — confirm whether `id = idName->value` was
            // intended.
            string id;
            GumboAttribute *idName = gumbo_get_attribute(&child->v.element.attributes, "id");
            if(idName)
                div->className = idName->value;
            else
                id = getDivGlobalID();

            // register the new div
            t2DivController::getInstance()->addDiv(id, div);
        }

        t2Div *brother = NULL;

        // enqueue every direct <div> child (they are siblings of each other)
        for(unsigned int i = 0; i < child->v.element.children.length; i++)
        {
            GumboNode *c = static_cast<GumboNode*>(child->v.element.children.data[i]);

            if(c->type == GUMBO_NODE_ELEMENT && c->v.element.tag == GUMBO_TAG_DIV)
            {
                // mirror the n-ary tree
                t2Element* childElement = new t2Element(c);

                // every child points back at its parent
                childElement->div->parent = e->div;

                if(!brother)
                {
                    // first sibling: it becomes the parent's child link
                    e->div->child = childElement->div;
                }
                else
                {
                    // later sibling: chain it after the previous one
                    brother->next = childElement->div;
                }

                // remember the most recent sibling
                brother = childElement->div;

                queue.push(childElement);
            }
        }
    }
}
// TODO: case-insensitive comparison static void searchForDivBlocks(GumboNode* node) { if (node->type != GUMBO_NODE_ELEMENT) return; // <div class="forum-post-entry"> GumboAttribute* classAttr = NULL; if ((node->v.element.tag == GUMBO_TAG_DIV) && (classAttr = gumbo_get_attribute(&node->v.element.attributes, "class"))) { std::string classAttrStr(classAttr->value); if (classAttrStr == "forum-post-entry" ) { // <div class="forum-post-text" id="message_text_4453758"> GumboNode* fpeNode = node; ForumPageParser fpp; fpp.parseForumPostEntry(fpeNode); for (unsigned int i = 0; i < fpeNode->v.element.children.length; ++i) { #if 0 GumboNode* fpeChildNode = static_cast< GumboNode* >(fpeNode->v.element.children.data[i]); if (fpeChildNode->type != GUMBO_NODE_ELEMENT) continue; assert(fpeChildNode->v.element.tag == GUMBO_TAG_DIV); if (fpeChildNode->v.element.tag != GUMBO_TAG_DIV) continue; std::cout << " FPE children tags:" << gumbo_normalized_tagname(fpeChildNode->v.element.tag) << std::endl; for (int iAttr = 0; iAttr < fpeChildNode->v.element.attributes.length;++iAttr) { GumboAttribute* attr = static_cast< GumboAttribute* >( fpeChildNode->v.element.attributes.data[iAttr] ); assert(attr != NULL); std::cout << "ATTR: " << attr->name << " = " << attr->value << std::endl; } if (fpeChildNode->v.element.tag == GUMBO_TAG_DIV) { GumboAttribute* fpeChildNodeClassAttr = gumbo_get_attribute(&fpeChildNode->v.element.attributes, "class"); assert(fpeChildNodeClassAttr != NULL); std::string fpeChildNodeClassAttrStr(fpeChildNodeClassAttr->value); if (fpeChildNodeClassAttrStr == "forum-post-text") { GumboNode* fptNode = fpeChildNode; assert(fptNode->type == GUMBO_NODE_ELEMENT); // qDebug() << " forum-post-text children count: " << fptNode->v.element.children.length; GumboAttribute* fptNodeIdAttr = gumbo_get_attribute(&fptNode->v.element.attributes, "id"); assert(fptNodeIdAttr != NULL); //std::cout << " Forum post found: " << fptNodeIdAttr->value << std::endl; // FIXME: remove after debugging if 
(std::strcmp(fptNodeIdAttr->value, "message_text_4453758") != 0) continue; GumboVector* fptNodeChildren = &fptNode->v.element.children; for (unsigned int j = 0; j < fptNodeChildren->length; ++j) { GumboNode* fptNodeChild = static_cast< GumboNode* >(fptNodeChildren->data[j]); std::cout << "Element type: " << gumboElementTypeToString( fptNodeChild->type ) << std::endl; if (fptNodeChild->type == GUMBO_NODE_TEXT) { /*std::locale loc(std::locale(), new std::codecvt_utf8<char>); std::cout.imbue(loc);*/ std::string TEST(fptNodeChild->v.text.text); WCHAR wBuf[1024] = { 0 }; int res2 = MultiByteToWideChar(CP_UTF8, 0, fptNodeChild->v.text.text, -1, wBuf, 1024); char mbcsBuf[1024] = { 0 }; int res3 = WideCharToMultiByte(1251, 0, wBuf, 1024, mbcsBuf, 1024, NULL, NULL); std::string TEST_2(mbcsBuf); std::cout << "Element text: " << TEST_2 << std::endl; break; } else if (fptNodeChild->type == GUMBO_NODE_ELEMENT) { std::cout << " " << gumbo_normalized_tagname(fptNodeChild->v.element.tag) << std::endl; //qDebug() << " " << gumbo_tag_from_original_text( &fptNodeChild->v.element.original_tag ); } } } } #endif } std::cout << "----------------------------" << std::endl; } } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { searchForDivBlocks(static_cast< GumboNode* >(children->data[i])); } }
/**
 * Recursively walks the subtree rooted at `node` and appends matching
 * attribute values to `listUrlInfoSong`.
 *
 * For an element whose tag equals `tag->htmlTag`:
 *  - if it has the attribute named `tag->attr`, that attribute is tested
 *    (value equals `valAtributo`, and either its name equals `atributo` or
 *    `valAtributo` is empty);
 *  - otherwise every attribute is tested (name equals `atributo`, and either
 *    its value equals `valAtributo` or `valAtributo` is empty).
 * On a match, the attribute value is appended to `listUrlInfoSong` when the
 * attribute name equals `atributoElem2`, and, if the element has children, the
 * search continues in its FIRST child only, looking for `tag2` /
 * `atributoElem2` with an empty value filter; that result is returned.
 * In every other case all children are searched with the original arguments.
 *
 * NOTE(review): only the first child of a matching element is searched (the
 * original loop returned on its first iteration); if that child is e.g. a
 * whitespace text node nothing below the match is visited. Behaviour is kept
 * as is — confirm whether all children were meant to be searched.
 * NOTE(review): every return in this function is either NULL or the result of
 * a recursive call, so no path visible here yields a non-null pointer; the
 * observable effect is the appends to `listUrlInfoSong`.
 *
 * @param node           Node to start from; non-element nodes return NULL.
 * @param tag            Tag (and attribute name) that selects candidate elements.
 * @param atributo       Attribute name a candidate must carry.
 * @param valAtributo    Attribute value to match; see the rules above for empty.
 * @param tag2           Tag description used for the search below a match.
 * @param atributoElem2  Attribute name whose value is collected.
 * @return The result of the nested search (see the note above), or NULL.
 */
GumboNode* HtmlParser::search_for_elem2(GumboNode* node, Tags *tag, string atributo, string valAtributo, Tags *tag2, string atributoElem2)
{
    if (node->type != GUMBO_NODE_ELEMENT) {
        return NULL;
    }

    GumboVector* children = &node->v.element.children;

    if (node->v.element.tag == tag->htmlTag) {
        GumboAttribute* attribute = gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());

        if (attribute != NULL) {
            // The element carries the attribute named by `tag`: test only that one.
            if (valAtributo == attribute->value
                && (atributo == attribute->name || valAtributo.empty())) {
                if (atributoElem2 == attribute->name) {
                    listUrlInfoSong.push_back(attribute->value);
                }
                // Continue below the match in the first child only (see NOTE above).
                if (children->length > 0) {
                    return search_for_elem2(static_cast<GumboNode*>(children->data[0]),
                                            tag2, atributoElem2, "", tag2, atributoElem2);
                }
            }
        } else {
            // The attribute named by `tag` is absent: test every attribute of the element.
            for (unsigned int i = 0; i < node->v.element.attributes.length; i++) {
                attribute = static_cast<GumboAttribute*>(node->v.element.attributes.data[i]);
                if (atributo == attribute->name
                    && (valAtributo == attribute->value || valAtributo.empty())) {
                    if (atributoElem2 == attribute->name) {
                        listUrlInfoSong.push_back(attribute->value);
                    }
                    // Continue below the match in the first child only (see NOTE above).
                    if (children->length > 0) {
                        return search_for_elem2(static_cast<GumboNode*>(children->data[0]),
                                                tag2, atributoElem2, "", tag2, atributoElem2);
                    }
                }
            }
        }
    }

    // No early return above: search every child with the original arguments.
    for (unsigned int i = 0; i < children->length; ++i) {
        GumboNode* ret = search_for_elem2(static_cast<GumboNode*>(children->data[i]),
                                          tag, atributo, valAtributo, tag2, atributoElem2);
        if (ret != NULL) {
            return ret;
        }
    }
    return NULL;
}