void LinkExtractor::safeHandleStartTag(const std::string& tag_name, attr_list_t& attrs, bool empty_element_tag) { LinkExtractor::link_tag_map_t::const_iterator _i; std::string _attr; std::string name = tag_name; to_lower(name); // Base should be treat separately if ((name == "base") and (attrs.find("href") != attrs.end()) ) { this->base = attrs["href"].str(); } else if ( name == "meta" ){ this->handleMetaTag(tag_name, attrs); } // Extract Links // - Is this a tag link? // - If so, is the corresponding attribute of this tag present? _i = this->LINK_TAGS.find(name); if ( (_i != this->LINK_TAGS.end()) and ( attrs.find( _attr = _i->second ) != attrs.end() )) { this->links.insert( attrs[_attr].str() ); } }
void FindEncParser::handleProcessingInstruction(const std::string& name, attr_list_t& attrs) { if ( name == "xml" and attrs.find("encoding") != attrs.end()) { this->enc = attrs["encoding"].str(); this->enc = strip(this->enc); throw CharsetDetectedException("Found in a Processing Instruction"); } }
void FindEncParser::safeHandleStartTag(const std::string& name, attr_list_t& attrs, bool empty_element_tag) { std::string val; if ( name == "meta" and attrs.find("http-equiv") != attrs.end() ) { val = attrs["http-equiv"].str(); to_lower(val); if (val == "content-type" and attrs.find("content") != attrs.end() ) { val = attrs["content"].str(); to_lower(val); std::string charset = get_charset_from_content_type(val); if (not charset.empty()) { this->enc = strip(charset); throw CharsetDetectedException("Found in a Meta Tag"); } } } }
void LinkExtractor::handleMetaTag( const std::string& tag_name, attr_list_t& attrs) { std::string value; std::string content; attr_list_t::const_iterator name_attr = attrs.find("name"); if (name_attr != attrs.end()){ value = name_attr->second.str(); to_lower(value); if (value == "robots"){ if ( attrs.count("content") ) { content = attrs["content"].str(); handleRobotsMetaContent(content); } } } }