Exemple #1
0
void LinkExtractor::safeHandleStartTag(const std::string& tag_name,
		attr_list_t& attrs, bool empty_element_tag)
{
	LinkExtractor::link_tag_map_t::const_iterator _i;
	std::string _attr;

	std::string name = tag_name;
	to_lower(name);
	// Base should be treat separately
	if ((name == "base") and (attrs.find("href") != attrs.end()) ) {
		this->base = attrs["href"].str();
	} else if ( name == "meta" ){
		this->handleMetaTag(tag_name, attrs);
	}
	// Extract Links
	//  - Is this a tag link?
	//  - If so, is the corresponding attribute of this tag present?
	_i = this->LINK_TAGS.find(name);
	if ( (_i != this->LINK_TAGS.end()) and 
	     ( attrs.find( _attr = _i->second ) != attrs.end() ))
	{
		this->links.insert( attrs[_attr].str() );
	}


}
void FindEncParser::handleProcessingInstruction(const std::string& name,
		attr_list_t& attrs)
{
	if ( name == "xml" and attrs.find("encoding") != attrs.end()) {
		this->enc = attrs["encoding"].str();
		this->enc = strip(this->enc);
		throw CharsetDetectedException("Found in a Processing Instruction");
	}

}
void FindEncParser::safeHandleStartTag(const std::string& name,
	attr_list_t& attrs, bool empty_element_tag)
{
	std::string val;

	if ( name == "meta" and attrs.find("http-equiv") != attrs.end() ) {
		val = attrs["http-equiv"].str();
		to_lower(val);
		if (val == "content-type" and attrs.find("content") != attrs.end() ) {
			val = attrs["content"].str();
			to_lower(val);
			std::string charset = get_charset_from_content_type(val);
			if (not charset.empty()) {
				this->enc = strip(charset);
				throw CharsetDetectedException("Found in a Meta Tag");
			}
		}
	}
}
Exemple #4
0
void LinkExtractor::handleMetaTag( const std::string& tag_name,
attr_list_t& attrs)
{
	std::string value;
	std::string content;

	attr_list_t::const_iterator name_attr = attrs.find("name");
	

	if (name_attr != attrs.end()){
		value = name_attr->second.str();
		to_lower(value);
		if (value == "robots"){
			if ( attrs.count("content") ) {
				content = attrs["content"].str();
				handleRobotsMetaContent(content);
			}
		}
	}
}