//
// initialize
//
// Prepares the HTML parser for a new document.  After running the base
// class initialization, this:
//   * resets the url and base_url buffers,
//   * scans the tokenized document's metadata for a "dochdr" entry and
//     takes its first whitespace-delimited token as the document URL,
//   * derives base_url by cutting url at its last '/',
//   * passes both buffers through normalizeURL,
//   * looks up the tag definitions used for link handling,
//   * appends a "url" metadata pair to the parsed document if no "url"
//     key was seen during the scan,
//   * resets and re-grows _urlBuffer.
//
// tokenized -- source of the metadata that is scanned
// parsed    -- receives the "url" metadata pair when one is added
//
void indri::parse::HTMLParser::initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ) {
  indri::parse::TaggedTextParser::initialize( tokenized, parsed );

  // start from empty URL buffers
  url[0] = 0;
  base_url[0] = 0;

  bool urlKeySeen = false;

  // walk the metadata looking for the DOCHDR entry that carries the URL
  for( size_t index = 0; index < tokenized->metadata.size(); ++index ) {
    const char* key = tokenized->metadata[index].key;

    if( strcmp( key, "url" ) == 0 )
      urlKeySeen = true;

    if( strcmp( key, "dochdr" ) != 0 )
      continue;

    // the URL is everything up to the first space, tab, CR or LF
    const char* headerText = (const char*) tokenized->metadata[index].value;
    size_t tokenLength = strcspn( headerText, " \t\r\n" );

    // clamp so that the terminator always fits inside url
    int length = lemur_compat::min<int>( tokenLength, sizeof url-1 );
    memcpy( url, headerText, length );
    url[length] = 0;

    // NOTE(review): the copy bound is sizeof url, not sizeof base_url, so
    // this assumes base_url is at least as large as url -- confirm against
    // the class declaration.
    strncpy( base_url, url, sizeof url-1 );
    base_url[length] = 0;

    // base_url is url with the last '/' and everything after it removed
    char* finalSlash = strrchr( base_url, '/' );
    if( finalSlash )
      *finalSlash = 0;

    // Only the first dochdr entry is used.  Leaving here also ends the
    // "url" key check, so a "url" entry located after the dochdr entry
    // is not noticed.
    break;
  }

  // normalize both buffers
  normalizeURL( url );
  normalizeURL( base_url );

  // look up the tag definitions used for links
  _absoluteUrlTag = _findTag( "absolute-url" );
  _relativeUrlTag = _findTag( "relative-url" );
  _anchorTag = _findTag( "a" );

  // publish the URL as metadata unless a "url" key was already seen
  if( !urlKeySeen ) {
    indri::parse::MetadataPair urlPair;
    urlPair.key = "url";
    urlPair.value = url;                            // points at the member buffer, not a copy
    urlPair.valueLength = (int)strlen(url)+1;       // length includes the terminating 0
    parsed->metadata.push_back( urlPair );
  }

  _urlBuffer.clear();
  // A fixed size is requested here; the alternative that was left
  // disabled sized the buffer from the document text instead:
  //   _urlBuffer.grow( parsed->textLength * 4 ); // will this be large enough?
  _urlBuffer.grow( 1024 * 1024 * 25 );
}
//
// _buildTag
//
// Returns the tag_properties entry for <name>.  If _findTag already knows
// the name, that entry is returned as is.  Otherwise a new entry is
// allocated with all four flags (index, exclude, include, metadata) set
// to false, registered in _tagTable under its name, and returned.
//
// The struct and its name string share a single malloc block: the
// characters of the name are stored immediately after the struct, and
// the entry's name pointer refers to them.
//
indri::parse::TaggedTextParser::tag_properties* indri::parse::TaggedTextParser::_buildTag( std::string name ) {
  // reuse an existing definition when there is one
  tag_properties* existing = _findTag( name.c_str() );
  if( existing )
    return existing;

  // one allocation: the struct followed by the name and its terminator
  // NOTE(review): the malloc result is used without a null check.
  const size_t blockSize = sizeof(tag_properties) + name.length() + 1;
  tag_properties* entry = static_cast<tag_properties*>( malloc( blockSize ) );

  // the name lives in the bytes directly behind the struct
  char* nameStorage = reinterpret_cast<char*>( entry + 1 );
  strcpy( nameStorage, name.c_str() );
  entry->name = nameStorage;

  // a fresh tag starts with every flag switched off
  entry->index = false;
  entry->exclude = false;
  entry->include = false;
  entry->metadata = false;

  _tagTable.insert( entry->name, entry );
  return entry;
}
//
// handleTag
//
// Processes one tag event.  At this point the element name, its
// attributes, and whether it is an open or a close tag are all known.
//
// The tag name is conflated first, and the conflated name is used to
// look up the tag_properties (properties are defined per tag name, not
// per attribute-value pattern).  Tags without properties are ignored.
// Otherwise the include/exclude region state is updated, and the tag is
// forwarded to the index and/or metadata tag handlers as its properties
// require.
//
void indri::parse::TaggedTextParser::handleTag( TagEvent* te ) {
  const bool closing = !te->open_tag;

  // keep the name as it appeared before conflation rewrites the event
  const char* rawName = te->name;
  _p_conflater->conflate( te );

  const tag_properties* props = _findTag( te->name );
  const bool wasIncluded = _include;

  // nothing is configured for this tag
  if( !props )
    return;

  // --- update the include/exclude region state ---
  if( closing ) {
    if( _exclude ) {
      // only the tag that opened the exclude region can close it
      if( props == _startExcludeRegion ) {
        _startExcludeRegion = 0;
        _exclude = false;
      }
    } else if( _include && props == _startIncludeRegion ) {
      // this close tag matches the tag that opened the include region
      _startIncludeRegion = 0;
      _include = false;
    }
  } else if( !_exclude ) {
    // open tags are ignored entirely while inside an exclude region
    if( _include ) {
      // inside included territory: an exclude tag may start an exclude region
      if( props->exclude && _startExcludeRegion == 0 ) {
        _startExcludeRegion = props;
        _exclude = true;
      }
    } else if( props->include && _startIncludeRegion == 0 ) {
      // outside included territory: an include tag starts an include region
      _startIncludeRegion = props;
      _include = true;
    }
  }

  // --- index the tag if necessary ---
  // wasIncluded is honoured because this tag may be the one that just
  // closed the include region.  The "document" tag is always indexed;
  // its lookup happens only when the first test fails.
  bool indexThisTag = props->index && !_exclude && ( _include || wasIncluded );
  if( !indexThisTag )
    indexThisTag = ( props == _findTag("document") );

  if( indexThisTag ) {
    if( closing )
      endTag( rawName, props->name, te->pos );
    else
      addTag( rawName, props->name, te->pos );
  }

  // --- record metadata if necessary ---
  if( props->metadata ) {
    if( closing ) {
      // te->begin is the byte offset of the '<' of the end tag, i.e. the
      // end of the enclosed region
      endMetadataTag( rawName, props->name, te->begin );
    } else {
      // te->end is the byte offset of the '>' of the begin tag, i.e. the
      // beginning of the enclosed region
      addMetadataTag( rawName, props->name, te->end );
    }
  }
}