示例#1
0
//
// initialize
//
// Prepares the HTML parser for a new document.  After delegating to
// TaggedTextParser::initialize, this:
//   - resets url and base_url to empty strings,
//   - looks for a "dochdr" metadata entry and copies its leading run of
//     non-whitespace characters into url; base_url becomes the same text
//     cut off at its last '/',
//   - passes both buffers through normalizeURL,
//   - caches the tag definitions for "absolute-url", "relative-url" and "a",
//   - appends a "url" metadata pair to parsed unless the tokenized document
//     already carried one (only entries up to and including the first
//     "dochdr" are examined, because the scan stops there),
//   - clears _urlBuffer and grows it to a fixed size.
//
// tokenized: document whose metadata is scanned.
// parsed:    document that receives the extra "url" metadata pair.
//
void indri::parse::HTMLParser::initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ) {
  indri::parse::TaggedTextParser::initialize( tokenized, parsed );

  // start from empty URL strings
  base_url[0] = 0;
  url[0] = 0;

  bool have_URL = false;

  // walk the metadata until the DOCHDR entry, which holds the URL
  for( size_t entry = 0; entry < tokenized->metadata.size(); entry++ ) {
    const char* key = tokenized->metadata[entry].key;

    if( strcmp( key, "url" ) == 0 )
      have_URL = true;

    if( strcmp( key, "dochdr" ) != 0 )
      continue;

    // the URL is everything up to the first blank, tab, CR or LF
    const char* header = (const char*) tokenized->metadata[entry].value;
    size_t span = strcspn( header, " \t\r\n" );
    int length = lemur_compat::min<int>( span, sizeof url-1 );

    memcpy( url, header, length );
    url[length] = 0;

    // NOTE(review): the bound here is sizeof url, not sizeof base_url;
    // this is only safe if both buffers have the same size -- confirm
    // against the class declaration.
    strncpy( base_url, url, sizeof url-1 );
    base_url[length] = 0;

    // base_url is the URL with everything from the last '/' removed
    char* cut = strrchr( base_url, '/' );
    if( cut != 0 )
      *cut = 0;

    // only the first DOCHDR entry is used
    break;
  }

  // normalize the document URL, then the base URL
  normalizeURL( url );
  normalizeURL( base_url );

  // look up the tag definitions used by this parser
  _absoluteUrlTag = _findTag( "absolute-url" );
  _relativeUrlTag = _findTag( "relative-url" );
  _anchorTag = _findTag( "a" );

  // publish the URL as metadata when the input did not already supply one
  if( have_URL == false ) {
    indri::parse::MetadataPair urlPair;
    urlPair.key = "url";
    urlPair.value = url;
    // length counts the terminating NUL
    urlPair.valueLength = (int)strlen(url)+1;
    parsed->metadata.push_back( urlPair );
  }

  // reset the URL buffer and grow it to a fixed size
  // (a size derived from parsed->textLength was considered and left disabled)
  _urlBuffer.clear();
  _urlBuffer.grow( 1024 * 1024 * 25 );
}
示例#2
0
//
// _buildTag
//
// Returns the tag_properties record for the given tag name.  If _findTag
// already knows the name, that record is returned unchanged.  Otherwise a
// new record is allocated with every flag (index, exclude, include,
// metadata) set to false, registered in _tagTable under its name, and
// returned.
//
// The record and its name string live in a single malloc'd allocation: the
// name bytes are stored immediately after the struct.
//
indri::parse::TaggedTextParser::tag_properties* indri::parse::TaggedTextParser::_buildTag( std::string name ) {
  // reuse the existing definition when there is one
  tag_properties* known = _findTag( name.c_str() );
  if( known != 0 )
    return known;

  // one allocation holds the struct followed by the NUL-terminated name
  // NOTE(review): the malloc result is not checked before it is used.
  tag_properties* created = (tag_properties*) malloc( sizeof(tag_properties) + name.length() + 1 );

  char* nameStorage = (char *) created + sizeof(tag_properties);
  strcpy( nameStorage, name.c_str() );
  created->name = nameStorage;

  // a freshly built tag starts with every behavior switched off
  created->metadata = false;
  created->include = false;
  created->exclude = false;
  created->index = false;

  _tagTable.insert( created->name, created );
  return created;
}
示例#3
0
//
// handleTag
//
// Processes one open or close tag event.  The event is first conflated;
// the conflated name is then used to look up tag_properties, because
// properties are keyed by tag name only (not by attribute-value pattern).
// Tags with no properties are ignored.
//
// For a known tag this:
//   1. updates the include/exclude region state (_include, _exclude,
//      _startIncludeRegion, _startExcludeRegion),
//   2. records the tag via addTag/endTag when it is indexable and currently
//      visible, or when it is the "document" tag,
//   3. records it via addMetadataTag/endMetadataTag when it is flagged as
//      metadata.
//
// The name passed first to the add/end calls is the tag's name as it was
// before conflation.
//
void indri::parse::TaggedTextParser::handleTag( TagEvent* te ) {
  const bool isClosing = ! te->open_tag;

  // remember the pre-conflation name, then conflate
  const char* rawName = te->name;
  _p_conflater->conflate( te );

  const tag_properties* props = _findTag( te->name );

  // snapshot taken before the region state is updated below
  const bool wasIncluding = _include;

  // nothing to do for tags without a definition
  if( props == 0 )
    return;

  // --- update include / exclude region state ---
  if( isClosing ) {
    if( _exclude ) {
      // a close tag ends the exclude region only if it matches its opener
      if( props == _startExcludeRegion ) {
        _exclude = false;
        _startExcludeRegion = 0;
      }
    } else if( _include && props == _startIncludeRegion ) {
      // close tag matching the opener of the include region
      _include = false;
      _startIncludeRegion = 0;
    }
  } else if( ! _exclude ) {
    // open tags are ignored entirely while inside an exclude region
    if( _include ) {
      // inside included territory: an exclude tag may start an exclude region
      if( props->exclude && _startExcludeRegion == 0 ) {
        _exclude = true;
        _startExcludeRegion = props;
      }
    } else {
      // outside included territory: an include tag may start an include region
      if( props->include && _startIncludeRegion == 0 ) {
        _include = true;
        _startIncludeRegion = props;
      }
    }
  }

  // --- index the tag if necessary ---
  // wasIncluding is consulted so that the tag closing an include region is
  // still indexed; the "document" tag is always indexed.
  const bool shouldIndex =
    ( props->index && ! _exclude && ( _include || wasIncluding ) ) ||
    props == _findTag("document");

  if( shouldIndex ) {
    if( isClosing )
      endTag( rawName, props->name, te->pos );
    else
      addTag( rawName, props->name, te->pos );
  }

  // --- index metadata if necessary ---
  if( props->metadata ) {
    if( isClosing ) {
      // te->begin is the byte offset of the beginning of the end tag
      // ('<'), i.e. the end of the enclosed region.
      endMetadataTag( rawName, props->name, te->begin );
    } else {
      // te->end is the byte offset of the end of the begin tag ('>'),
      // i.e. the beginning of the enclosed region.
      addMetadataTag( rawName, props->name, te->end );
    }
  }
}