Ejemplo n.º 1
0
  void _mergeData() {
    // fill in tags from the document vector fields
    // unnecessary to filter out any fields to remove, as they will
    // be ignored when indexing.
    indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields() ;
    tl.clear();
    for (size_t i = 0; i < fields.size(); i++) {
      const indri::index::FieldExtent& field = fields[i];
      std::string fieldName = _index->field(fields[i].id);;
    
      tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin);
      tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end);
    }
    //stuff it into the parsed doc
    tl.writeTagList(parsed->tags);

    // fill in terms from the document text so that they will stop/stem
    // correctly when added to the new repository.
    // Potentially issues with url injection here...
    // probably best not to do this with trecweb/html docs....
    // TODO: test this
    termBuffer.clear();
    termBuffer.grow( parsed->textLength * 2 );

    for (size_t i = 0; i < parsed->positions.size(); i++ ) {
      int start = parsed->positions[i].begin;
      int end = parsed->positions[i].end;
      int token_len = end - start;
      const char *token = parsed->text + start;
      char* write_loc = termBuffer.write( token_len + 1 );
      strncpy( write_loc, token, token_len );
      write_loc[token_len] = '\0';
      parsed->terms.push_back( write_loc );
    }
  }
Ejemplo n.º 2
0
//
// indexChanged
//
// Rebuilds _ruleMap against the supplied index: the map is emptied, then
// every rule in _ruleSet is stored under the value index.field() returns
// for that rule's fieldName.
//
void indri::infnet::ShrinkageBeliefNode::indexChanged( indri::index::Index& index ) {
  _ruleMap.clear();

  std::set<smoothing_rule, lt_rule>::iterator rule;
  for( rule = _ruleSet.begin(); rule != _ruleSet.end(); ++rule ) {
    // NOTE(review): if two rules resolve to the same field id, the later
    // one in set order presumably replaces the earlier -- confirm that
    // _ruleMap is an associative container.
    int fieldID = index.field( rule->fieldName );
    _ruleMap[ fieldID ] = *rule;
  }
}