void _mergeData() { // fill in tags from the document vector fields // unnecessary to filter out any fields to remove, as they will // be ignored when indexing. indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields() ; tl.clear(); for (size_t i = 0; i < fields.size(); i++) { const indri::index::FieldExtent& field = fields[i]; std::string fieldName = _index->field(fields[i].id);; tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin); tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end); } //stuff it into the parsed doc tl.writeTagList(parsed->tags); // fill in terms from the document text so that they will stop/stem // correctly when added to the new repository. // Potentially issues with url injection here... // probably best not to do this with trecweb/html docs.... // TODO: test this termBuffer.clear(); termBuffer.grow( parsed->textLength * 2 ); for (size_t i = 0; i < parsed->positions.size(); i++ ) { int start = parsed->positions[i].begin; int end = parsed->positions[i].end; int token_len = end - start; const char *token = parsed->text + start; char* write_loc = termBuffer.write( token_len + 1 ); strncpy( write_loc, token, token_len ); write_loc[token_len] = '\0'; parsed->terms.push_back( write_loc ); } }
void indri::infnet::ShrinkageBeliefNode::indexChanged( indri::index::Index& index ) {
  // The underlying index changed, so the cached field-id -> rule mapping
  // is stale. Rebuild it by resolving each rule's field name against the
  // new index.
  _ruleMap.clear();
  for( std::set<smoothing_rule, lt_rule>::iterator iter = _ruleSet.begin();
       iter != _ruleSet.end();
       ++iter ) {
    int fieldID = index.field( iter->fieldName );
    _ruleMap[ fieldID ] = *iter;
  }
}