Example #1
void indri::infnet::InferenceNetwork::_indexChanged( indri::index::Index& index ) {
  _closeIterators.clear();
  _closeIteratorBound = -1;

  // doc iterators
  for( size_t i=0; i<_termNames.size(); i++ ) {
    indri::index::DocListIterator* iterator = index.docListIterator( _termNames[i] );
    if( iterator )
      iterator->startIteration();

    _docIterators.push_back( iterator );
  }

  // field iterators
  for( size_t i=0; i<_fieldNames.size(); i++ ) {
    indri::index::DocExtentListIterator* iterator = index.fieldListIterator( _fieldNames[i] );
    if( iterator )
      iterator->startIteration();

    _fieldIterators.push_back( iterator );
  }
  
  // prior iterators
  for( size_t i=0; i<_priorNames.size(); i++ ) {
    // TODO: this is wasteful: the prior is associated with the whole collection,
    // so there's no need to fetch it for each index, but it's easier to code it like this for now
    indri::collection::PriorListIterator* iterator = _repository.priorListIterator( _priorNames[i] );
    if( iterator )
      iterator->startIteration();
    else {
      // if the named prior doesn't exist in the Repository, throw an Exception
      LEMUR_THROW( LEMUR_RUNTIME_ERROR, "named prior: " + _priorNames[i] + " not found in Repository. Unable to process query." );
    }

    _priorIterators.push_back( iterator );
  }

  // extent iterator nodes
  std::vector<ListIteratorNode*>::iterator diter;
  for( diter = _listIteratorNodes.begin(); diter != _listIteratorNodes.end(); diter++ ) {
    (*diter)->indexChanged( index );
  }

  // belief nodes
  std::vector<BeliefNode*>::iterator biter;
  for( biter = _beliefNodes.begin(); biter != _beliefNodes.end(); biter++ ) {
    (*biter)->indexChanged( index );
  }

  // evaluator nodes
  std::vector<indri::infnet::EvaluatorNode*>::iterator eiter;
  for( eiter = _evaluators.begin(); eiter != _evaluators.end(); eiter++ ) {
    (*eiter)->indexChanged( index );
  }

  // document structure
  if (_documentStructureHolderNode != 0) {
    _documentStructureHolderNode->indexChanged( index );
  }
}
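The pattern above recurs throughout the network: on every index change, throw away the old per-index iterators, fetch fresh ones, and keep positional alignment with the name lists even when a term is missing from the new index. Below is a minimal, self-contained sketch of that doc-iterator pattern; PostingIterator, Index, and Network are hypothetical stand-ins, not Indri types.

#include <string>
#include <vector>

struct PostingIterator { void startIteration() {} };

struct Index {
  // returns 0 when the term does not occur in this index
  PostingIterator* docListIterator( const std::string& /*term*/ ) { return 0; }
};

struct Network {
  std::vector<std::string> _termNames;
  std::vector<PostingIterator*> _docIterators;

  void indexChanged( Index& index ) {
    _docIterators.clear();
    for( size_t i = 0; i < _termNames.size(); i++ ) {
      PostingIterator* iterator = index.docListIterator( _termNames[i] );
      if( iterator )
        iterator->startIteration();
      // push even when null so _docIterators[i] stays aligned with _termNames[i]
      _docIterators.push_back( iterator );
    }
  }
};

int main() {
  Network n;
  n._termNames.push_back( "retrieval" );
  Index idx;
  n.indexChanged( idx );
  return n._docIterators.size() == 1 ? 0 : 1;
}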
void indri::infnet::ContextSimpleCountAccumulator::_computeCounts( indri::index::Index& index ) {
  _size += index.termCount();
  _documentCount += index.documentCount();
  _documentOccurrences += index.documentCount( _term );
  if( _term.length() != 0 ) {
    _occurrences += index.termCount( _term );
  }
}
Example #3
void indri::infnet::InferenceNetwork::_evaluateIndex( indri::index::Index& index ) {
  // don't need to do anything unless there are some
  // evaluators in the network that need full evaluation

  if( _complexEvaluators.size() ) {
    lemur::api::DOCID_T maximumDocument = index.documentMaximum();
    
    if (maximumDocument == index.documentBase()) {
      // empty memory index, nothing to score.
      return;
    }

    lemur::api::DOCID_T lastCandidate = MAX_INT32; // MAX_INT64 when DOCID_T is 64 bits
    int scoredDocuments = 0;
    lemur::api::DOCID_T candidate = 0;
    indri::index::DeletedDocumentList::read_transaction* deleted;
    deleted = _repository.deletedList().getReadTransaction();

    while(1) {
      // ask the root node for a candidate document
      // this asks the whole inference network for the
      // first document that might possibly produce a
      // usable (above the max score threshold) score
      candidate = _nextCandidateDocument( deleted );
      if (candidate < index.documentBase()) {
        std::cerr << candidate << " < index.documentBase()" << std::endl;
        break;
      }
      
      assert( candidate >= index.documentBase() );

      // if candidate is MAX_INT32, we're done
      if( candidate == MAX_INT32 || candidate > maximumDocument ) {
        break;
      }

      // move all the doc info lists to this new document
      // in preparation for scoring
      if( candidate != lastCandidate ) {
        _moveToDocument( candidate );
      }

      // ask all the evaluators to evaluate this document
      _evaluateDocument( index, candidate );
      scoredDocuments++;

      // if that was the last document, we can quit now
      if( candidate+1 > maximumDocument )
        break;

      // move all candidate iterators to candidate+1
      _moveToDocument( candidate+1 );
      lastCandidate = candidate+1;
      assert( candidate >= index.documentBase() );
    }
    delete deleted;
  }
}
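_evaluateIndex is a document-at-a-time loop: ask the network for the smallest document id that could still produce a usable score, score it, then move every iterator just past it. The sketch below compresses that control flow; nextCandidate, score, and moveTo are hypothetical callbacks standing in for _nextCandidateDocument, _evaluateDocument, and _moveToDocument, and END plays the role of MAX_INT32.

#include <cstdio>
#include <functional>

const int END = 0x7fffffff; // sentinel: "no more candidates"

void evaluateAll( int maximumDocument,
                  std::function<int()> nextCandidate,
                  std::function<void(int)> score,
                  std::function<void(int)> moveTo ) {
  while( true ) {
    int candidate = nextCandidate();     // smallest doc that could still score
    if( candidate == END || candidate > maximumDocument )
      break;                             // index exhausted
    score( candidate );
    if( candidate + 1 > maximumDocument )
      break;                             // that was the last document
    moveTo( candidate + 1 );             // advance every iterator past it
  }
}

int main() {
  int next = 1;
  evaluateAll( 3,
               [&]() { return next; },
               [](int d) { std::printf( "scored %d\n", d ); },
               [&](int d) { next = d; } );
}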
Example #4
void indri::infnet::InferenceNetwork::_evaluateDocument( indri::index::Index& index, lemur::api::DOCID_T document ) {
  int candidateLength = index.documentLength( document );

  for( size_t i=0; i<_complexEvaluators.size(); i++ ) {
    _complexEvaluators[i]->evaluate( document, candidateLength );
  }
}
Example #5
  void _mergeData() {
    // fill in tags from the document vector fields
    // unnecessary to filter out any fields to remove, as they will
    // be ignored when indexing.
    indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields();
    tl.clear();
    for (size_t i = 0; i < fields.size(); i++) {
      const indri::index::FieldExtent& field = fields[i];
      std::string fieldName = _index->field(field.id);
    
      tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin);
      tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end);
    }
    //stuff it into the parsed doc
    tl.writeTagList(parsed->tags);

    // fill in terms from the document text so that they will stop/stem
    // correctly when added to the new repository.
    // There are potential issues with URL injection here...
    // probably best not to do this with trecweb/html docs...
    // TODO: test this
    termBuffer.clear();
    termBuffer.grow( parsed->textLength * 2 );

    for (size_t i = 0; i < parsed->positions.size(); i++ ) {
      int start = parsed->positions[i].begin;
      int end = parsed->positions[i].end;
      int token_len = end - start;
      const char *token = parsed->text + start;
      char* write_loc = termBuffer.write( token_len + 1 );
      strncpy( write_loc, token, token_len );
      write_loc[token_len] = '\0';
      parsed->terms.push_back( write_loc );
    }
  }
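One subtlety in _mergeData: parsed->terms ends up holding raw char* pointers into termBuffer, so the buffer must never reallocate after the first pointer is handed out; that is what the up-front grow( parsed->textLength * 2 ) guarantees. Below is a standalone sketch of the same pattern using std::vector instead of indri::utility::Buffer; copyTokens is illustrative, not library code.

#include <cstdio>
#include <utility>
#include <vector>

// Reserving the full capacity before the loop matters: if the buffer
// reallocated mid-loop, the char* pointers already handed out would dangle.
void copyTokens( const char* text,
                 const std::vector<std::pair<int,int> >& positions, // (begin, end) offsets
                 std::vector<char>& buffer,
                 std::vector<const char*>& terms ) {
  size_t needed = 0;
  for( size_t i = 0; i < positions.size(); i++ )
    needed += ( positions[i].second - positions[i].first ) + 1; // +1 for '\0'
  buffer.reserve( needed ); // single allocation keeps the pointers below valid

  for( size_t i = 0; i < positions.size(); i++ ) {
    int start = positions[i].first;
    int len = positions[i].second - start;
    size_t offset = buffer.size();
    buffer.insert( buffer.end(), text + start, text + start + len );
    buffer.push_back( '\0' );
    terms.push_back( &buffer[offset] );
  }
}

int main() {
  std::vector<std::pair<int,int> > positions;
  positions.push_back( std::make_pair( 0, 5 ) );  // "hello"
  positions.push_back( std::make_pair( 6, 11 ) ); // "world"
  std::vector<char> buffer;
  std::vector<const char*> terms;
  copyTokens( "hello world", positions, buffer, terms );
  std::printf( "%s %s\n", terms[0], terms[1] );
}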
void indri::infnet::FieldWildcardNode::indexChanged( indri::index::Index& index ) { 
  if ( _docIter != 0 ) {
    delete _docIter;
    _docIter = 0;
  }
  _index = & index;
  _nextDocument = 1;
  _docIterID = 1;
  _docIter = index.termListFileIterator();
  _docIter->startIteration();
}
void indri::infnet::ShrinkageBeliefNode::indexChanged( indri::index::Index& index ) {

  _ruleMap.clear();
  
  std::set<smoothing_rule, lt_rule>::iterator ruleIter = _ruleSet.begin();
  while( ruleIter != _ruleSet.end() ) {
    int field = index.field( ruleIter->fieldName );
    _ruleMap[ field ] = *ruleIter;
    ruleIter++;
  }

}
void indri::infnet::DocumentStructureHolderNode::indexChanged( indri::index::Index& index ) { 

  _index = & index;
  _nextDocument = 1;
  _docIterID = 1;

  delete _docIter;
  _docIter = index.termListFileIterator();
  _docIter->startIteration();

  delete _documentStructure;
  _documentStructure = 0;  
}
void indri::infnet::ContextSimpleCountAccumulator::_computeCounts( indri::index::Index& index ) {
  assert( _terms.size() );
  assert( _context.size() == 0 || _field.size() == 0 );

  if( _context.size() ) {
    _size += index.fieldTermCount( _context );
  } else {
    _size += index.termCount();
  }

  for( unsigned int i=0; i<_terms.size(); i++ ) {
    if( _terms[i].length() != 0 ) {
      if( _field.size() ) {
        _occurrences += index.fieldTermCount( _field, _terms[i] );
      } else if( _context.size() ) {
        _occurrences += index.fieldTermCount( _context, _terms[i] );
      } else {
        _occurrences += index.termCount( _terms[i] );
      }
    }
  }
}
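These accumulators are fed once per index via indexChanged, so after every index has been visited, _occurrences and _size hold collection-wide totals. Downstream scoring can then form a maximum-likelihood collection estimate P(term | collection) = _occurrences / _size; the numbers below are purely illustrative.

#include <cstdio>

int main() {
  // hypothetical two-index repository
  long long occurrences = 40 + 10;          // sum of index.termCount(term) per index
  long long size        = 1000000 + 500000; // sum of index.termCount() per index
  double p = (double)occurrences / (double)size;
  std::printf( "P(term|collection) = %g\n", p ); // 3.33333e-05
}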
void indri::infnet::ContextCountAccumulator::indexChanged( indri::index::Index& index ) {
  if( ! _context ) {
    _contextSize += index.termCount();
  }
  _documentCount += index.documentCount();
}
void indri::infnet::ContextSimpleCountAccumulator::indexChanged( indri::index::Index& index ) {
  _computeCounts( index );
  _maximumDocument = index.documentCount() + index.documentBase();
}
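As a worked example, reading _maximumDocument as an exclusive upper bound: if index.documentBase() is 100 and the index holds 50 documents, _maximumDocument becomes 150, so document ids 100 through 149 fall inside this index.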
Example #12
  void processFields( indri::api::Parameters &param ) {
    g_timer.start();
    std::string index = param.get("index");
    std::cout << "Opening: " << index << std::endl;
    // make sure this path doesn't exist.
    std::string idx2 = index + ".new"; // temp target index.

    // presumes a single input oa file for the entire collection.
    std::string offsetAnnotationsPath = param.get("annotations");
      
    // these need to be combined with the existing fields.
    // fields to add
    // these need to supply numeric/parental/ordinal/etc...
    if (param.exists("addField"))
      addFields = param["addField"];
      
    // fields to remove
    // these only need to be a list of names.
    if (param.exists("removeField")) {
      indri::api::Parameters slice = param["removeField"];
      for (size_t i = 0; i < slice.size(); i++) {
        if( slice[i].exists("name") ) {
          removeNames.push_back( slice[i]["name"] );
        }
      }
    }
      
    // need to know the file class environment to get the 
    // conflations right.
    std::string className = param.get("fileclass", "");

    indri::collection::Repository sourceRepo;
    indri::collection::Repository targetRepo;
    indri::parse::OffsetAnnotationAnnotator oa_annotator;
    indri::parse::FileClassEnvironmentFactory _fileClassFactory;
            
    // Open source repo
    sourceRepo.openRead(index);
    // Copy its parameters, create target repo, adding or removing
    // fields.
    repo.loadFile( indri::file::Path::combine( index, "manifest" ) );
    INT64 mem = param.get("memory", INT64(100*1024*1024));
      
    repo.set("memory", mem);
    adding = addFields.exists("field");
    _mergeFields();
    // Create the offset annotator.
    fce = _fileClassFactory.get( className );
    indri::parse::Conflater* conflater = 0;
    if( fce ) {
      conflater = fce->conflater;
    }
    if (adding) 
      {
        oa_annotator.setConflater( conflater );
        oa_annotator.open( offsetAnnotationsPath );
      }

    targetRepo.create(idx2, &repo);
      
    // for each document in the source repo, fetch ParsedDocument 
    // construct full rep, apply annotator, insert into
    // target repo.

    _index = sourceRepo.indexes()->front(); // presumes a single index
    _docIter = _index->termListFileIterator();
    _docIter->startIteration();
    // ought to deal with deleted documents here...
    // if there are deleted documents, regular add to collection
    // if not, only rewrite the indexes, then rename the collection.
    indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList();
    UINT64 delCount = deleted.deletedCount();
    if (delCount > 0) 
      {
        // either warn, compact and then process, or 
        // do it the old way... FIXME!
        std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl;
        return;
      }
    
    for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) 
      {
        if ((docid % 500) == 0)  {
          g_timer.printElapsedSeconds(std::cout);
          std::cout << ": " << docid << "\r";
          std::cout.flush();
        }

        parsed = sourceRepo.collection()->retrieve(docid);
        // combine field and term data with parsed document
        _mergeData();
        // apply annotator
        if (adding)
          parsed = oa_annotator.transform(parsed);
        targetRepo.addDocument(parsed, false);
        // TagList allocs memory for the tags...
        for (size_t i = 0; i < parsed->tags.size(); i++)
          delete(parsed->tags[i]);
        delete(parsed);
        _docIter->nextEntry();
      }
    std::cout << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": " << _index->documentCount() << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": closing"  << std::endl;

    targetRepo.close();
    sourceRepo.close();
    std::string oldcollectionPath = indri::file::Path::combine( index, "collection" );
    std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" );
    // move the collection from the source repo into the target repo
    indri::file::Path::remove(newcollectionPath);
    indri::file::Path::rename(oldcollectionPath, newcollectionPath);
    // rename target repo to source repo.
    indri::file::Path::remove(index);
    indri::file::Path::rename(idx2, index);

    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": done"  << std::endl;
  }
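processFields pulls everything from an indri::api::Parameters object: index, annotations, addField, removeField, fileclass, and memory. Indri tools conventionally read these from an XML parameter file; the sketch below shows one plausible layout, where the exact nesting of addField/removeField is an assumption rather than documented behavior.

<parameters>
  <index>/path/to/repository</index>
  <annotations>/path/to/offset.annotations</annotations>
  <addField>
    <field><name>person</name></field>
  </addField>
  <removeField><name>sentence</name></removeField>
  <fileclass>trectext</fileclass>
  <memory>104857600</memory>
</parameters>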