Пример #1
0
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) {
  indri::parse::Parser* parser = 0;
  indri::parse::Tokenizer* tokenizer = 0;
  indri::parse::DocumentIterator* iterator = 0;
  indri::parse::Conflater* conflater = 0;
  
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( !parser || !iterator ) {
    _documentsSeen++;
    if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen );
  } else {
    try {
      indri::parse::UnparsedDocument* document;
      indri::parse::TokenizedDocument* tokenized;
      ParsedDocument* parsed;

      iterator->open( fileName );

      std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater );

      // notify caller that the file was successfully parsed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen );

      while( document = iterator->nextDocument() ) {
        _documentsSeen++;

        tokenized = tokenizer->tokenize( document );
        parsed = parser->parse( tokenized );
        parsed = _applyAnnotators( annotators, parsed );

        _repository.addDocument( parsed );

        _documentsIndexed++;
        if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen );
      }

      // notify caller that the file was successfully closed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen );

      iterator->close();
    } catch( lemur::api::Exception& e ) {
      if( iterator )
        iterator->close();

      // notify caller of errors
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen );
    }
  }
}
Пример #2
0
//
// addFile
//
// Reads every document the iterator yields for fileName, runs it through the
// tokenizer, parser and annotators, and adds it to _repository unless a
// document with the same "docno" metadatum is already present.
//
// The parser, tokenizer, iterator and conflater are requested from
// _getParsingContext() for fileClass. If the parser, tokenizer or iterator
// comes back null the file is skipped: _documentsSeen is incremented and a
// FileSkip status is reported.
//
// When _callback is set it receives, in order: FileOpen after the iterator
// has been opened, DocumentCount after each document (whether or not it was
// added), and FileClose once the iterator is exhausted. If a
// lemur::api::Exception is caught, the iterator is closed and FileError is
// reported with the exception's message.
//
// fileName  - passed to the iterator's open() and to every callback
// fileClass - selects the parsing context and the annotators
//
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) {
  indri::parse::Parser* parser = 0;
  indri::parse::Tokenizer* tokenizer = 0;
  indri::parse::DocumentIterator* iterator = 0;
  indri::parse::Conflater* conflater = 0;

  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  // The tokenizer is dereferenced for every document in the loop below, so a
  // missing tokenizer has to be handled like a missing parser or iterator.
  if( !parser || !tokenizer || !iterator ) {
    _documentsSeen++;
    if( _callback ) (*_callback)( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen );
    return;
  }

  try {
    iterator->open( fileName );

    std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater );

    // notify caller that the file was opened
    if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen );

    indri::parse::UnparsedDocument* document = 0;

    // nextDocument() returning null ends the loop
    while( ( document = iterator->nextDocument() ) != 0 ) {
      _documentsSeen++;

      indri::parse::TokenizedDocument* tokenized = tokenizer->tokenize( document );
      ParsedDocument* parsed = parser->parse( tokenized );
      parsed = _applyAnnotators( annotators, parsed );

      // can't know for sure that we have the docno element until after
      // the annotators have been applied, so the duplicate check happens here.
      // find the docno; it stays empty when no usable docno entry exists
      std::string docIDStr = "";
      for( size_t i = 0; i < parsed->metadata.size(); i++ ) {
        const char* key = parsed->metadata[i].key;

        // guard against null key/value before handing them to strcmp/std::string
        if( key && !strcmp( key, "docno" ) ) {
          if( parsed->metadata[i].value )
            docIDStr = (const char *)parsed->metadata[i].value;
          break;
        }
      }

      // look up the id.
      std::vector<lemur::api::DOCID_T> ids = _repository.collection()->retrieveIDByMetadatum("docno", docIDStr);

      // if not found, add the document; duplicates are counted as seen but not indexed.
      if( ids.size() == 0 ) {
        _repository.addDocument( parsed );
        _documentsIndexed++;
      }

      if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen );
    }

    // notify caller that the file is finished; the iterator is closed right after
    if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen );

    iterator->close();
  } catch( lemur::api::Exception& e ) {
    // iterator is known to be non-null here (checked before the try block)
    iterator->close();

    // notify caller of errors
    if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen );
  }
}