Example #1
lemur::api::DOCID_T indri::api::IndexEnvironment::addString( const std::string& documentString, const std::string& fileClass, const std::vector<indri::parse::MetadataPair>& metadata ) {
  indri::parse::UnparsedDocument document;
  indri::parse::Parser* parser;
  indri::parse::Tokenizer* tokenizer;
  indri::parse::DocumentIterator* iterator;
  indri::parse::Conflater* conflater;
  std::string nothing;

  _documentsSeen++;

  document.text = documentString.c_str();
  document.textLength = documentString.length() + 1; // for the null
  document.metadata = metadata;
  document.content = document.text;
  document.contentLength = document.textLength - 1;
  
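  // look up the parsing chain (parser, tokenizer, iterator, conflater)
  // registered for the requested file class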
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( parser == 0 ) {
    LEMUR_THROW( LEMUR_RUNTIME_ERROR, "File class '" + fileClass + "' wasn't recognized." );
  }
  indri::parse::TokenizedDocument* tokenized = tokenizer->tokenize( &document );

  ParsedDocument* parsed = parser->parse( tokenized );
  lemur::api::DOCID_T documentID = _repository.addDocument( parsed );

  _documentsIndexed++;
  if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, nothing, _error, _documentsIndexed, _documentsSeen );

  return documentID;
}
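
Example #1 parses a single in-memory string with the parser and tokenizer registered for the given file class and adds the result to the repository, returning the new internal document ID. A minimal caller-side sketch is shown below; the repository path, the document text, and the use of the stock "trectext" file class are illustrative assumptions, not part of the library code above.

#include <string>
#include <vector>
#include "indri/IndexEnvironment.hpp"

int main() {
  indri::api::IndexEnvironment env;
  env.create( "/path/to/repository" );   // or env.open(...) for an existing index

  // no extra metadata for this document
  std::vector<indri::parse::MetadataPair> metadata;

  lemur::api::DOCID_T id = env.addString(
      "<DOC><DOCNO>doc-1</DOCNO><TEXT>hello indri</TEXT></DOC>",
      "trectext",
      metadata );

  env.close();
  return id > 0 ? 0 : 1;
}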
Example #2
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) {
  indri::parse::Parser* parser = 0;
  indri::parse::Tokenizer* tokenizer = 0;
  indri::parse::DocumentIterator* iterator = 0;
  indri::parse::Conflater* conflater = 0;
  
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( !parser || !iterator ) {
    _documentsSeen++;
    if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen );
  } else {
    try {
      indri::parse::UnparsedDocument* document;
      indri::parse::TokenizedDocument* tokenized;
      ParsedDocument* parsed;

      iterator->open( fileName );

      std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater );

      // notify caller that the file was successfully opened
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen );

      while( (document = iterator->nextDocument()) != 0 ) {
        _documentsSeen++;

        tokenized = tokenizer->tokenize( document );
        parsed = parser->parse( tokenized );
        parsed = _applyAnnotators( annotators, parsed );

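        // hand the fully parsed document to the repository for indexing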
        _repository.addDocument( parsed );

        _documentsIndexed++;
        if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen );
      }

      // notify caller that the file was successfully closed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen );

      iterator->close();
    } catch( lemur::api::Exception& e ) {
      if( iterator )
        iterator->close();

      // notify caller of errors
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen );
    }
  }
}
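
Example #2 runs the same parsing chain, but a DocumentIterator pulls documents out of a file on disk, and the status callback reports file open, per-document progress, close, and errors. A minimal caller-side sketch, again with placeholder paths and the "trectext" file class assumed:

#include "indri/IndexEnvironment.hpp"

int main() {
  indri::api::IndexEnvironment env;
  env.setMemory( 512 * 1024 * 1024 );              // optional: indexing buffer size, in bytes
  env.create( "/path/to/repository" );
  env.addFile( "/path/to/docs.trectext", "trectext" );
  env.close();
  return 0;
}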
Example #3
//
// For UIMA with offset annotations
//
lemur::api::DOCID_T indri::api::IndexEnvironment::addString( const std::string& documentString,
                                                              const std::string& fileClass,
                                                              const std::vector<indri::parse::MetadataPair>& metadata,
                                                              const std::vector<indri::parse::TagExtent*>& tags ) {
  indri::parse::UnparsedDocument document;
  indri::parse::Parser* parser;
  indri::parse::Tokenizer* tokenizer;
  indri::parse::DocumentIterator* iterator;
  indri::parse::Conflater* conflater;
  indri::parse::OffsetAnnotationAnnotator *annote;
  std::string docno = "";
  
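  // pull the document number out of the caller-supplied metadata;
  // the offset annotator keys its tag extents on this docno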
  for ( size_t i=0; i<metadata.size(); i++ ) {
    const char* attributeName = metadata[i].key;
    const char* attributeValue = (const char*) metadata[i].value;
    if ( ! strcmp( attributeName, "docno" ) ) docno = attributeValue;
  }
  
  std::string nothing;

  _documentsSeen++;

  document.text = documentString.c_str();
  document.textLength = documentString.length() + 1; // for the null
  document.metadata = metadata;
  document.content = document.text;
  document.contentLength = document.textLength - 1;
  
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );
  if( parser == 0 ) {
    LEMUR_THROW( LEMUR_RUNTIME_ERROR, "File class '" + fileClass + "' wasn't recognized." );
  }

  // construct the offset annotator only after the file class check, so the
  // throw above cannot leak it; the annotator attaches the caller-supplied
  // tag extents to the document identified by docno
  annote = new indri::parse::OffsetAnnotationAnnotator( conflater );
  annote->setTags( docno.c_str(), tags );

  indri::parse::TokenizedDocument* tokenized = tokenizer->tokenize( &document );

  ParsedDocument* parsed = parser->parse( tokenized );
  parsed = annote->transform( parsed );

  lemur::api::DOCID_T documentID = _repository.addDocument( parsed );
  // release the annotator; its work is done once the document is in the repository
  delete annote;

  _documentsIndexed++;
  if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, nothing, _error, _documentsIndexed, _documentsSeen );

  return documentID;
}
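
Example #3 is the addString overload used when tag annotations (for instance from a UIMA pipeline) are produced outside Indri as offsets into the raw text: the caller supplies TagExtent objects and a docno metadatum, and an OffsetAnnotationAnnotator merges those extents into the parsed document before it is added. The sketch below is an assumed caller, not taken from the library: the field names of MetadataPair and TagExtent follow the Indri headers but should be checked against your version, the "person" field and the span values are invented for illustration, and the tag must be declared as an indexed field for it to survive indexing.

#include <cstring>
#include <string>
#include <vector>
#include "indri/IndexEnvironment.hpp"
#include "indri/TagExtent.hpp"

int main() {
  indri::api::IndexEnvironment env;

  // declare the annotation tag as an indexed field so it is kept in the index
  std::vector<std::string> fields;
  fields.push_back( "person" );
  env.setIndexedFields( fields );

  env.create( "/path/to/repository" );

  // the annotator keys its tags on the document number, so supply one as metadata
  indri::parse::MetadataPair docno;
  docno.key = "docno";
  docno.value = "doc-1";
  docno.valueLength = strlen( "doc-1" ) + 1;   // include the terminating null
  std::vector<indri::parse::MetadataPair> metadata;
  metadata.push_back( docno );

  // one externally computed annotation; the span values are illustrative only
  indri::parse::TagExtent tag;
  tag.name = "person";
  tag.begin = 0;
  tag.end = 1;
  tag.number = 0;
  tag.parent = 0;
  std::vector<indri::parse::TagExtent*> tags;
  tags.push_back( &tag );

  lemur::api::DOCID_T id = env.addString(
      "<DOC><DOCNO>doc-1</DOCNO><TEXT>George Washington crossed the Delaware</TEXT></DOC>",
      "trectext", metadata, tags );

  env.close();
  return id > 0 ? 0 : 1;
}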
Example #4
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) {
  indri::parse::Parser* parser = 0;
  indri::parse::Tokenizer* tokenizer = 0;
  indri::parse::DocumentIterator* iterator = 0;
  indri::parse::Conflater* conflater = 0;
  
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( !parser || !iterator ) {
    _documentsSeen++;
    if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen );
  } else {
    try {
      indri::parse::UnparsedDocument* document;
      indri::parse::TokenizedDocument* tokenized;
      ParsedDocument* parsed;

      iterator->open( fileName );

      std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater );

      // notify caller that the file was successfully opened
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen );

      while( (document = iterator->nextDocument()) != 0 ) {
        _documentsSeen++;

        tokenized = tokenizer->tokenize( document );
        parsed = parser->parse( tokenized );
        parsed = _applyAnnotators( annotators, parsed );
        // can't know for sure that we have the docno element until after
        // the annotators have been applied
        // check if this document is in the index already
        // find the docno
        std::string docIDStr = "";
        for( size_t i=0; i<parsed->metadata.size(); i++ ) {
          const char * key = parsed->metadata[i].key;
          if( !strcmp( key, "docno" ) ) {
            docIDStr = (const char *)parsed->metadata[i].value;
            break;
          }
        }
        // look up the id.
        std::vector<lemur::api::DOCID_T> ids = _repository.collection()->retrieveIDByMetadatum("docno", docIDStr);
        // if not found, add the document.
        if( ids.size() == 0 ) {
          _repository.addDocument( parsed );
          _documentsIndexed++;
        } // else mention the dupe?
        
        if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen );
      }

      // notify caller that the file was successfully closed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen );

      iterator->close();
    } catch( lemur::api::Exception& e ) {
      if( iterator )
        iterator->close();

      // notify caller of errors
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen );
    }
  }
}
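
Example #4 is the same addFile loop as Example #2 with one addition: after the annotators have run (the earliest point at which the docno is guaranteed to be present), it looks that docno up in the repository's collection with retrieveIDByMetadatum and only adds documents whose docno is not already indexed, which prevents the same docno from being added twice across runs. Two consequences are visible in the code: duplicates are skipped silently (the "else mention the dupe?" comment is an open question in the source), and the DocumentCount callback still fires for skipped documents even though _documentsIndexed does not advance.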