// Index a single in-memory document string under the given file class.
// The string is wrapped in an UnparsedDocument, run through the file
// class's tokenizer and parser, and added to the repository.
// Returns the internal document ID assigned by the repository.
// Throws LEMUR_RUNTIME_ERROR when the file class has no registered parser.
lemur::api::DOCID_T indri::api::IndexEnvironment::addString( const std::string& documentString,
                                                             const std::string& fileClass,
                                                             const std::vector<indri::parse::MetadataPair>& metadata ) {
  indri::parse::Parser* docParser = 0;
  indri::parse::Tokenizer* docTokenizer = 0;
  indri::parse::DocumentIterator* docIterator = 0;
  indri::parse::Conflater* docConflater = 0;
  std::string nothing;

  _documentsSeen++;

  // Wrap the caller's string; textLength counts the terminating null,
  // contentLength does not.
  indri::parse::UnparsedDocument unparsed;
  unparsed.text = documentString.c_str();
  unparsed.textLength = documentString.length() + 1; // for the null
  unparsed.metadata = metadata;
  unparsed.content = unparsed.text;
  unparsed.contentLength = unparsed.textLength - 1;

  // Out-params: the context lookup fills in whichever components exist
  // for this file class.
  _getParsingContext( &docParser, &docTokenizer, &docIterator, &docConflater, fileClass );
  if( docParser == 0 ) {
    LEMUR_THROW( LEMUR_RUNTIME_ERROR, "File class '" + fileClass + "' wasn't recognized." );
  }

  indri::parse::TokenizedDocument* tokenized = docTokenizer->tokenize( &unparsed );
  ParsedDocument* parsed = docParser->parse( tokenized );
  lemur::api::DOCID_T documentID = _repository.addDocument( parsed );

  _documentsIndexed++;
  if( _callback )
    (*_callback)( indri::api::IndexStatus::DocumentCount, nothing, _error, _documentsIndexed, _documentsSeen );
  return documentID;
}
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) { indri::parse::Parser* parser = 0; indri::parse::Tokenizer* tokenizer = 0; indri::parse::DocumentIterator* iterator = 0; indri::parse::Conflater* conflater = 0; _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass ); if( !parser || !iterator ) { _documentsSeen++; if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen ); } else { try { indri::parse::UnparsedDocument* document; indri::parse::TokenizedDocument* tokenized; ParsedDocument* parsed; iterator->open( fileName ); std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater ); // notify caller that the file was successfully parsed if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen ); while( document = iterator->nextDocument() ) { _documentsSeen++; tokenized = tokenizer->tokenize( document ); parsed = parser->parse( tokenized ); parsed = _applyAnnotators( annotators, parsed ); _repository.addDocument( parsed ); _documentsIndexed++; if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen ); } // notify caller that the file was successfully closed if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen ); iterator->close(); } catch( lemur::api::Exception& e ) { if( iterator ) iterator->close(); // notify caller of errors if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen ); } } }
// // For UIMA with offset annotations // lemur::api::DOCID_T indri::api::IndexEnvironment::addString( const std::string& documentString, const std::string& fileClass, const std::vector<indri::parse::MetadataPair>& metadata, const std::vector<indri::parse::TagExtent *> &tags ) { indri::parse::UnparsedDocument document; indri::parse::Parser* parser; indri::parse::Tokenizer* tokenizer; indri::parse::DocumentIterator* iterator; indri::parse::Conflater* conflater; indri::parse::OffsetAnnotationAnnotator *annote; std::string docno = ""; for ( size_t i=0; i<metadata.size(); i++ ) { const char* attributeName = metadata[i].key; const char* attributeValue = (const char*) metadata[i].value; if ( ! strcmp( attributeName, "docno" ) ) docno = attributeValue; } std::string nothing; _documentsSeen++; document.text = documentString.c_str(); document.textLength = documentString.length() + 1; // for the null document.metadata = metadata; document.content = document.text; document.contentLength = document.textLength - 1; _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass ); annote = new indri::parse::OffsetAnnotationAnnotator(conflater); annote->setTags(docno.c_str(), tags); if( parser == 0 ) { LEMUR_THROW( LEMUR_RUNTIME_ERROR, "File class '" + fileClass + "' wasn't recognized." ); } indri::parse::TokenizedDocument* tokenized = tokenizer->tokenize( &document ); ParsedDocument* parsed = parser->parse( tokenized ); parsed = annote->transform(parsed); lemur::api::DOCID_T documentID =_repository.addDocument( parsed ); _documentsIndexed++; if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, nothing, _error, _documentsIndexed, _documentsSeen ); return documentID; }
// Parse and index every document in the named file, skipping any document
// whose "docno" metadatum already resolves to an ID in the repository's
// collection (duplicate suppression). Progress is reported through the
// optional _callback; lemur exceptions close the iterator and are
// forwarded as FileError events.
// NOTE(review): this definition has the same signature as the addFile
// earlier in this chunk — only one can be compiled into the file; verify
// which version is intended to be live.
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) {
  indri::parse::Parser* parser = 0;
  indri::parse::Tokenizer* tokenizer = 0;
  indri::parse::DocumentIterator* iterator = 0;
  indri::parse::Conflater* conflater = 0;

  // out-params: filled with whatever components exist for this file class
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( !parser || !iterator ) {
    // nothing registered to read this file class: count it and report a skip
    _documentsSeen++;
    if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen );
  } else {
    try {
      indri::parse::UnparsedDocument* document;
      indri::parse::TokenizedDocument* tokenized;
      ParsedDocument* parsed;

      iterator->open( fileName );
      std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater );
      // notify caller that the file was successfully parsed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen );

      while( document = iterator->nextDocument() ) {
        _documentsSeen++;
        tokenized = tokenizer->tokenize( document );
        parsed = parser->parse( tokenized );
        parsed = _applyAnnotators( annotators, parsed );

        // can't know for sure that we have the docno element until after
        // the annotators have been applied
        // check if this document is in the index already
        // find the docno
        std::string docIDStr = "";
        for( size_t i=0; i<parsed->metadata.size(); i++ ) {
          const char * key = parsed->metadata[i].key;
          if( !strcmp( key, "docno" ) ) {
            docIDStr = (const char *)parsed->metadata[i].value;
            break;
          }
        }
        // look up the id.
        std::vector<lemur::api::DOCID_T> ids = _repository.collection()->retrieveIDByMetadatum("docno", docIDStr);
        // if not found, add the document.
        if (ids.size() == 0) {
          _repository.addDocument( parsed );
          _documentsIndexed++;
        }
        // else mention the dupe?
        // NOTE(review): on a duplicate, DocumentCount still fires below with
        // an unchanged _documentsIndexed — callers see seen > indexed.
        if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen );
      }
      // notify caller that the file was successfully closed
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen );
      iterator->close();
    } catch( lemur::api::Exception& e ) {
      if( iterator ) iterator->close();
      // notify caller of errors
      if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen );
    }
  }
}