void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) { indri::parse::Parser* parser = 0; indri::parse::Tokenizer* tokenizer = 0; indri::parse::DocumentIterator* iterator = 0; indri::parse::Conflater* conflater = 0; _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass ); if( !parser || !iterator ) { _documentsSeen++; if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen ); } else { try { indri::parse::UnparsedDocument* document; indri::parse::TokenizedDocument* tokenized; ParsedDocument* parsed; iterator->open( fileName ); std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater ); // notify caller that the file was successfully parsed if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen ); while( document = iterator->nextDocument() ) { _documentsSeen++; tokenized = tokenizer->tokenize( document ); parsed = parser->parse( tokenized ); parsed = _applyAnnotators( annotators, parsed ); _repository.addDocument( parsed ); _documentsIndexed++; if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen ); } // notify caller that the file was successfully closed if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen ); iterator->close(); } catch( lemur::api::Exception& e ) { if( iterator ) iterator->close(); // notify caller of errors if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen ); } } }
void indri::api::IndexEnvironment::addFile( const std::string& fileName, const std::string& fileClass ) { indri::parse::Parser* parser = 0; indri::parse::Tokenizer* tokenizer = 0; indri::parse::DocumentIterator* iterator = 0; indri::parse::Conflater* conflater = 0; _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass ); if( !parser || !iterator ) { _documentsSeen++; if( _callback ) (*_callback) ( indri::api::IndexStatus::FileSkip, fileName, _error, _documentsIndexed, _documentsSeen ); } else { try { indri::parse::UnparsedDocument* document; indri::parse::TokenizedDocument* tokenized; ParsedDocument* parsed; iterator->open( fileName ); std::vector<indri::parse::Transformation*> annotators = _createAnnotators( fileName, fileClass, &conflater ); // notify caller that the file was successfully parsed if( _callback ) (*_callback)( indri::api::IndexStatus::FileOpen, fileName, _error, _documentsIndexed, _documentsSeen ); while( document = iterator->nextDocument() ) { _documentsSeen++; tokenized = tokenizer->tokenize( document ); parsed = parser->parse( tokenized ); parsed = _applyAnnotators( annotators, parsed ); // can't know for sure that we have the docno element until after // the annotators have been applied // check if this document is in the index already // find the docno std::string docIDStr = ""; for( size_t i=0; i<parsed->metadata.size(); i++ ) { const char * key = parsed->metadata[i].key; if( !strcmp( key, "docno" ) ) { docIDStr = (const char *)parsed->metadata[i].value; break; } } // look up the id. std::vector<lemur::api::DOCID_T> ids = _repository.collection()->retrieveIDByMetadatum("docno", docIDStr); // if not found, add the document. if (ids.size() == 0) { _repository.addDocument( parsed ); _documentsIndexed++; } // else mention the dupe? if( _callback ) (*_callback)( indri::api::IndexStatus::DocumentCount, fileName, _error, _documentsIndexed, _documentsSeen ); } // notify caller that the file was successfully closed if( _callback ) (*_callback)( indri::api::IndexStatus::FileClose, fileName, _error, _documentsIndexed, _documentsSeen ); iterator->close(); } catch( lemur::api::Exception& e ) { if( iterator ) iterator->close(); // notify caller of errors if( _callback ) (*_callback)( indri::api::IndexStatus::FileError, fileName, e.what(), _documentsIndexed, _documentsSeen ); } } }