void indri::api::Parameters::loadFile( const std::string& filename ) { std::ifstream input; indri::xml::XMLReader reader; input.open( filename.c_str(), std::ifstream::in ); if( input.rdstate() & std::ios::failbit ) LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open parameter file '" + filename + "' for reading." ); input.seekg( 0, std::ios::end ); size_t length = input.tellg(); input.seekg( 0, std::ios::beg ); // null terminate it to make a string in the XML reader for comment strip char* buffer = new char[length + 1]; buffer[length] = '\0'; try { input.read( buffer, length ); std::auto_ptr<indri::xml::XMLNode> result( reader.read( buffer, length ) ); _loadXML( result.get() ); } catch( lemur::api::Exception& e ) { LEMUR_RETHROW( e, "Had trouble parsing parameter file '" + filename + "'" ); } delete[] buffer; input.close(); }
//
// _findClosingTag
//
// Scans <buffer> from <start> looking for the closing tag that balances an
// already-consumed opening tag named <openingTagName>.
//
// buffer         - text being parsed; the CDATA handling below builds a
//                  std::string from a raw pointer into it, so it has to be
//                  NUL-terminated.
// start, finish  - scan limits, forwarded to _findBeginTag / _readTag.
// openingTagName - name the closing tag is compared against.
// tagsBetween    - optional out-parameter; reset to false on entry and set
//                  to true as soon as any tag that is neither CDATA nor a
//                  closing tag is seen before the closing tag.
//
// Returns the value _findBeginTag produced for the closing tag when its name
// equals <openingTagName>, otherwise -1.  Exceptions from the helpers are
// rethrown with the tag name added to the message.
//
int indri::xml::XMLReader::_findClosingTag( const char* buffer, int start, int finish, std::string& openingTagName, bool* tagsBetween ) {
  int openingTags = 0;
  int closingTags = 0;
  int position = start;
  bool done = false;
  bool match = false;
  int tagType;

  if( tagsBetween )
    *tagsBetween = false;

  try {
    while( !done ) {
      std::string tagName;
      position = _findBeginTag( buffer, position, finish );
      int end = _readTag( buffer, position, finish, &tagName, NULL, &tagType );

      if( tagType == TAG_CDATA_TYPE ) {
        // skip over the CDATA payload by searching for its terminator.
        // NOTE(review): this copies everything from <end> up to the next NUL
        // (it does not stop at <finish>), and a missing "]]>" makes dataEnd
        // std::string::npos, which is not checked -- confirm callers always
        // pass well-formed input.
        std::string cdata = &buffer[end];
        std::string::size_type dataEnd = cdata.find("]]>");
        position = end + dataEnd + 1;
      } else if( tagType != TAG_CLOSE_TYPE ) {
        // some other tag sits between the opening tag and its closing tag
        if( tagsBetween )
          *tagsBetween = true;

        // only TAG_OPEN_TYPE tags start a nested element that has to be
        // skipped; any other type just moves the scan position forward
        if( tagType == TAG_OPEN_TYPE )
          openingTags++;

        position = end;

        // consume tags until every nested opening tag has been closed
        while( openingTags > closingTags ) {
          // don't need to check for matching tags here, we just need to
          // count open and closed tags
          position = _findBeginTag( buffer, position, finish );
          end = _readTag( buffer, position, finish, NULL, NULL, &tagType );
          position = end;

          if( tagType == TAG_CDATA_TYPE ) {
            // same CDATA skip as above, with the same caveats
            std::string cdata = &buffer[end];
            std::string::size_type dataEnd = cdata.find("]]>");
            position = end + dataEnd + 1;
          } else if( tagType == TAG_OPEN_TYPE ) {
            openingTags++;
          } else if( tagType == TAG_CLOSE_TYPE ) {
            closingTags++;
          }
        }
      } else {
        // first closing tag seen at nesting depth zero: it either matches the
        // opening tag or the search has failed.  <position> still holds the
        // value _findBeginTag returned for this tag.
        match = (tagName == openingTagName);
        done = true;
      }
    }
  } catch( lemur::api::Exception& e ) {
    LEMUR_RETHROW( e, std::string() + "Caught an error while looking for an end tag for '" + openingTagName + "'" );
  }

  if( match ) {
    return position;
  } else {
    return -1;
  }
}
//
// createStemmer
//
// Builds the stemmer selected by <type>.
//
// type    - stemmer identifier, compared case-insensitively; when empty the
//           "stemmer" parameter is consulted instead.
// datadir - not used by this function.
// func    - not used by this function.
//
// Returns a newly allocated stemmer, or NULL when no type is available or the
// type matches neither the KStemmer nor the PorterStemmer identifier.
// Exceptions raised while constructing the stemmer are rethrown with the
// message "Could not create Stemmer".
//
lemur::api::Stemmer* lemur::api::TextHandlerManager::createStemmer(string type, string datadir, string func) {
  Stemmer *stemmer = NULL;

  if (type.empty()) {
    // didn't pass in type, try to get it from the paramstack
    type = ParamGetString("stemmer");
  }

  // if it's still empty, return nothing
  if (type.empty())
    return NULL;

  // make it all lowercase; tolower() is only defined for values that fit in
  // an unsigned char (or EOF), so plain char must be converted first
  for (string::size_type i = 0; i < type.length(); i++)
    type[i] = static_cast<char>(tolower(static_cast<unsigned char>(type[i])));

  try {
    if (type == lemur::parse::KStemmer::identifier) {
      stemmer = new lemur::parse::KStemmer();
    } else if (type == lemur::parse::PorterStemmer::identifier) {
      stemmer = new lemur::parse::PorterStemmer();
    }
  } catch (Exception &ex) {
    LEMUR_RETHROW(ex, "Could not create Stemmer");
  }

  return stemmer;
}
void indri::api::Parameters::load( const std::string& text ) { indri::xml::XMLReader reader; try { std::auto_ptr<indri::xml::XMLNode> result( reader.read( text ) ); _loadXML( result.get() ); } catch( lemur::api::Exception& e ) { LEMUR_RETHROW( e, "Had trouble parsing parameter text" ); } }
// Runs the query, expanding it if necessary. Will print output as well if verbose is on. void _runQuery( std::stringstream& output, const std::string& query, const std::string &queryType, const std::vector<std::string> &workingSet, std::vector<std::string> relFBDocs ) { try { if( _printQuery ) output << "# query: " << query << std::endl; std::vector<lemur::api::DOCID_T> docids;; if (workingSet.size() > 0) docids = _environment.documentIDsFromMetadata("docno", workingSet); if (relFBDocs.size() == 0) { if( _printSnippets ) { if (workingSet.size() > 0) _annotation = _environment.runAnnotatedQuery( query, docids, _initialRequested, queryType ); else _annotation = _environment.runAnnotatedQuery( query, _initialRequested ); _results = _annotation->getResults(); } else { if (workingSet.size() > 0) _results = _environment.runQuery( query, docids, _initialRequested, queryType ); else _results = _environment.runQuery( query, _initialRequested, queryType ); } } if( _expander ) { std::vector<indri::api::ScoredExtentResult> fbDocs; if (relFBDocs.size() > 0) { docids = _environment.documentIDsFromMetadata("docno", relFBDocs); for (size_t i = 0; i < docids.size(); i++) { indri::api::ScoredExtentResult r(0.0, docids[i]); fbDocs.push_back(r); } } std::string expandedQuery; if (relFBDocs.size() != 0) expandedQuery = _expander->expand( query, fbDocs ); else expandedQuery = _expander->expand( query, _results ); if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl; if (workingSet.size() > 0) { docids = _environment.documentIDsFromMetadata("docno", workingSet); _results = _environment.runQuery( expandedQuery, docids, _requested, queryType ); } else { _results = _environment.runQuery( expandedQuery, _requested, queryType ); } } } catch( lemur::api::Exception& e ) { _results.clear(); LEMUR_RETHROW(e, "QueryThread::_runQuery Exception"); } }
bool lemur::file::Keyfile::next( int& key, char* value, int& valueLength ) { char keyBuf[KEYFILE_KEYBUF_SIZE]; int keyLength = KEYFILE_KEYBUF_SIZE; bool result = false ; try { result = next( keyBuf, keyLength, value, valueLength ); } catch (lemur::api::Exception &e) { key = _decodeKey( keyBuf ); LEMUR_RETHROW( e, "Caught an internal error while trying to fetch next record with an int key." ); } if( result ) key = _decodeKey( keyBuf ); return result; }
//
// createStopper
//
// Builds a Stopper from the stopword file <filename>.  When <filename> is
// empty the "stopwords" parameter is used instead; when that is empty too,
// no stopper is created and NULL is returned.
//
// Exceptions raised by the Stopper constructor are rethrown with the message
// "Could not create Stopper using file name".
//
lemur::api::Stopper* lemur::api::TextHandlerManager::createStopper(string filename) {
  // fall back to the parameter stack when no name was passed in
  if (filename.empty())
    filename = ParamGetString("stopwords");

  // still nothing to load from
  if (filename.empty())
    return NULL;

  Stopper* created = NULL;
  try {
    created = new Stopper(filename);
  }
  catch (Exception &ex) {
    LEMUR_RETHROW(ex, "Could not create Stopper using file name");
  }

  return created;
}
void indri::collection::Repository::open(const std::string& path, indri::api::Parameters* options) { try { _path = path; _readOnly = false; _memory = defaultMemory; if (options) _memory = options->get("memory", _memory); float queryProportion = 0.75; if (options) queryProportion = static_cast<float>(options->get("queryProportion", queryProportion)); std::string indexPath = indri::file::Path::combine(path, "index"); std::string collectionPath = indri::file::Path::combine(path, "collection"); std::string indexName = indri::file::Path::combine(indexPath, "index"); _parameters.loadFile(indri::file::Path::combine(path, "manifest")); _buildFields(); _buildChain(_parameters, options); // open all indexes, add a memory index _openIndexes(_parameters, indexPath); _addMemoryIndex(); // remove that initial state (only disk indexes) _states.erase(_states.begin()); // open compressed collection _collection = new CompressedCollection(); _collection->open(collectionPath); // open priors _openPriors(path); // read deleted documents in std::string deletedName = indri::file::Path::combine(path, "deleted"); _deletedList.read(deletedName); _startThreads(); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Couldn't open a repository at '" + path + "' because:"); } catch(...) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Something unexpected happened while trying to create '" + path + "'"); } }
void indri::query::RelevanceModel::generate( const std::string& query, const std::vector<indri::api::ScoredExtentResult>& results ) { try { _results = results; _logtoposterior(_results); _grams.clear(); _extractDocuments(); _vectors = _environment.documentVectors( _documentIDs ); _countGrams(); _scoreGrams(); _sortGrams(); for (unsigned int i = 0; i < _vectors.size(); i++) delete _vectors[i]; } catch( lemur::api::Exception& e ) { LEMUR_RETHROW( e, "Couldn't generate relevance model for '" + query + "' because: " ); } }
// Runs the query, expanding it if necessary. Will print output as well if verbose is on. void _runQuery( std::stringstream& output, const std::string& query, const std::string &queryType ) { try { if( _printQuery ) output << "# query: " << query << std::endl; if( _printSnippets ) { _annotation = _environment.runAnnotatedQuery( query, _initialRequested ); _results = _annotation->getResults(); } else { _results = _environment.runQuery( query, _initialRequested, queryType ); } if( _expander ) { std::string expandedQuery = _expander->expand( query, _results ); if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl; _results = _environment.runQuery( expandedQuery, _requested, queryType ); } } catch( lemur::api::Exception& e ) { _results.clear(); LEMUR_RETHROW(e, "QueryThread::_runQuery Exception"); } }
//
// _openIndexes
//
// Creates a fresh active index state and opens every disk index listed
// under "indexes.index" in <params>.
//
// params     - repository manifest; "indexCount" is read with a default
//              of 0.
// parentPath - directory the index names are resolved against.
//
// Lemur exceptions are rethrown with a message naming this function.
//
void indri::collection::Repository::_openIndexes(indri::api::Parameters& params, const std::string& parentPath) {
  try {
    indri::api::Parameters container = params["indexes"];

    _active = new index_vector;
    _states.push_back(_active);
    _indexCount = params.get("indexCount", 0);

    if (container.exists("index")) {
      indri::api::Parameters indexes = container["index"];

      for(size_t i=0; i<indexes.size(); i++) {
        indri::api::Parameters indexSpec = indexes[i];
        // convert the name before allocating so nothing is held if it throws
        std::string indexName = (std::string) indexSpec;

        indri::index::DiskIndex* diskIndex = new indri::index::DiskIndex();

        try {
          diskIndex->open(parentPath, indexName);
          _active->push_back(diskIndex);
        } catch(...) {
          // the index never made it into _active, so nobody else will free it.
          // assumes ~DiskIndex copes with an index whose open() failed -- TODO confirm
          delete diskIndex;
          throw;
        }
      }
    }
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "_openIndexes: Couldn't open DiskIndexes because:");
  }
}
//
// nextDocument
//
// Extracts the text of the PDF file named by _documentPath into
// _documentTextBuffer and describes it in _unparsedDocument.
//
// When the PDF carries metadata containing an <rdf...> ... </rdf...> block,
// that block is parsed and passed to appendPdfMetaData(); if an author or
// title is set afterwards, a small <head> section is written in front of
// the page text.  Metadata without such a block is ignored.
//
// Returns 0 when no document path is pending, otherwise a pointer to the
// member _unparsedDocument.  _documentPath is emptied before returning, so
// an immediate second call returns 0.
//
// Throws a lemur exception ("Had trouble reading PDF metadata") when the
// metadata block cannot be parsed.
//
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  // NOTE(review): gfilename is never deleted here; presumably PDFDoc takes
  // ownership of it -- confirm against the PDF library in use
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );

  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);

    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // NOTE(review): readMetadata() is called twice and neither result is
      // released here; if it returns a freshly allocated GString both calls
      // leak -- confirm ownership before changing this
      if(doc->readMetadata()!=NULL) {
        GString rawMetaData = doc->readMetadata();
        GString preparedMetaData="";

        for(int x=0; x<rawMetaData.getLength(); x++) {
          if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
            //skip characters which the XMLReader doesn't understand
            preparedMetaData.append(rawMetaData.getChar(x));
          }
        }

        std::string metaData(preparedMetaData.getCString());

        // look for the span from the first "<rdf" to the '>' that ends the
        // last "</rdf" tag (the ':' of "rdf:RDF" was stripped above)
        const std::string::size_type rdfBegin = metaData.find("<rdf");
        const std::string::size_type rdfCloseTag = metaData.rfind("</rdf");
        std::string::size_type rdfEnd = std::string::npos;
        if( rdfCloseTag != std::string::npos )
          rdfEnd = metaData.find(">", rdfCloseTag);

        // without both markers there is no RDF block to parse; calling
        // substr() with a "not found" position would throw std::out_of_range
        if( rdfBegin != std::string::npos &&
            rdfEnd != std::string::npos &&
            rdfEnd >= rdfBegin ) {
          metaData = metaData.substr( rdfBegin, (rdfEnd - rdfBegin) + 1 );

          indri::xml::XMLReader reader;
          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // the rethrow leaves this function, so release the PDF objects
            // that are normally deleted further down
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";

            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }

            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }

            createdPdfHeader+="</head>\n";

            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }

      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  // no null 0 if text is empty.
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ;

  // the path is stored after the text, in the same buffer
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );

  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  _documentPath = "";

  return &_unparsedDocument;
}
void indri::collection::Repository::create(const std::string& path, indri::api::Parameters* options) { _path = path; _readOnly = false; try { _cleanAndCreateDirectory(path); _memory = defaultMemory; if (options) _memory = options->get("memory", _memory); float queryProportion = 0.15f; if (options) queryProportion = static_cast<float>(options->get("queryProportion", queryProportion)); if (options) _copyParameters(*options); _buildFields(); _buildChain(_parameters, 0); std::string indexPath = indri::file::Path::combine(path, "index"); std::string collectionPath = indri::file::Path::combine(path, "collection"); if (!indri::file::Path::exists(indexPath)) indri::file::Path::create(indexPath); std::string indexName = indri::file::Path::combine(indexPath, "index"); _active = new index_vector; _states.push_back(_active); _active->push_back(new indri::index::MemoryIndex(1, _indexFields)); _indexCount = 0; _collection = new CompressedCollection(); if (!indri::file::Path::exists(collectionPath)) indri::file::Path::create(collectionPath); std::vector<std::string> forwardFields; std::vector<std::string> backwardFields; if (options && options->exists("collection.forward")) { indri::api::Parameters cfields = options->get("collection.forward"); for(size_t i=0; i<cfields.size(); i++) { forwardFields.push_back((std::string) cfields[i]); } } if (options && options->exists("collection.backward")) { indri::api::Parameters cfields = options->get("collection.backward"); for(size_t i=0; i<cfields.size(); i++) { backwardFields.push_back((std::string) cfields[i]); } } _collection->create(collectionPath, forwardFields, backwardFields, options->get("storeDocs", true)); _startThreads(); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Couldn't create a repository at '" + path + "' because:"); } catch(...) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Something unexpected happened while trying to create '" + path + "'"); } }
void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) { // Create the directory for the output index _cleanAndCreateDirectory(path); std::string indexPath = indri::file::Path::combine(path, "index"); std::string collectionPath = indri::file::Path::combine(path, "collection"); // First, we're going to harvest information from the individual indexes. We want to // check a few things: // 1. do they all use the same stemmer? // 2. do they all have the same indexed fields? // 3. are they all merged (only have one disk index?) // 4. how many documents are in each one? // If no indexes are given, make an empty repository and return if (inputIndexes.size() == 0) { makeEmpty(path); return; } std::vector<lemur::api::DOCID_T> documentMaximums; // Open up the first repository and extract field information Repository firstRepository; try { firstRepository.openRead(inputIndexes[0]); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::vector<Field> indexFields = firstRepository.fields(); firstRepository.close(); // Open up the first manifest and check on stemming and fields indri::api::Parameters firstManifest; std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest"); try { firstManifest.loadFile(firstManifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::string stemmerName = _stemmerName(firstManifest); std::vector<std::string> fieldNames = _fieldNames(firstManifest); // Now, gather information about the indexes for(size_t i=0; i<inputIndexes.size(); i++) { indri::api::Parameters repositoryManifest; std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest"); try { repositoryManifest.loadFile(manifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]); } if 
(!repositoryManifest.exists("indexes.index")) { documentMaximums.push_back(0); continue; } // Check to make sure there's only one index in there size_t indexCount = repositoryManifest["indexes.index"].size(); if (indexCount > 1) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]); } // How many documents are in this one? indri::index::DiskIndex diskIndex; std::string basePath = indri::file::Path::combine(inputIndexes[i], "index"); std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]); diskIndex.open(basePath, relativePath); documentMaximums.push_back(diskIndex.documentMaximum()); diskIndex.close(); // Only check successive indexes against the first one if (i == 0) continue; // Verify that the same fields and stemmers are used if (stemmerName != _stemmerName(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]); } if (fieldNames != _fieldNames(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]); } } std::vector<std::string> usableIndexes = inputIndexes; // remove any repositories that have no documents for(size_t i=0; i<usableIndexes.size(); i++) { if (documentMaximums[i] == 0) { documentMaximums.erase(documentMaximums.begin() + i); usableIndexes.erase(usableIndexes.begin() + i); i--; } } // now that we've removed empty indexes, are there any left? if (usableIndexes.size() == 0) { makeEmpty(path); return; } // 2. merge the deleted bitmaps _mergeBitmaps(path, usableIndexes, documentMaximums); // 3. merge compressed collections _mergeCompressedCollections(path, usableIndexes, documentMaximums); // 4. merge the indexes _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums); // 5. write the manifest file _writeMergedManifest(path, firstManifest); }
// Configures the query environment and the output options of this object
// from _parameters: background model, stopwords, scoring rules, indexes,
// servers, wildcard limit, result counts, output format flags and, when
// "fbDocs" is non-zero, a query expander.
//
// Returns 0.
//
// NOTE(review): in the catch block the rethrow sits inside the while loop,
// so at most one queued query is turned into an error message before the
// exception leaves this function, and when _queries is empty the exception
// is swallowed and 0 is returned.  The popped query_t is not deleted here.
// Confirm whether this is intended.
UINT64 initialize() {
  try {
    _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

    // stopwords and smoothing rules are only applied when the helper reports
    // that it copied something
    std::vector<std::string> stopwords;
    if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
      _environment.setStopwords(stopwords);

    std::vector<std::string> smoothingRules;
    if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
      _environment.setScoringRules( smoothingRules );

    // add every "index" entry to the environment
    if( _parameters.exists( "index" ) ) {
      indri::api::Parameters indexes = _parameters["index"];

      for( size_t i=0; i < indexes.size(); i++ ) {
        _environment.addIndex( std::string(indexes[i]) );
      }
    }

    // add every "server" entry to the environment
    if( _parameters.exists( "server" ) ) {
      indri::api::Parameters servers = _parameters["server"];

      for( size_t i=0; i < servers.size(); i++ ) {
        _environment.addServer( std::string(servers[i]) );
      }
    }

    if( _parameters.exists("maxWildcardTerms") )
      _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

    // result counts: "fbDocs" defaults to the value of "count"
    _requested = _parameters.get( "count", 1000 );
    _initialRequested = _parameters.get( "fbDocs", _requested );
    _runID = _parameters.get( "runID", "indri" );
    _trecFormat = _parameters.get( "trecFormat" , false );
    _inexFormat = _parameters.exists( "inex" );

    _printQuery = _parameters.get( "printQuery", false );
    _printDocuments = _parameters.get( "printDocuments", false );
    _printPassages = _parameters.get( "printPassages", false );
    _printSnippets = _parameters.get( "printSnippets", false );

    if (_parameters.exists("baseline")) {
      // a baseline method was requested; pass its name to the environment
      std::string baseline = _parameters["baseline"];
      _environment.setBaseline(baseline);
      // need a factory for this...
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        // store "method:<baseline>" as the "rule" parameter before the
        // expander is constructed from _parameters
        std::string rule = "method:" + baseline;
        _parameters.set("rule", rule);
        _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
      }
    } else {
      // no baseline: an RMExpander is created when feedback documents are requested
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        _expander = new indri::query::RMExpander( &_environment, _parameters );
      }
    }

    // NOTE(review): the wildcard limit was already set above from the same
    // parameter; this second call looks redundant -- confirm
    if (_parameters.exists("maxWildcardTerms")) {
      _environment.setMaxWildcardTerms((int)_parameters.get("maxWildcardTerms"));
    }
  } catch ( lemur::api::Exception& e ) {
    // report the failure on the output queue for a pending query, wake any
    // waiters, then rethrow (see the review note in the header comment)
    while( _queries.size() ) {
      query_t *query = _queries.front();
      _queries.pop();
      _output.push( new query_t( query->index, query->number, "query: " + query->number + " QueryThread::_initialize exception\n" ) );
      _queueEvent.notifyAll();
      LEMUR_RETHROW(e, "QueryThread::_initialize");
    }
  }
  return 0;
}