Ejemplo n.º 1
0
void indri::api::Parameters::loadFile( const std::string& filename ) {
  std::ifstream input;
  indri::xml::XMLReader reader;
  
  input.open( filename.c_str(), std::ifstream::in );

  if( input.rdstate() & std::ios::failbit )
    LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open parameter file '" + filename + "' for reading." );

  input.seekg( 0, std::ios::end );
  size_t length = input.tellg();
  input.seekg( 0, std::ios::beg );
  // null terminate it to make a string in the XML reader for comment strip
  char* buffer = new char[length + 1];
  buffer[length] = '\0';

  try {
    input.read( buffer, length );
    std::auto_ptr<indri::xml::XMLNode> result( reader.read( buffer, length ) );

    _loadXML( result.get() );
  } catch( lemur::api::Exception& e ) {
    LEMUR_RETHROW( e, "Had trouble parsing parameter file '" + filename + "'" );
  }

  delete[] buffer;
  input.close();
}
Ejemplo n.º 2
0
//
// _findClosingTag
//
// Scans <buffer> from <start> looking for the closing tag that ends the
// element named <openingTagName>.
//
// buffer         - text being parsed.
// start          - offset at which the scan begins.
// finish         - upper bound passed through to _findBeginTag/_readTag.
// openingTagName - name of the element whose closing tag is wanted.
// tagsBetween    - optional out-parameter; set to false on entry and to
//                  true as soon as any tag that is neither a close tag nor
//                  a CDATA section is seen before the closing tag.
//
// Returns the value _findBeginTag() produced for the closing tag when that
// tag's name equals <openingTagName>, and -1 when the first unbalanced
// closing tag has a different name.  Any lemur::api::Exception raised by
// the helpers is rethrown with the element name added to the message.
//
int indri::xml::XMLReader::_findClosingTag( const char* buffer, int start, int finish, std::string& openingTagName, bool* tagsBetween ) {
  // running totals of nested open/close tags seen after the first inner tag
  int openingTags = 0;
  int closingTags = 0;
  int position = start;
  bool done = false;
  bool match = false;
  // written by _readTag() through the pointer passed below
  int tagType;

  if( tagsBetween )
    *tagsBetween = false;
  try {
    while( !done ) {
      std::string tagName;
      position = _findBeginTag( buffer, position, finish );
      int end = _readTag( buffer, position, finish, &tagName, NULL, &tagType );

      if( tagType == TAG_CDATA_TYPE ) {
        // skip the CDATA payload by searching for its terminator.
        // NOTE(review): this copies the remainder of the buffer from <end>
        // (up to the first NUL, ignoring <finish>), and a missing "]]>"
        // leaves dataEnd == npos without any check -- confirm intended.
        std::string cdata = &buffer[end];
        std::string::size_type dataEnd = cdata.find("]]>");
        position = end + dataEnd + 1;
      } else if( tagType != TAG_CLOSE_TYPE ) {
        // a child tag: remember that the element has nested markup
        if( tagsBetween )
          *tagsBetween = true;

        // only TAG_OPEN_TYPE tags need a matching close; for any other
        // type the balancing loop below is skipped
        if( tagType == TAG_OPEN_TYPE )
          openingTags++;
        position = end;

        while( openingTags > closingTags ) {
          // don't need to check for matching tags here, we just need to
          // count open and closed tags
          position = _findBeginTag( buffer, position, finish );
          end = _readTag( buffer, position, finish, NULL, NULL, &tagType );
          position = end;

          if( tagType == TAG_CDATA_TYPE ) {
            // same CDATA skip as above (same review caveats apply)
            std::string cdata = &buffer[end];
            std::string::size_type dataEnd = cdata.find("]]>");
            position = end + dataEnd + 1;
          } else if( tagType == TAG_OPEN_TYPE ) {
            openingTags++;
          } else if( tagType == TAG_CLOSE_TYPE ) {
            closingTags++;
          }
        }
      } else {
        // first close tag found while all nested tags are balanced:
        // <position> still holds the _findBeginTag() result for this tag
        match = (tagName == openingTagName);
        done = true;
      }
    }
  } catch( lemur::api::Exception& e ) {
    LEMUR_RETHROW( e, std::string() + "Caught an error while looking for an end tag for '" + openingTagName + "'" );
  }

  if( match ) {
    return position;
  } else {
    return -1;
  }
}
Ejemplo n.º 3
0
// Creates a stemmer selected by name.
//
// type    - stemmer identifier, compared case-insensitively against
//           KStemmer::identifier and PorterStemmer::identifier.  When empty,
//           the "stemmer" parameter is read with ParamGetString() instead.
// datadir - not used by this implementation.
// func    - not used by this implementation.
//
// Returns a newly allocated stemmer, or NULL when no type is available or
// the type matches neither identifier.  An Exception raised while
// constructing the stemmer is rethrown with "Could not create Stemmer".
lemur::api::Stemmer* lemur::api::TextHandlerManager::createStemmer(string type, string datadir, string func) {
  Stemmer *stemmer = NULL;
  if (type.empty()) {
    // didn't pass in type, try to get it from the paramstack
    type = ParamGetString("stemmer");
  }

  // if it's still empty, return nothing
  if (type.empty())
    return NULL;

  // make it all lowercase; tolower() requires a value representable as
  // unsigned char, so a plain (possibly negative) char must be converted first
  for (string::size_type i = 0; i < type.length(); i++)
    type[i] = static_cast<char>(tolower(static_cast<unsigned char>(type[i])));

  try {
    if (type == lemur::parse::KStemmer::identifier) {
      stemmer = new lemur::parse::KStemmer();
    } else if (type == lemur::parse::PorterStemmer::identifier) {
      stemmer = new lemur::parse::PorterStemmer();
    }
  } catch (Exception &ex) {
    LEMUR_RETHROW(ex, "Could not create Stemmer");
  }

  return stemmer;
}
Ejemplo n.º 4
0
void indri::api::Parameters::load( const std::string& text ) {
  indri::xml::XMLReader reader;

  try {
    std::auto_ptr<indri::xml::XMLNode> result( reader.read( text ) );
    _loadXML( result.get() );
  } catch( lemur::api::Exception& e ) {
    LEMUR_RETHROW( e, "Had trouble parsing parameter text" );
  }
}
Ejemplo n.º 5
0
  // Runs the query, expanding it if necessary.  Will print output as well if verbose is on.
  void _runQuery( std::stringstream& output, const std::string& query,
                  const std::string &queryType, const std::vector<std::string> &workingSet, std::vector<std::string> relFBDocs ) {
    try {
      if( _printQuery ) output << "# query: " << query << std::endl;
      std::vector<lemur::api::DOCID_T> docids;;
      if (workingSet.size() > 0) 
        docids = _environment.documentIDsFromMetadata("docno", workingSet);

      if (relFBDocs.size() == 0) {
          if( _printSnippets ) {
            if (workingSet.size() > 0) 
              _annotation = _environment.runAnnotatedQuery( query, docids, _initialRequested, queryType ); 
            else
              _annotation = _environment.runAnnotatedQuery( query, _initialRequested );
            _results = _annotation->getResults();
          } else {
            if (workingSet.size() > 0)
              _results = _environment.runQuery( query, docids, _initialRequested, queryType );
            else
              _results = _environment.runQuery( query, _initialRequested, queryType );
          }
      }
      
      if( _expander ) {
        std::vector<indri::api::ScoredExtentResult> fbDocs;
        if (relFBDocs.size() > 0) {
          docids = _environment.documentIDsFromMetadata("docno", relFBDocs);
          for (size_t i = 0; i < docids.size(); i++) {
            indri::api::ScoredExtentResult r(0.0, docids[i]);
            fbDocs.push_back(r);
          }
        }
        std::string expandedQuery;
        if (relFBDocs.size() != 0)
          expandedQuery = _expander->expand( query, fbDocs );
        else
          expandedQuery = _expander->expand( query, _results );
        if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl;
        if (workingSet.size() > 0) {
          docids = _environment.documentIDsFromMetadata("docno", workingSet);
          _results = _environment.runQuery( expandedQuery, docids, _requested, queryType );
        } else {
          _results = _environment.runQuery( expandedQuery, _requested, queryType );
        }
      }
    }
    catch( lemur::api::Exception& e )
    {
      _results.clear();
      LEMUR_RETHROW(e, "QueryThread::_runQuery Exception");
    }
  }
Ejemplo n.º 6
0
// Fetches the next record for callers that use integer keys.
//
// key         - receives the decoded integer key when a record was fetched
//               (and also on the error path, before the rethrow).
// value       - buffer passed through to the character-key overload.
// valueLength - passed through to the character-key overload.
//
// Returns the result of the character-key next() overload.  A
// lemur::api::Exception raised by that overload is rethrown with extra
// context.
bool lemur::file::Keyfile::next( int& key, char* value, int& valueLength ) {
  // zero-filled so the error path below never decodes indeterminate bytes
  // when the underlying call throws before writing the key
  char keyBuf[KEYFILE_KEYBUF_SIZE] = { 0 };
  int keyLength = KEYFILE_KEYBUF_SIZE;
  bool result = false;

  try {
    result = next( keyBuf, keyLength, value, valueLength );
  } catch (lemur::api::Exception &e) {
    key = _decodeKey( keyBuf );
    LEMUR_RETHROW( e, "Caught an internal error while trying to fetch next record with an int key." );
  }

  if( result )
    key = _decodeKey( keyBuf );
  return result;
}
Ejemplo n.º 7
0
// Builds a Stopper from a stopword file.
//
// filename - stopword file to load; when empty, the "stopwords" parameter
//            is read with ParamGetString() instead.
//
// Returns a newly allocated Stopper, or NULL when no file name is
// available.  An Exception raised by the Stopper constructor is rethrown
// with "Could not create Stopper using file name".
lemur::api::Stopper* lemur::api::TextHandlerManager::createStopper(string filename) {
  // fall back to the parameter stack when the caller gave no name
  if (filename.empty())
    filename = ParamGetString("stopwords");

  // nothing to load from: no stopper
  if (filename.empty())
    return NULL;

  Stopper* created = NULL;
  try {
    created = new Stopper(filename);
  } catch (Exception &failure) {
    LEMUR_RETHROW(failure, "Could not create Stopper using file name");
  }

  return created;
}
Ejemplo n.º 8
0
//
// open
//
// Opens the repository stored at <path> for reading and writing
// (_readOnly is set to false).
//
// path    - root directory of the repository; the manifest, index,
//           collection, priors and deleted list are located beneath it.
// options - optional settings (may be null).  "memory" overrides the
//           default memory budget; the object is also passed to
//           _buildChain().
//
// A lemur::api::Exception is rethrown with the repository path added; any
// other exception is converted to a LEMUR_RUNTIME_ERROR.
//
void indri::collection::Repository::open(const std::string& path, indri::api::Parameters* options) {
  try {
    _path = path;
    _readOnly = false;

    _memory = defaultMemory;
    if (options)
      _memory = options->get("memory", _memory);

    std::string indexPath = indri::file::Path::combine(path, "index");
    std::string collectionPath = indri::file::Path::combine(path, "collection");

    _parameters.loadFile(indri::file::Path::combine(path, "manifest"));

    _buildFields();
    _buildChain(_parameters, options);

    // open all indexes, add a memory index
    _openIndexes(_parameters, indexPath);
    _addMemoryIndex();

    // remove that initial state (only disk indexes)
    _states.erase(_states.begin());

    // open compressed collection
    _collection = new CompressedCollection();
    _collection->open(collectionPath);

    // open priors
    _openPriors(path);

    // read deleted documents in
    std::string deletedName = indri::file::Path::combine(path, "deleted");
    _deletedList.read(deletedName);

    _startThreads();
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Couldn't open a repository at '" + path + "' because:");
  } catch(...) {
    // this is the open path, so the message says "open" rather than "create"
    LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Something unexpected happened while trying to open '" + path + "'");
  }
}
Ejemplo n.º 9
0
//
// generate
//
// Builds the relevance model from a ranked list: stores <results>, runs
// _logtoposterior() on them, fetches the document vectors for
// _documentIDs, then counts, scores and sorts the grams.
//
// query   - original query text; used only in the error message.
// results - retrieval results the model is estimated from.
//
// The document vectors fetched here are deleted before returning.  If a
// lemur::api::Exception interrupts the computation after they were
// fetched, they are deleted as well before the exception is rethrown.
//
void indri::query::RelevanceModel::generate( const std::string& query, const std::vector<indri::api::ScoredExtentResult>& results  ) {
  // true once _vectors holds pointers obtained during this call; guards the
  // error path against touching stale pointers from an earlier call
  bool vectorsFetched = false;

  try {
    _results = results;
    _logtoposterior(_results);
    _grams.clear();
    _extractDocuments();
    _vectors = _environment.documentVectors( _documentIDs );
    vectorsFetched = true;

    _countGrams();
    _scoreGrams();
    _sortGrams();
  } catch( lemur::api::Exception& e ) {
    if( vectorsFetched ) {
      // the rethrow below skips the normal cleanup, so release them here
      for( size_t i = 0; i < _vectors.size(); i++ )
        delete _vectors[i];
      _vectors.clear();
    }
    LEMUR_RETHROW( e, "Couldn't generate relevance model for '" + query + "' because: " );
  }

  // the vectors are only needed while the grams are being computed
  for( size_t i = 0; i < _vectors.size(); i++ )
    delete _vectors[i];
}
Ejemplo n.º 10
0
  // Runs the query, expanding it if necessary.  Will print output as well if verbose is on.
  void _runQuery( std::stringstream& output, const std::string& query,
                  const std::string &queryType ) {
    try {
      if( _printQuery ) output << "# query: " << query << std::endl;

      if( _printSnippets ) {
        _annotation = _environment.runAnnotatedQuery( query, _initialRequested );
        _results = _annotation->getResults();
      } else {
        _results = _environment.runQuery( query, _initialRequested, queryType );
      }

      if( _expander ) {
        std::string expandedQuery = _expander->expand( query, _results );
        if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl;
        _results = _environment.runQuery( expandedQuery, _requested, queryType );
      }
    }
    catch( lemur::api::Exception& e )
    {
      _results.clear();
      LEMUR_RETHROW(e, "QueryThread::_runQuery Exception");
    }
  }
Ejemplo n.º 11
0
//
// _openIndexes
//
// Creates a fresh active index list, registers it in _states, and opens
// every disk index named under "indexes.index" in <params>.
//
// params     - repository manifest; "indexes" and "indexCount" are read.
// parentPath - directory the index names are relative to.
//
// A lemur::api::Exception is rethrown with extra context.  A DiskIndex that
// fails to open is deleted rather than leaked.
//
void indri::collection::Repository::_openIndexes(indri::api::Parameters& params, const std::string& parentPath) {
  try {
    indri::api::Parameters container = params["indexes"];

    _active = new index_vector;
    _states.push_back(_active);
    _indexCount = params.get("indexCount", 0);

    if (container.exists("index")) {
      indri::api::Parameters indexes = container["index"];

      for(size_t i=0; i<indexes.size(); i++) {
        indri::api::Parameters indexSpec = indexes[i];
        // resolve the name before allocating so a failed conversion cannot
        // strand a freshly allocated index
        std::string indexName = (std::string) indexSpec;

        indri::index::DiskIndex* diskIndex = new indri::index::DiskIndex();
        try {
          diskIndex->open(parentPath, indexName);
        } catch(...) {
          // not yet owned by _active, so nobody else would free it
          delete diskIndex;
          throw;
        }

        _active->push_back(diskIndex);
      }
    }
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "_openIndexes: Couldn't open DiskIndexes because:");
  }
}
Ejemplo n.º 12
0
//
// nextDocument
//
// Extracts the text of the PDF at _documentPath into _documentTextBuffer
// and returns it as an UnparsedDocument with "path" and "docno" metadata.
// When the PDF carries metadata, an optional <head> block with the title
// and author is written ahead of the page text.
//
// Returns 0 when no document path is pending; otherwise a pointer to the
// member _unparsedDocument.  _documentPath is cleared before returning, so
// a second call without a new path returns 0.
//
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  // NOTE(review): gfilename is never deleted here -- presumably PDFDoc takes
  // ownership of it; confirm against the PDF library in use.
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );
  // if the doc is not ok, or not ok to copy, nothing is extracted and the
  // resulting document has length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    // extracted text is delivered through buffer_write into _documentTextBuffer
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
    if ( textOut->isOk() ) {
      // arguments for displayPages() below: all pages, 72 DPI, no rotation
      int firstPage = 1;
      int lastPage = doc->getNumPages();
	  double hDPI=72.0;
	  double vDPI=72.0;
	  int rotate=0;
	  GBool useMediaBox=gFalse;
	  GBool crop=gTrue; 
	  GBool printing=gFalse; 
	  // NOTE(review): readMetadata() is called twice (here and just below)
	  // and neither result is freed -- if it returns a caller-owned object
	  // this leaks; confirm.
	  if(doc->readMetadata()!=NULL)
	  {
		  GString rawMetaData = doc->readMetadata();
		  GString preparedMetaData="";

		  // find <rdf:RDF and end at </rdf:RDF>!!
		  // (the loop itself drops every '?' and ':' character, so the
		  // searches below look for "<rdf" / "</rdf" without the colon)
		  for(int x=0; x<rawMetaData.getLength(); x++) {
			  if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
				  //skip characters which the XMLReader doesn't understand
				  preparedMetaData.append(rawMetaData.getChar(x));
			  }
		  }
		  std::string metaData(preparedMetaData.getCString());
		  // NOTE(review): neither find() result is checked against npos
		  // before being used as a substr() bound -- verify behavior for
		  // metadata that lacks an rdf element.
		  int startbegin = metaData.find("<rdf");
		  int stopend = metaData.find(">", metaData.rfind("</rdf") );
		  metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
	  

     	  indri::xml::XMLReader reader;

		  try {
			  std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
			  appendPdfMetaData( result.get() );
		  } catch( lemur::api::Exception& e ) {
			// NOTE(review): leaving via this rethrow skips the
			// delete textOut / delete doc further down -- confirm.
			LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
		  } 
		  // _author/_title are presumably filled in by appendPdfMetaData()
		  if( _author.length()>0 || _title.length()>0 )
		  {
			// emit a small <head> block ahead of the page text
			std::string createdPdfHeader;
			createdPdfHeader="<head>\n";
			if(_title.length()>0) {
				createdPdfHeader+="<title>";
				createdPdfHeader+=_title;
				createdPdfHeader+="</title>\n";
			}
			if(_author.length()>0) {
				createdPdfHeader+="<author>";
				createdPdfHeader+=_author;
				createdPdfHeader+="</author>\n";
			}
			createdPdfHeader+="</head>\n";
			// reserve header length plus the terminating NUL written by strcpy
			char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
			strcpy(metastream, createdPdfHeader.c_str());
		  }
	  }
      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }
  

  delete textOut;
  delete doc;

  // everything written to the buffer so far is the document text
  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
  // the document path is stored in the same buffer, after the text
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  // "path" metadata: the raw document path (length includes the NUL)
  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  // "docno" metadata: the path after cleanDocno() has processed _docnostring
  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  // mark the path as consumed so the next call returns 0
  _documentPath = "";

  return &_unparsedDocument;
}
Ejemplo n.º 13
0
//
// create
//
// Creates a new, empty repository at <path>: prepares the directory, builds
// the field list and processing chain, installs a single MemoryIndex as the
// active index and creates the compressed collection.
//
// path    - directory in which the repository is created.
// options - optional settings (may be null).  Keys read here: "memory",
//           "collection.forward", "collection.backward" and "storeDocs"
//           (defaults to true); the whole object is also copied with
//           _copyParameters().
//
// A lemur::api::Exception is rethrown with the repository path added; any
// other exception is converted to a LEMUR_RUNTIME_ERROR.
//
void indri::collection::Repository::create(const std::string& path, indri::api::Parameters* options) {
  _path = path;
  _readOnly = false;

  try {
    _cleanAndCreateDirectory(path);

    _memory = defaultMemory;
    if (options)
      _memory = options->get("memory", _memory);

    if (options)
      _copyParameters(*options);

    _buildFields();
    _buildChain(_parameters, 0);

    std::string indexPath = indri::file::Path::combine(path, "index");
    std::string collectionPath = indri::file::Path::combine(path, "collection");

    if (!indri::file::Path::exists(indexPath))
      indri::file::Path::create(indexPath);

    // a new repository starts with one in-memory index and no disk indexes
    _active = new index_vector;
    _states.push_back(_active);
    _active->push_back(new indri::index::MemoryIndex(1, _indexFields));
    _indexCount = 0;

    _collection = new CompressedCollection();

    if (!indri::file::Path::exists(collectionPath))
      indri::file::Path::create(collectionPath);

    std::vector<std::string> forwardFields;
    std::vector<std::string> backwardFields;

    if (options && options->exists("collection.forward")) {
      indri::api::Parameters cfields = options->get("collection.forward");

      for(size_t i=0; i<cfields.size(); i++) {
        forwardFields.push_back((std::string) cfields[i]);
      }
    }

    if (options && options->exists("collection.backward")) {
      indri::api::Parameters cfields = options->get("collection.backward");

      for(size_t i=0; i<cfields.size(); i++) {
        backwardFields.push_back((std::string) cfields[i]);
      }
    }

    // options may be null, so it must not be dereferenced unconditionally
    bool storeDocs = true;
    if (options)
      storeDocs = options->get("storeDocs", storeDocs);

    _collection->create(collectionPath, forwardFields, backwardFields, storeDocs);

    _startThreads();
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Couldn't create a repository at '" + path + "' because:");
  } catch(...) {
    LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Something unexpected happened while trying to create '" + path + "'");
  }
}
Ejemplo n.º 14
0
//
// merge
//
// Merges the repositories listed in <inputIndexes> into a new repository
// at <path>.
//
// path         - output directory; it is cleaned and recreated first.
// inputIndexes - paths of the repositories to merge.
//
// Field information and the manifest written at the end are taken from the
// first input repository.  Every other input must report the same stemmer
// name and field names, and no input may contain more than one disk index;
// otherwise a LEMUR_RUNTIME_ERROR is raised.  Inputs whose document maximum
// is 0 are skipped.  If no usable input remains, makeEmpty(path) is called
// instead of merging.
//
void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) {
  // Create the directory for the output index
  _cleanAndCreateDirectory(path);

  // NOTE(review): indexPath and collectionPath are not used below in this
  // function -- presumably left over; confirm before removing.
  std::string indexPath = indri::file::Path::combine(path, "index");
  std::string collectionPath = indri::file::Path::combine(path, "collection");

  // First, we're going to harvest information from the individual indexes.  We want to 
  // check a few things:
  //    1. do they all use the same stemmer?
  //    2. do they all have the same indexed fields?
  //    3. are they all merged (only have one disk index?)
  //    4. how many documents are in each one?

  // If no indexes are given, make an empty repository and return
  if (inputIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  // documentMaximums[i] is the document maximum of inputIndexes[i]
  // (0 when that repository has no disk index)
  std::vector<lemur::api::DOCID_T> documentMaximums;

  // Open up the first repository and extract field information
  Repository firstRepository;
  try {
    firstRepository.openRead(inputIndexes[0]);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }
  std::vector<Field> indexFields = firstRepository.fields();
  firstRepository.close();

  // Open up the first manifest and check on stemming and fields
  indri::api::Parameters firstManifest;
  std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest");
  try {
    firstManifest.loadFile(firstManifestPath);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }

  // reference values every later repository is compared against
  std::string stemmerName = _stemmerName(firstManifest);
  std::vector<std::string> fieldNames = _fieldNames(firstManifest);

  // Now, gather information about the indexes
  // (the first repository's manifest is loaded again here for i == 0)
  for(size_t i=0; i<inputIndexes.size(); i++) {
    indri::api::Parameters repositoryManifest;
    std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest");

    try {
      repositoryManifest.loadFile(manifestPath);
    } catch(lemur::api::Exception& e) {
      LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]);
    }

    // no disk index recorded in the manifest: treat as an empty repository
    if (!repositoryManifest.exists("indexes.index")) {
      documentMaximums.push_back(0);
      continue;
    }

    // Check to make sure there's only one index in there
    size_t indexCount = repositoryManifest["indexes.index"].size();

    if (indexCount > 1) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]);
    }

    // How many documents are in this one?
    indri::index::DiskIndex diskIndex;
    std::string basePath = indri::file::Path::combine(inputIndexes[i], "index");
    // the manifest value is read as an integer and converted back to text
    // to form the index's relative path
    std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]);
    diskIndex.open(basePath, relativePath);

    documentMaximums.push_back(diskIndex.documentMaximum());
    diskIndex.close();

    // Only check successive indexes against the first one
    if (i == 0)
      continue;

    // Verify that the same fields and stemmers are used
    if (stemmerName != _stemmerName(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]);
    }

    if (fieldNames != _fieldNames(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]);
    }
  } 
  
  std::vector<std::string> usableIndexes = inputIndexes;
  
  // remove any repositories that have no documents
  // (usableIndexes and documentMaximums are erased in lockstep so their
  // positions keep matching)
  for(size_t i=0; i<usableIndexes.size(); i++) {
    if (documentMaximums[i] == 0) {
        documentMaximums.erase(documentMaximums.begin() + i);
        usableIndexes.erase(usableIndexes.begin() + i);
        // revisit the same position on the next pass; for i == 0 the
        // unsigned value wraps and the loop's i++ brings it back to 0
        i--;
    }
  }      
  
  // now that we've removed empty indexes, are there any left?
  if (usableIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  // 2. merge the deleted bitmaps
  _mergeBitmaps(path, usableIndexes, documentMaximums);

  // 3. merge compressed collections
  _mergeCompressedCollections(path, usableIndexes, documentMaximums);

  // 4. merge the indexes
  _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums);

  // 5. write the manifest file
  _writeMergedManifest(path, firstManifest);
}
Ejemplo n.º 15
0
  // Configures the query environment from _parameters: background model,
  // stopwords, scoring rules, indexes and servers, wildcard limit, result
  // counts, output options and (when "fbDocs" is non-zero) a query expander.
  //
  // Returns 0.  If a lemur::api::Exception is raised during set-up, the
  // handler at the bottom reports it through the output queue and rethrows
  // (see the review note there).
  UINT64 initialize() {
    try {
      _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

      std::vector<std::string> stopwords;
      if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
        _environment.setStopwords(stopwords);

      std::vector<std::string> smoothingRules;
      if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
        _environment.setScoringRules( smoothingRules );

      if( _parameters.exists( "index" ) ) {
        indri::api::Parameters indexes = _parameters["index"];

        for( size_t i=0; i < indexes.size(); i++ ) {
          _environment.addIndex( std::string(indexes[i]) );
        }
      }

      if( _parameters.exists( "server" ) ) {
        indri::api::Parameters servers = _parameters["server"];

        for( size_t i=0; i < servers.size(); i++ ) {
          _environment.addServer( std::string(servers[i]) );
        }
      }

      // applied once, after the indexes and servers have been added
      if( _parameters.exists("maxWildcardTerms") )
        _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

      _requested = _parameters.get( "count", 1000 );
      _initialRequested = _parameters.get( "fbDocs", _requested );
      _runID = _parameters.get( "runID", "indri" );
      _trecFormat = _parameters.get( "trecFormat" , false );
      _inexFormat = _parameters.exists( "inex" );

      _printQuery = _parameters.get( "printQuery", false );
      _printDocuments = _parameters.get( "printDocuments", false );
      _printPassages = _parameters.get( "printPassages", false );
      _printSnippets = _parameters.get( "printSnippets", false );

      if (_parameters.exists("baseline")) {
        // doing a baseline
        std::string baseline = _parameters["baseline"];
        _environment.setBaseline(baseline);
        // need a factory for this...
        if( _parameters.get( "fbDocs", 0 ) != 0 ) {
          // have to push the method in...
          std::string rule = "method:" + baseline;
          _parameters.set("rule", rule);
          _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
        }
      } else {
        if( _parameters.get( "fbDocs", 0 ) != 0 ) {
          _expander = new indri::query::RMExpander( &_environment, _parameters );
        }
      }
    } catch ( lemur::api::Exception& e ) {
      // NOTE(review): the rethrow sits inside the loop, so only the first
      // pending query receives an error result, and nothing is rethrown when
      // the queue is empty.  The popped query_t is also not deleted here.
      // Logic kept as-is -- confirm the intended behavior before changing.
      while( _queries.size() ) {
        query_t *query = _queries.front();
        _queries.pop();
        _output.push( new query_t( query->index, query->number, "query: " + query->number + " QueryThread::_initialize exception\n" ) );
        _queueEvent.notifyAll();
        LEMUR_RETHROW(e, "QueryThread::_initialize");
      }
    }
    return 0;
  }