// Constructs the transformation and seeds a freshly allocated KrovetzStemmer
// with optional table entries taken from the parameters:
//   "h"          - list of words, each added with an empty second argument
//   "conflation" - list of entries, each providing "variant" and "word"
indri::parse::KrovetzStemmerTransformation::KrovetzStemmerTransformation( indri::api::Parameters& parameters ) {
  stemmer = new KrovetzStemmer();

  // no stem buffer yet
  _stemBuffer = 0;
  _stemBufferSize = 0;

  indri::api::Parameters headwordSpecs;
  indri::api::Parameters conflationSpecs;

  // pick up whichever of the two optional lists were supplied
  if( parameters.exists( "h" ) )
    headwordSpecs = parameters["h"];

  if( parameters.exists( "conflation" ) )
    conflationSpecs = parameters["conflation"];

  // entries from the "h" list are registered with an empty word
  size_t headwordIndex = 0;
  while( headwordIndex < headwordSpecs.size() ) {
    std::string headword = std::string(headwordSpecs[headwordIndex]);
    stemmer->kstem_add_table_entry( headword.c_str(), "" );
    headwordIndex++;
  }

  // conflations register a variant together with its word
  size_t conflationIndex = 0;
  while( conflationIndex < conflationSpecs.size() ) {
    std::string variantText = std::string(conflationSpecs[conflationIndex]["variant"]);
    std::string wordText = std::string(conflationSpecs[conflationIndex]["word"]);
    stemmer->kstem_add_table_entry( variantText.c_str(), wordText.c_str() );
    conflationIndex++;
  }
}
Beispiel #2
0
// Rewrites firstManifest so that it describes exactly one index
// ("indexCount" = 1, "indexes.index" = 0) and writes it to the file
// named "manifest" under path.
void indri::collection::Repository::_writeMergedManifest(const std::string& path, indri::api::Parameters& firstManifest) {
  // after a merge only a single index, numbered 0, remains
  firstManifest.set("indexCount", 1);
  firstManifest["indexes"].set("index", 0);

  firstManifest.writeFile(indri::file::Path::combine(path, "manifest"));
}
Beispiel #3
0
// Validates the rmodel command line. Returns silently when "query",
// "documents" and at least one of "index"/"server" are present; otherwise
// prints the usage text to stderr and exits with status -1.
static void usage( indri::api::Parameters param ) {
  if( param.exists( "query" ) &&
      ( param.exists( "index" ) || param.exists( "server" ) ) &&
      param.exists( "documents" ) ) {
    return;
  }

  std::cerr << "rmodel usage: " << std::endl;
  std::cerr << "   rmodel -query=myquery -index=myindex -documents=10 -maxGrams=2" << std::endl;
  std::cerr << "     myquery: a valid Indri query (be sure to use quotes around it if there are spaces in it)" << std::endl;
  std::cerr << "     myindex: a valid Indri index" << std::endl;
  std::cerr << "     documents: the number of documents to use to build the relevance model" << std::endl;
  std::cerr << "     maxGrams (optional): maximum length (in words) of phrases to be added to the model, default is 1 (unigram)" << std::endl;
  exit(-1);
}
Beispiel #4
0
// Validates the clarity command line. Returns silently when "query" and at
// least one of "index"/"server" are present; otherwise prints the usage text
// to stderr and exits with status -1.
static void usage( indri::api::Parameters param ) {
  if( !param.exists( "query" ) || 
      !( param.exists( "index" ) || param.exists( "server" ) )) {
   std::cerr << "clarity usage: " << std::endl
             << "   clarity -query=myquery -index=myindex -documents=10 -terms=5 -smoothing=\"method:jm,lambda,0.5\"" << std::endl
             << "OR clarity -query=myquery -server=myserver -documents=10 -terms=5 -smoothing=\"method:jm,lambda,0.5\"" << std::endl
             << "     myquery: a valid Indri query (be sure to use quotes around it if there are spaces in it)" << std::endl
             << "     myindex: a valid Indri index" << std::endl
             << "     myserver: a valid IndriDaemon instance" << std::endl
             << "     documents: the number of documents to use to build the relevance model. Default is 5" << std::endl
             // each option description gets its own line; the line break after
             // "terms" was previously missing, gluing "smoothing" onto it
             << "     terms: the number of terms to use to build the relevance model. Default is 10" << std::endl
             << "     smoothing: the smoothing rule to apply. Default is linear smoothing with lambda=0.5" << std::endl;
   exit(-1);
  }
}
Beispiel #5
0
// Converts every entry of queries into a heap-allocated query_t and pushes
// it onto q.
//
// Per entry:
//   type   - "type" child if present, otherwise "indri"
//   text   - "text" child if present and non-empty, otherwise the entry's
//            own string value
//   number - "number" child if present, otherwise queryOffset + position
//   "workingSetDocno" / "feedbackDocno" children are collected into string
//   vectors and handed to the query_t constructor.
void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries,
                 int queryOffset ) {
  const size_t queryCount = queries.size();

  for( size_t position = 0; position < queryCount; position++ ) {
    indri::api::Parameters entry = queries[position];

    std::string type = "indri";
    if( entry.exists( "type" ) )
      type = (std::string) entry["type"];

    std::string text;
    if( entry.exists( "text" ) )
      text = (std::string) entry["text"];

    std::string number;
    if( entry.exists( "number" ) ) {
      number = (std::string) entry["number"];
    } else {
      // synthesize a number from the offset and the position in the list
      std::stringstream numberStream;
      numberStream << ( queryOffset + int(position) );
      number = numberStream.str();
    }

    // no usable "text" child: the entry itself is the query text
    if( text.size() == 0 )
      text = (std::string) entry;

    // document restrictions and relevance feedback documents
    std::vector<std::string> workingSetDocs;
    std::vector<std::string> feedbackDocs;
    copy_parameters_to_string_vector( workingSetDocs, entry, "workingSetDocno" );
    copy_parameters_to_string_vector( feedbackDocs, entry, "feedbackDocno" );

    q.push( new query_t( position, number, text, type, workingSetDocs, feedbackDocs ) );
  }
}
Beispiel #6
0
// Registers every "index" and "server" entry of param with environment, then
// installs the "rule" entries (if any) as the environment's scoring rules.
static void open_indexes( indri::api::QueryEnvironment& environment, 
                          indri::api::Parameters& param ) {
  // local repositories
  if( param.exists( "index" ) ) {
    indri::api::Parameters indexList = param["index"];
    size_t n = 0;
    while( n < indexList.size() ) {
      environment.addIndex( std::string(indexList[n]) );
      n++;
    }
  }

  // remote servers
  if( param.exists( "server" ) ) {
    indri::api::Parameters serverList = param["server"];
    size_t n = 0;
    while( n < serverList.size() ) {
      environment.addServer( std::string(serverList[n]) );
      n++;
    }
  }

  // scoring rules are only set when at least a "rule" key was given
  std::vector<std::string> rules;
  const bool haveRules = copy_parameters_to_string_vector( rules, param, "rule" );
  if( haveRules )
    environment.setScoringRules( rules );
}
Beispiel #7
0
// Appends the document transformations to _transformations, in this order:
//   1. URL annotation            (unless parameters "injectURL" is false)
//   2. normalization + UTF-8 case normalization
//                                (unless parameters "normalize" is false)
//   3. numeric / date annotators for the matching entries of _fields
//   4. stopper from _parameters "stopper.word"
//   5. stopper from options "stopper.word" (options may be null)
//   6. stemmer named by _parameters "stemmer.name"
void indri::collection::Repository::_buildChain(indri::api::Parameters& parameters, indri::api::Parameters* options) {
  // The URL has to be pulled from the metadata before case normalization.
  // (this could be parameterized.)
  if (parameters.get("injectURL", true)) {
    _transformations.push_back(new indri::parse::URLTextAnnotator());
  }

  // normalization is on unless explicitly switched off
  bool normalize = true;
  if (parameters.exists("normalize")) {
    normalize = (bool) parameters["normalize"];
  }

  if (normalize) {
    _transformations.push_back(new indri::parse::NormalizationTransformation());
    _transformations.push_back(new indri::parse::UTF8CaseNormalizationTransformation());
  }

  // one annotator per field that asks for a numeric or date parser
  for (size_t fieldIndex = 0; fieldIndex < _fields.size(); fieldIndex++) {
    const std::string& parser = _fields[fieldIndex].parserName;

    if (parser == "NumericFieldAnnotator") {
      _transformations.push_back(new indri::parse::NumericFieldAnnotator(_fields[fieldIndex].name));
    } else if (parser == "DateFieldAnnotator") {
      _transformations.push_back(new indri::parse::DateFieldAnnotator(_fields[fieldIndex].name));
    }
  }

  // stopwords stored with the repository
  if (_parameters.exists("stopper.word")) {
    indri::api::Parameters repositoryStopwords = _parameters["stopper.word"];
    _transformations.push_back(new indri::parse::StopperTransformation(repositoryStopwords));
  }

  // the transient chain stopwords need to precede the stemmer.
  if (options && options->exists("stopper.word")) {
    indri::api::Parameters transientStopwords = (*options)["stopper.word"];
    _transformations.push_back(new indri::parse::StopperTransformation(transientStopwords));
  }

  // the stemmer goes last
  if (_parameters.exists("stemmer.name")) {
    std::string name = std::string(_parameters["stemmer.name"]);
    indri::api::Parameters stemmerSettings = _parameters["stemmer"];
    _transformations.push_back(indri::parse::StemmerFactory::get(name, stemmerSettings));
  }
}
Beispiel #8
0
// Returns the string value of every entry under "field" in parameters, in
// order. The result is empty when there is no "field" key.
std::vector<std::string> indri::collection::Repository::_fieldNames(indri::api::Parameters& parameters) {
  std::vector<std::string> names;

  if (!parameters.exists("field"))
    return names;

  size_t position = 0;
  while (position < parameters["field"].size()) {
    std::string name = parameters["field"][position];
    names.push_back(name);
    position++;
  }

  return names;
}   
Beispiel #9
0
static bool copy_parameters_to_string_vector( std::vector<std::string>& vec, indri::api::Parameters p, const std::string& parameterName ) {
  if( !p.exists(parameterName) )
    return false;

  indri::api::Parameters slice = p[parameterName];
  
  for( size_t i=0; i<slice.size(); i++ ) {
    vec.push_back( slice[i] );
  }

  return true;
}
Beispiel #10
0
// Copies the recognized settings from options into _parameters.
// "normalize" and "injectURL" are copied as string values; "field",
// "stopper" and "stemmer" are copied as whole subtrees. Keys missing from
// options are skipped.
void indri::collection::Repository::_copyParameters(indri::api::Parameters& options) {
  // plain values, stored as strings
  static const char* const valueKeys[] = { "normalize", "injectURL" };
  // nested settings, copied wholesale
  static const char* const subtreeKeys[] = { "field", "stopper", "stemmer" };

  for (size_t k = 0; k < sizeof(valueKeys) / sizeof(valueKeys[0]); k++) {
    const char* key = valueKeys[k];
    if (options.exists(key)) {
      _parameters.set(key, (std::string) options[key]);
    }
  }

  for (size_t k = 0; k < sizeof(subtreeKeys) / sizeof(subtreeKeys[0]); k++) {
    const char* key = subtreeKeys[k];
    if (options.exists(key)) {
      // create the key first, then assign the subtree into it
      _parameters.set(key, "");
      _parameters[key] = options[key];
    }
  }
}
      // Parses every "rule" entry of parameters into a rule_type and appends
      // it to _rules.
      //
      // A rule is a comma-separated list of key:value segments. The keys
      // "node", "field" and "operator" set the corresponding rule members
      // (defaults: "RawScorerNode", "*", "*"); every other segment is
      // re-joined, comma-separated, into rule->smoothing as key:value.
      //
      // Each segment is split at its first ':'; the value may itself contain
      // further ':' characters. Empty segments (leading, trailing or doubled
      // commas) are skipped, and a segment without any ':' is treated as a
      // key with an empty value.
      void _loadSmoothingRules( indri::api::Parameters& parameters ) {
        if( !parameters.exists("rule") )
          return;

        indri::api::Parameters rules = parameters["rule"];

        for(size_t i=0; i<rules.size(); i++) {
          std::string ruleText = rules[i];

          rule_type* rule = new rule_type;
          rule->node = "RawScorerNode";
          rule->op = "*";
          rule->field = "*";

          std::string::size_type location = 0;

          while( location < ruleText.length() ) {
            // the current segment runs up to the next comma or, for the last
            // segment, to the end of the text
            std::string::size_type segmentEnd = ruleText.find( ',', location );
            if( segmentEnd == std::string::npos )
              segmentEnd = ruleText.length();

            if( segmentEnd == location ) {
              // empty segment, nothing to record
              location = segmentEnd + 1;
              continue;
            }

            std::string::size_type colon = ruleText.find( ':', location );

            std::string key;
            std::string value;

            if( colon == std::string::npos || colon >= segmentEnd ) {
              // the colon, if any, belongs to a later segment
              key = ruleText.substr( location, segmentEnd - location );
            } else {
              key = ruleText.substr( location, colon - location );
              value = ruleText.substr( colon + 1, segmentEnd - colon - 1 );
            }

            if( key == "node" ) {
              rule->node = value;
            } else if( key == "field" ) {
              rule->field = value;
            } else if( key == "operator" ) {
              rule->op = value;
            } else {
              if( rule->smoothing.size() ) rule->smoothing += ",";
              rule->smoothing += key + ":" + value;
            }

            // step past the comma; this overshoots the end after the last segment
            location = segmentEnd + 1;
          }

          _rules.push_back(rule);
        }
      }
// Splits spec, a comma-separated list of key:value segments, and stores each
// pair in converted via converted.set( key, value ).
//
// Each segment is split at its first ':'; the value may itself contain
// further ':' characters. Empty segments (leading, trailing or doubled
// commas) are skipped, and a segment without any ':' is stored as a key with
// an empty value.
static void termscorefunctionfactory_parse( indri::api::Parameters& converted, const std::string& spec ) {
    std::string::size_type location = 0;

    while( location < spec.length() ) {
        // the current segment runs up to the next comma or, for the last
        // segment, to the end of the text
        std::string::size_type segmentEnd = spec.find( ',', location );
        if( segmentEnd == std::string::npos )
            segmentEnd = spec.length();

        if( segmentEnd != location ) {
            std::string::size_type colon = spec.find( ':', location );

            std::string key;
            std::string value;

            if( colon == std::string::npos || colon >= segmentEnd ) {
                // the colon, if any, belongs to a later segment
                key = spec.substr( location, segmentEnd - location );
            } else {
                key = spec.substr( location, colon - location );
                value = spec.substr( colon + 1, segmentEnd - colon - 1 );
            }

            converted.set( key, value );
        }

        // step past the comma; this overshoots the end after the last segment
        location = segmentEnd + 1;
    }
}
// Fills res with the name -> number pairs found in the first "rule" entry of
// parameters. The rule text is split on ',' into pairs and each pair on ':'
// into name and value; the value is converted with atof.
// res is always cleared first and stays empty when there is no rule.
// Any exception raised while handling a pair (for example a pair with no
// value part) is reported through LEMUR_THROW with code EMPTY_QUERY.
void indri::query::SimpleQueryParser::loadModelParameters( indri::api::Parameters& parameters, std::map<std::string, double>& res ) {
  res.clear();

  if( !parameters.exists("rule") )
    return;

  indri::api::Parameters rules = parameters["rule"];
  if( rules.size() == 0 )
    return;

  // a size_t variable selects the index overload of operator[]
  size_t firstRule = 0;
  std::vector<std::string> pairs = split(rules[firstRule], ',');

  for( std::vector<std::string>::iterator it = pairs.begin(); it != pairs.end(); ++it ) {
    std::string pairText = *it;
    try {
      std::vector<std::string> pieces = split(pairText, ':');
      const double number = atof(pieces.at(1).c_str());
      res[pieces.at(0)] = number;
    }
    catch (...) {
      LEMUR_THROW( EMPTY_QUERY, "Parse Model Parameters Error!" );
    }
  }
}
Beispiel #14
0
// Converts every entry of queries into a heap-allocated query_t and pushes
// it onto q.
//
// Per entry:
//   type   - "type" child if present, otherwise "indri"
//   text   - "text" child if present and non-empty, otherwise the entry's
//            own string value
//   number - "number" child if present, otherwise queryOffset + position
//
// "text" and "number" are looked up independently: an entry may carry a
// "text" child without a "number", and a "number" without a "text" child no
// longer triggers a lookup of a missing key.
void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries,
                 int queryOffset ) {

  for( size_t i=0; i<queries.size(); i++ ) {
    std::string queryNumber;
    std::string queryText;
    std::string queryType = "indri";
    if( queries[i].exists( "type" ) )
      queryType = (std::string) queries[i]["type"];

    if( queries[i].exists( "text" ) )
      queryText = (std::string) queries[i]["text"];

    if( queries[i].exists( "number" ) ) {
      queryNumber = (std::string) queries[i]["number"];
    } else {
      // synthesize a number from the offset and the position in the list
      int thisQuery=queryOffset + int(i);
      std::stringstream s;
      s << thisQuery;
      queryNumber = s.str();
    }

    // no usable "text" child: the entry itself is the query text
    if( queryText.size() == 0 )
      queryText = (std::string) queries[i];

    q.push( new query_t( i, queryNumber, queryText, queryType ) );
  }
}
Beispiel #15
0
// Creates a new active index_vector, records it in _states, reads
// "indexCount" from params (default 0) and opens one DiskIndex under
// parentPath for every entry of params "indexes.index".
// A lemur::api::Exception raised along the way is rethrown with added
// context through LEMUR_RETHROW.
void indri::collection::Repository::_openIndexes(indri::api::Parameters& params, const std::string& parentPath) {
  try {
    indri::api::Parameters container = params["indexes"];

    _active = new index_vector;
    _states.push_back(_active);
    _indexCount = params.get("indexCount", 0);

    // nothing to open when the manifest lists no indexes
    if (!container.exists("index"))
      return;

    indri::api::Parameters indexList = container["index"];
    size_t position = 0;

    while (position < indexList.size()) {
      indri::api::Parameters entry = indexList[position];
      indri::index::DiskIndex* opened = new indri::index::DiskIndex();
      // the entry's string value is the index name
      std::string name = (std::string) entry;

      opened->open(parentPath, name);
      _active->push_back(opened);
      position++;
    }
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "_openIndexes: Couldn't open DiskIndexes because:");
  }
}
Beispiel #16
0
  void _mergeFields() {
    if( repo.exists("field") ) {
      indri::api::Parameters result;
      indri::api::Parameters fields = repo["field"];
      for (size_t i = 0; i < fields.size(); i++) {
        indri::api::Parameters oldField = fields[i];
        std::string fieldName = oldField["name"];
        bool keep = true;
        for (size_t j = 0; j < removeNames.size(); j++) {
          if (removeNames[j] == fieldName) {
            keep = false;
            break;
          }
        }
        if (keep) {
          bool isNumeric = oldField.get("numeric", false);
          bool isOrdinal = oldField.get("ordinal", false);
          bool isParental = oldField.get("parental", false);
          std::string parserName = oldField.get("parserName", 
                                                isNumeric ? "NumericFieldAnnotator" : "");
          indri::api::Parameters field = result.append("field");
          field.set( "name", fieldName );
          field.set( "numeric", isNumeric );
          field.set( "parserName", parserName );
          field.set( "ordinal", isOrdinal );
          field.set( "parental", isParental );
        }
      }
      if (adding) {
        indri::api::Parameters newFields = addFields["field"];
        for (size_t i = 0; i < newFields.size(); i++) {
          indri::api::Parameters newField = newFields[i];
          std::string fieldName = newField["name"];
          bool dupe = false;
          for( size_t j = 0; j<fields.size(); j++ ) {
            std::string parameterFieldName = fields[j]["name"];
            if( parameterFieldName == fieldName ) {
              // it's already in there, skip it...
              dupe = true; 
              break;
            }
          }
          if (dupe) continue;

          bool isNumeric = newField.get("numeric", false);
          bool isOrdinal = newField.get("ordinal", false);
          bool isParental = newField.get("parental", false);
          std::string parserName = newField.get("parserName", 
                                                isNumeric ? "OffsetAnnotationAnnotator" : "");
          indri::api::Parameters field = result.append("field");
          field.set( "name", fieldName );
          field.set( "numeric", isNumeric );
          field.set( "parserName", parserName );
          field.set( "ordinal", isOrdinal );
          field.set( "parental", isParental );
        }
      }
      if (result.exists("field")) {
        repo.set("field");
        repo["field"] = result["field"];
      } else {
        // no fields in new index...
        if (repo.exists("field")) {
          repo.remove("field");
        }
      }
    } else {
      // no fields in original, just insert
      // the new ones. (test they exist...)
      if (adding) {
        repo.set("field");
        repo["field"] = addFields["field"];
      }
    }
  }
Beispiel #17
0
// Returns the value of "stemmer.name" in parameters, or an empty string when
// it is not set.
std::string indri::collection::Repository::_stemmerName(indri::api::Parameters& parameters) {
  std::string name = parameters.get("stemmer.name", "");
  return name;
}
Beispiel #18
0
  // Rebuilds the repository named by param "index" with a modified field
  // list, then replaces the original with the rebuilt copy.
  //
  // Parameters read from param:
  //   index       - path of the repository to modify
  //   annotations - path handed to the offset annotator when fields are added
  //   addField    - optional; stored in addFields
  //   removeField - optional; each entry's "name" is appended to removeNames
  //   fileclass   - optional, default ""; passed to the file class factory
  //   memory      - optional, default 100*1024*1024; stored in repo "memory"
  //
  // The rebuilt repository is created at "<index>.new". If the source
  // repository reports deleted documents the function prints a message and
  // returns before any document is copied.
  //
  // NOTE(review): addFields, removeNames, repo, adding, fce, _index,
  // _docIter, parsed and g_timer are not declared in this function -
  // presumably members of the enclosing class or globals; confirm.
  void processFields( indri::api::Parameters &param ) {
    g_timer.start();
    std::string index = param.get("index");
    std::cout << "Opening: " << index << std::endl;
    // make sure this path doesn't exist.
    std::string idx2 = index + ".new"; // temp target index.

    // presumes a single input oa file for the entire collection.
    std::string offsetAnnotationsPath = param.get("annotations");
      
    /// these need to be combined with existing.
    // fields to add
    // these need to supply numeric/parental/ordinal/etc...
    if (param.exists("addField"))
      addFields = param["addField"];
      
    // fields to remove
    // these only need to be a list of names.
    if (param.exists("removeField")) {
      indri::api::Parameters slice = param["removeField"];
      for (size_t i = 0; i < slice.size(); i++) {
        // entries without a "name" are ignored
        if( slice[i].exists("name") ) {
          removeNames.push_back( slice[i]["name"] );
        }
      }
    }
      
    // need to know the file class environment to get the 
    // conflations right.
    std::string className = param.get("fileclass", "");

    indri::collection::Repository sourceRepo;
    indri::collection::Repository targetRepo;
    indri::parse::OffsetAnnotationAnnotator oa_annotator;
    indri::parse::FileClassEnvironmentFactory _fileClassFactory;
            
    // Open source repo
    sourceRepo.openRead(index);
    // Copy its parameters, create target repo, adding or removing
    // fields.
    repo.loadFile( indri::file::Path::combine( index, "manifest" ) );
    // NOTE(review): the default is built as INT64 but the result is stored
    // in an int - confirm large "memory" values survive the conversion.
    int mem = param.get("memory", INT64(100*1024*1024));
      
    repo.set("memory", mem);
    // fields are only added when addFields actually contains a "field" entry
    adding = addFields.exists("field");
    _mergeFields();
    // Create the offset annotator.
    fce = _fileClassFactory.get( className );
    indri::parse::Conflater* conflater = 0;
    if( fce ) {
      conflater = fce->conflater;
    }
    // the annotator is only opened when there are fields to add
    if (adding) 
      {
        oa_annotator.setConflater( conflater );
        oa_annotator.open( offsetAnnotationsPath );
      }

    targetRepo.create(idx2, &repo);
      
    // for each document in the source repo, fetch ParsedDocument 
    // construct full rep, apply annotator, insert into
    // target repo.

    _index = sourceRepo.indexes()->front(); // presume 1
    _docIter = _index->termListFileIterator();
    _docIter->startIteration();
    // ought to deal with deleted documents here...
    // if there are deleted documents, regular add to collection
    // if not, only rewrite the indexes, then rename the collection.
    indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList();
    UINT64 delCount = deleted.deletedCount();
    if (delCount > 0) 
      {
        // either warn, compact and then process, or 
        // do it the old way... FIXME!
        // NOTE(review): this returns after targetRepo.create(idx2, ...) has
        // already run and without the explicit close() calls used on the
        // normal path - confirm what is left behind at idx2.
        std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl;
        return;
      }
    
    // document ids are visited from 1 up to documentCount() inclusive
    for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) 
      {
        // progress report every 500 documents, rewritten in place via "\r"
        if ((docid % 500) == 0)  {
          g_timer.printElapsedSeconds(std::cout);
          std::cout << ": " << docid << "\r";
          std::cout.flush();
        }

        parsed = sourceRepo.collection()->retrieve(docid);
        // combine field and term data with parsed document
        _mergeData();
        // apply annotator
        if (adding)
          parsed = oa_annotator.transform(parsed);
        targetRepo.addDocument(parsed, false);
        // TagList allocs memory for the tags...
        for (size_t i = 0; i < parsed->tags.size(); i++)
          delete(parsed->tags[i]);
        delete(parsed);
        // advance the term list iterator in step with docid
        _docIter->nextEntry();
      }
    std::cout << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": " << _index->documentCount() << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": closing"  << std::endl;

    targetRepo.close();
    sourceRepo.close();
    std::string oldcollectionPath = indri::file::Path::combine( index, "collection" );
    std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" );
    // clone the collection
    // (the target's "collection" entry is removed and the source's is
    // renamed into its place)
    indri::file::Path::remove(newcollectionPath);
    indri::file::Path::rename(oldcollectionPath, newcollectionPath);
    // rename target repo to source repo.
    indri::file::Path::remove(index);
    indri::file::Path::rename(idx2, index);

    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": done"  << std::endl;
  }
// Inserts a strdup'ed copy of every entry of stopwords into _table.
void indri::parse::StopperTransformation::read( indri::api::Parameters& stopwords ) {
  size_t position = 0;

  while( position < stopwords.size() ) {
    std::string word = (std::string) stopwords[position];
    // the table receives its own C-string copy of the word
    _table.insert( strdup( word.c_str() ) );
    position++;
  }
}
// Raises LEMUR_MISSING_PARAMETER_ERROR through LEMUR_THROW when p has no
// parameter called name; does nothing otherwise.
void require_parameter( const char* name, indri::api::Parameters& p ) {
  if( p.exists( name ) )
    return;

  // the message expression is passed to the macro unchanged
  LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify a " + name + " parameter." );
}