indri::parse::KrovetzStemmerTransformation::KrovetzStemmerTransformation( indri::api::Parameters& parameters ) {
  stemmer = new KrovetzStemmer();
  _stemBuffer = 0;
  _stemBufferSize = 0;

  indri::api::Parameters pheadwords;
  indri::api::Parameters pconflations;

  // figure out how many words we're dealing with here
  if( parameters.exists( "h" ) ) {
    pheadwords = parameters["h"];
  }
  if( parameters.exists( "conflation" ) ) {
    pconflations = parameters["conflation"];
  }

  unsigned int i;
  for( i=0; i<pheadwords.size(); i++ ) {
    std::string variant = std::string(pheadwords[i]);
    stemmer->kstem_add_table_entry( variant.c_str(), "" );
  }
  for( i=0; i<pconflations.size(); i++ ) {
    std::string variant = std::string(pconflations[i]["variant"]);
    std::string word = std::string(pconflations[i]["word"]);
    stemmer->kstem_add_table_entry( variant.c_str(), word.c_str() );
  }
}
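// Illustrative sketch (not part of the original source): one way the "conflation"
// parameter consumed above could be populated programmatically. The Parameters
// calls mirror those used elsewhere in this file; the word/variant values are
// made up.
//
//   indri::api::Parameters p;
//   indri::api::Parameters entry = p.append( "conflation" );
//   entry.set( "variant", "mice" );
//   entry.set( "word", "mouse" );   // force "mice" to stem to "mouse"
//   indri::parse::KrovetzStemmerTransformation stemmer( p );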
// Rewrite the manifest so it describes a single merged index (index 0).
void indri::collection::Repository::_writeMergedManifest(const std::string& path, indri::api::Parameters& firstManifest) {
  firstManifest.set("indexCount", 1);
  firstManifest["indexes"].set("index", 0);

  std::string manifestPath = indri::file::Path::combine(path, "manifest");
  firstManifest.writeFile(manifestPath);
}
static void usage( indri::api::Parameters param ) {
  if( !param.exists( "query" ) || !( param.exists( "index" ) || param.exists( "server" ) ) || !param.exists( "documents" ) ) {
    std::cerr << "rmodel usage: " << std::endl
              << " rmodel -query=myquery -index=myindex -documents=10 -maxGrams=2" << std::endl
              << " myquery: a valid Indri query (be sure to use quotes around it if there are spaces in it)" << std::endl
              << " myindex: a valid Indri index" << std::endl
              << " documents: the number of documents to use to build the relevance model" << std::endl
              << " maxGrams (optional): maximum length (in words) of phrases to be added to the model, default is 1 (unigram)" << std::endl;
    exit(-1);
  }
}
static void usage( indri::api::Parameters param ) {
  if( !param.exists( "query" ) || !( param.exists( "index" ) || param.exists( "server" ) ) ) {
    std::cerr << "clarity usage: " << std::endl
              << " clarity -query=myquery -index=myindex -documents=10 -terms=5 -smoothing=\"method:jm,lambda:0.5\"" << std::endl
              << "OR clarity -query=myquery -server=myserver -documents=10 -terms=5 -smoothing=\"method:jm,lambda:0.5\"" << std::endl
              << " myquery: a valid Indri query (be sure to use quotes around it if there are spaces in it)" << std::endl
              << " myindex: a valid Indri index" << std::endl
              << " myserver: a valid IndriDaemon instance" << std::endl
              << " documents: the number of documents to use to build the relevance model. Default is 5" << std::endl
              << " terms: the number of terms to use to build the relevance model. Default is 10" << std::endl
              << " smoothing: the smoothing rule to apply. Default is linear smoothing with lambda=0.5" << std::endl;
    exit(-1);
  }
}
void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries, int queryOffset ) {
  for( size_t i=0; i<queries.size(); i++ ) {
    std::string queryNumber;
    std::string queryText;
    std::string queryType = "indri";

    if( queries[i].exists( "type" ) )
      queryType = (std::string) queries[i]["type"];
    if( queries[i].exists( "text" ) )
      queryText = (std::string) queries[i]["text"];

    if( queries[i].exists( "number" ) ) {
      queryNumber = (std::string) queries[i]["number"];
    } else {
      int thisQuery = queryOffset + int(i);
      std::stringstream s;
      s << thisQuery;
      queryNumber = s.str();
    }
    if( queryText.size() == 0 )
      queryText = (std::string) queries[i];

    // working set and RELFB docs go here.
    // working set to restrict retrieval
    std::vector<std::string> workingSet;
    // Rel fb docs
    std::vector<std::string> relFBDocs;

    copy_parameters_to_string_vector( workingSet, queries[i], "workingSetDocno" );
    copy_parameters_to_string_vector( relFBDocs, queries[i], "feedbackDocno" );

    q.push( new query_t( i, queryNumber, queryText, queryType, workingSet, relFBDocs ) );
  }
}
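// Illustrative sketch (assumed layout, not from the source): the "queries" slice
// above is typically the set of <query> blocks in an IndriRunQuery-style
// parameter file. Each entry may be a bare query string, or a structured block
// matching the names read above; the values here are made up.
//
//   <parameters>
//     <query>
//       <number>101</number>
//       <type>indri</type>
//       <text>#combine( information retrieval )</text>
//       <workingSetDocno>GX001-23-1002312</workingSetDocno>
//       <feedbackDocno>GX001-23-1002312</feedbackDocno>
//     </query>
//   </parameters>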
static void open_indexes( indri::api::QueryEnvironment& environment, indri::api::Parameters& param ) {
  if( param.exists( "index" ) ) {
    indri::api::Parameters indexes = param["index"];
    for( unsigned int i=0; i < indexes.size(); i++ ) {
      environment.addIndex( std::string(indexes[i]) );
    }
  }
  if( param.exists( "server" ) ) {
    indri::api::Parameters servers = param["server"];
    for( unsigned int i=0; i < servers.size(); i++ ) {
      environment.addServer( std::string(servers[i]) );
    }
  }

  std::vector<std::string> smoothingRules;
  if( copy_parameters_to_string_vector( smoothingRules, param, "rule" ) )
    environment.setScoringRules( smoothingRules );
}
void indri::collection::Repository::_buildChain(indri::api::Parameters& parameters, indri::api::Parameters* options) {
  // Extract url from metadata before case normalizing.
  // this could be parameterized.
  if (parameters.get("injectURL", true))
    _transformations.push_back(new indri::parse::URLTextAnnotator());

  bool dontNormalize = parameters.exists("normalize") && (false == (bool) parameters["normalize"]);
  if (dontNormalize == false) {
    _transformations.push_back(new indri::parse::NormalizationTransformation());
    _transformations.push_back(new indri::parse::UTF8CaseNormalizationTransformation());
  }

  for(size_t i=0; i<_fields.size(); i++) {
    if (_fields[i].parserName == "NumericFieldAnnotator") {
      _transformations.push_back(new indri::parse::NumericFieldAnnotator(_fields[i].name));
    } else if (_fields[i].parserName == "DateFieldAnnotator") {
      _transformations.push_back(new indri::parse::DateFieldAnnotator(_fields[i].name));
    }
  }

  if (_parameters.exists("stopper.word")) {
    indri::api::Parameters stop = _parameters["stopper.word"];
    _transformations.push_back(new indri::parse::StopperTransformation(stop));
  }

  // the transient chain stopwords need to precede the stemmer.
  if (options) {
    if (options->exists("stopper.word")) {
      indri::api::Parameters stop = (*options)["stopper.word"];
      _transformations.push_back(new indri::parse::StopperTransformation(stop));
    }
  }

  if (_parameters.exists("stemmer.name")) {
    std::string stemmerName = std::string(_parameters["stemmer.name"]);
    indri::api::Parameters stemmerParams = _parameters["stemmer"];
    _transformations.push_back(indri::parse::StemmerFactory::get(stemmerName, stemmerParams));
  }
}
std::vector<std::string> indri::collection::Repository::_fieldNames(indri::api::Parameters& parameters) {
  std::vector<std::string> fields;

  if (parameters.exists("field")) {
    for(size_t i=0; i<parameters["field"].size(); i++) {
      std::string fieldName = parameters["field"][i];
      fields.push_back(fieldName);
    }
  }

  return fields;
}
// Copy every value of the named parameter into vec; returns false if the parameter is missing.
static bool copy_parameters_to_string_vector( std::vector<std::string>& vec, indri::api::Parameters p, const std::string& parameterName ) {
  if( !p.exists(parameterName) )
    return false;

  indri::api::Parameters slice = p[parameterName];
  for( size_t i=0; i<slice.size(); i++ ) {
    vec.push_back( slice[i] );
  }

  return true;
}
void indri::collection::Repository::_copyParameters(indri::api::Parameters& options) {
  if (options.exists("normalize")) {
    _parameters.set("normalize", (std::string) options["normalize"]);
  }
  if (options.exists("injectURL")) {
    _parameters.set("injectURL", (std::string) options["injectURL"]);
  }
  if (options.exists("field")) {
    _parameters.set("field", "");
    _parameters["field"] = options["field"];
  }
  if (options.exists("stopper")) {
    _parameters.set("stopper", "");
    _parameters["stopper"] = options["stopper"];
  }
  if (options.exists("stemmer")) {
    _parameters.set("stemmer", "");
    _parameters["stemmer"] = options["stemmer"];
  }
}
void _loadSmoothingRules( indri::api::Parameters& parameters ) {
  if( !parameters.exists("rule") )
    return;

  indri::api::Parameters rules = parameters["rule"];

  for(size_t i=0; i<rules.size(); i++) {
    std::string ruleText = rules[i];

    int nextComma = 0;
    int nextColon = 0;
    int location = 0;

    rule_type* rule = new rule_type;
    rule->node = "RawScorerNode";
    rule->op = "*";
    rule->field = "*";

    for( location = 0; location < ruleText.length(); ) {
      nextComma = ruleText.find( ',', location );
      nextColon = ruleText.find( ':', location );

      std::string key = ruleText.substr( location, nextColon-location );
      std::string value = ruleText.substr( nextColon+1, nextComma-nextColon-1 );

      if( key == "node" ) {
        rule->node = value;
      } else if( key == "field" ) {
        rule->field = value;
      } else if( key == "operator" ) {
        rule->op = value;
      } else {
        if( rule->smoothing.size() ) rule->smoothing += ",";
        rule->smoothing += key + ":" + value;
      }

      if( nextComma > 0 )
        location = nextComma+1;
      else
        location = ruleText.size();
    }

    _rules.push_back(rule);
  }
}
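// Illustrative example (the values are made up): a rule string such as
//   "method:dirichlet,mu:1000,field:title"
// is split by the loop above into field="title" and
// smoothing="method:dirichlet,mu:1000", while node and operator keep their
// defaults ("RawScorerNode" and "*").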
// Parse a comma-delimited list of key:value pairs (e.g. a smoothing spec) into a Parameters object.
static void termscorefunctionfactory_parse( indri::api::Parameters& converted, const std::string& spec ) {
  int nextComma = 0;
  int nextColon = 0;
  int location = 0;

  for( location = 0; location < spec.length(); ) {
    nextComma = spec.find( ',', location );
    nextColon = spec.find( ':', location );

    std::string key = spec.substr( location, nextColon-location );
    std::string value = spec.substr( nextColon+1, nextComma-nextColon-1 );
    converted.set( key, value );

    if( nextComma > 0 )
      location = nextComma+1;
    else
      location = spec.size();
  }
}
void indri::query::SimpleQueryParser::loadModelParameters( indri::api::Parameters& parameters, std::map<std::string, double>& res ) {
  res.clear();
  if( !parameters.exists("rule") ) {
    return;
  }
  indri::api::Parameters rules = parameters["rule"];
  if (rules.size() == 0) {
    return;
  }

  size_t x = 0;
  std::vector<std::string> para_vectors = split(rules[x], ',');
  for (size_t i = 0; i < para_vectors.size(); i++) {
    std::string cur = para_vectors[i];
    try {
      std::vector<std::string> this_para = split(cur, ':');
      res[this_para.at(0)] = atof(this_para.at(1).c_str());
    } catch (...) {
      LEMUR_THROW( EMPTY_QUERY, "Parse Model Parameters Error!" );
    }
  }
}
void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries, int queryOffset ) {
  for( size_t i=0; i<queries.size(); i++ ) {
    std::string queryNumber;
    std::string queryText;
    std::string queryType = "indri";

    if( queries[i].exists( "type" ) )
      queryType = (std::string) queries[i]["type"];

    if( queries[i].exists( "number" ) ) {
      queryText = (std::string) queries[i]["text"];
      queryNumber = (std::string) queries[i]["number"];
    } else {
      queryText = (std::string) queries[i];
      int thisQuery = queryOffset + int(i);
      std::stringstream s;
      s << thisQuery;
      queryNumber = s.str();
    }

    q.push( new query_t( i, queryNumber, queryText, queryType ) );
  }
}
void indri::collection::Repository::_openIndexes(indri::api::Parameters& params, const std::string& parentPath) {
  try {
    indri::api::Parameters container = params["indexes"];

    _active = new index_vector;
    _states.push_back(_active);
    _indexCount = params.get("indexCount", 0);

    if (container.exists("index")) {
      indri::api::Parameters indexes = container["index"];

      for(size_t i=0; i<indexes.size(); i++) {
        indri::api::Parameters indexSpec = indexes[i];
        indri::index::DiskIndex* diskIndex = new indri::index::DiskIndex();
        std::string indexName = (std::string) indexSpec;

        diskIndex->open(parentPath, indexName);
        _active->push_back(diskIndex);
      }
    }
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "_openIndexes: Couldn't open DiskIndexes because:");
  }
}
void _mergeFields() {
  if( repo.exists("field") ) {
    indri::api::Parameters result;
    indri::api::Parameters fields = repo["field"];

    for (size_t i = 0; i < fields.size(); i++) {
      indri::api::Parameters oldField = fields[i];
      std::string fieldName = oldField["name"];

      bool keep = true;
      for (size_t j = 0; j < removeNames.size(); j++) {
        if (removeNames[j] == fieldName) {
          keep = false;
          break;
        }
      }

      if (keep) {
        bool isNumeric = oldField.get("numeric", false);
        bool isOrdinal = oldField.get("ordinal", false);
        bool isParental = oldField.get("parental", false);
        std::string parserName = oldField.get("parserName", isNumeric ? "NumericFieldAnnotator" : "");

        indri::api::Parameters field = result.append("field");
        field.set( "name", fieldName );
        field.set( "numeric", isNumeric );
        field.set( "parserName", parserName );
        field.set( "ordinal", isOrdinal );
        field.set( "parental", isParental );
      }
    }

    if (adding) {
      indri::api::Parameters newFields = addFields["field"];

      for (size_t i = 0; i < newFields.size(); i++) {
        indri::api::Parameters newField = newFields[i];
        std::string fieldName = newField["name"];

        bool dupe = false;
        for( size_t j = 0; j<fields.size(); j++ ) {
          std::string parameterFieldName = fields[j]["name"];
          if( parameterFieldName == fieldName ) {
            // it's already in there, skip it...
            dupe = true;
            break;
          }
        }
        if (dupe) continue;

        bool isNumeric = newField.get("numeric", false);
        bool isOrdinal = newField.get("ordinal", false);
        bool isParental = newField.get("parental", false);
        std::string parserName = newField.get("parserName", isNumeric ? "OffsetAnnotationAnnotator" : "");

        indri::api::Parameters field = result.append("field");
        field.set( "name", fieldName );
        field.set( "numeric", isNumeric );
        field.set( "parserName", parserName );
        field.set( "ordinal", isOrdinal );
        field.set( "parental", isParental );
      }
    }

    if (result.exists("field")) {
      repo.set("field");
      repo["field"] = result["field"];
    } else {
      // no fields in new index...
      if (repo.exists("field")) {
        repo.remove("field");
      }
    }
  } else {
    // no fields in original, just insert
    // the new ones. (test they exist...)
    if (adding) {
      repo.set("field");
      repo["field"] = addFields["field"];
    }
  }
}
std::string indri::collection::Repository::_stemmerName(indri::api::Parameters& parameters) {
  return parameters.get("stemmer.name", "");
}
void processFields( indri::api::Parameters& param ) {
  g_timer.start();
  std::string index = param.get("index");
  std::cout << "Opening: " << index << std::endl;
  // make sure this path doesn't exist.
  std::string idx2 = index + ".new"; // temp target index.

  // presumes a single input oa file for the entire collection.
  std::string offsetAnnotationsPath = param.get("annotations");

  /// these need to be combined with existing.
  // fields to add
  // these need to supply numeric/parental/ordinal/etc...
  if (param.exists("addField"))
    addFields = param["addField"];

  // fields to remove
  // these only need to be a list of names.
  if (param.exists("removeField")) {
    indri::api::Parameters slice = param["removeField"];
    for (size_t i = 0; i < slice.size(); i++) {
      if( slice[i].exists("name") ) {
        removeNames.push_back( slice[i]["name"] );
      }
    }
  }

  // need to know the file class environment to get the
  // conflations right.
  std::string className = param.get("fileclass", "");

  indri::collection::Repository sourceRepo;
  indri::collection::Repository targetRepo;
  indri::parse::OffsetAnnotationAnnotator oa_annotator;
  indri::parse::FileClassEnvironmentFactory _fileClassFactory;

  // Open source repo
  sourceRepo.openRead(index);
  // Copy its parameters, create target repo, adding or removing
  // fields.
  repo.loadFile( indri::file::Path::combine( index, "manifest" ) );
  int mem = param.get("memory", INT64(100*1024*1024));
  repo.set("memory", mem);

  adding = addFields.exists("field");
  _mergeFields();

  // Create the offset annotator.
  fce = _fileClassFactory.get( className );
  indri::parse::Conflater* conflater = 0;
  if( fce ) {
    conflater = fce->conflater;
  }
  if (adding) {
    oa_annotator.setConflater( conflater );
    oa_annotator.open( offsetAnnotationsPath );
  }

  targetRepo.create(idx2, &repo);

  // for each document in the source repo, fetch ParsedDocument
  // construct full rep, apply annotator, insert into
  // target repo.
  _index = sourceRepo.indexes()->front(); // presume 1
  _docIter = _index->termListFileIterator();
  _docIter->startIteration();

  // ought to deal with deleted documents here...
  // if there are deleted documents, regular add to collection
  // if not, only rewrite the indexes, then rename the collection.
  indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList();
  UINT64 delCount = deleted.deletedCount();
  if (delCount > 0) {
    // either warn, compact and then process, or
    // do it the old way... FIXME!
    std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl;
    return;
  }

  for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) {
    if ((docid % 500) == 0) {
      g_timer.printElapsedSeconds(std::cout);
      std::cout << ": " << docid << "\r";
      std::cout.flush();
    }
    parsed = sourceRepo.collection()->retrieve(docid);
    // combine field and term data with parsed document
    _mergeData();
    // apply annotator
    if (adding)
      parsed = oa_annotator.transform(parsed);
    targetRepo.addDocument(parsed, false);
    // TagList allocs memory for the tags...
    for (size_t i = 0; i < parsed->tags.size(); i++)
      delete(parsed->tags[i]);
    delete(parsed);
    _docIter->nextEntry();
  }
  std::cout << std::endl;
  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": " << _index->documentCount() << std::endl;

  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": closing" << std::endl;
  targetRepo.close();
  sourceRepo.close();

  std::string oldcollectionPath = indri::file::Path::combine( index, "collection" );
  std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" );

  // clone the collection
  indri::file::Path::remove(newcollectionPath);
  indri::file::Path::rename(oldcollectionPath, newcollectionPath);

  // rename target repo to source repo.
  indri::file::Path::remove(index);
  indri::file::Path::rename(idx2, index);

  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": done" << std::endl;
}
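// Illustrative sketch (assumed layout, not from the source): a parameter file
// driving processFields() could look roughly like this, matching the names read
// above ("index", "annotations", "fileclass", "addField", "removeField"); all
// paths and field names are made up.
//
//   <parameters>
//     <index>/path/to/myindex</index>
//     <annotations>/path/to/offset.annotations</annotations>
//     <fileclass>trecweb</fileclass>
//     <addField>
//       <field>
//         <name>person</name>
//         <numeric>false</numeric>
//       </field>
//     </addField>
//     <removeField>
//       <name>animal</name>
//     </removeField>
//   </parameters>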
// Insert each stopword into the stopword table; strdup'd copies are owned by the table.
void indri::parse::StopperTransformation::read( indri::api::Parameters& stopwords ) {
  for( unsigned int i=0; i < stopwords.size(); i++ ) {
    _table.insert( strdup( ((std::string) stopwords[i]).c_str() ) );
  }
}
void require_parameter( const char* name, indri::api::Parameters& p ) {
  if( !p.exists( name ) ) {
    LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify a " + std::string(name) + " parameter." );
  }
}
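// Illustrative usage sketch (assumes the usual Indri command-line startup
// pattern; not part of the original source):
//
//   indri::api::Parameters& param = indri::api::Parameters::instance();
//   param.loadCommandLine( argc, argv );
//   require_parameter( "index", param );
//   require_parameter( "query", param );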