//
// _writeMergedManifest
//
// Updates firstManifest in place so that "indexCount" is 1 and the
// "index" entry under "indexes" is 0, then writes it to the file
// named "manifest" inside the directory given by path.
//
void indri::collection::Repository::_writeMergedManifest(const std::string& path, indri::api::Parameters& firstManifest) {
  // destination file for the rewritten manifest
  const std::string manifestFile = indri::file::Path::combine(path, "manifest");

  // the manifest now refers to one index, numbered 0
  firstManifest.set("indexCount", 1);
  firstManifest["indexes"].set("index", 0);

  firstManifest.writeFile(manifestFile);
}
//
// termscorefunctionfactory_parse
//
// Splits a specification string of the form "key:value,key:value,..."
// and stores each pair in converted via converted.set( key, value ).
//
//   converted -- receives one entry per key:value segment
//   spec      -- the comma separated specification text
//
// Within a segment, the key is the text before the first ':' and the
// value is everything after it (so a value may itself contain ':').
// A segment that contains no ':' has no value to store and is skipped.
//
static void termscorefunctionfactory_parse( indri::api::Parameters& converted, const std::string& spec ) {
  // positions are kept as size_type so that std::string::npos can be
  // tested directly instead of being narrowed to a negative int
  std::string::size_type location = 0;

  while( location < spec.size() ) {
    // the current segment ends at the next comma, or at the end of the spec
    const std::string::size_type nextComma = spec.find( ',', location );
    const std::string::size_type segmentEnd =
      ( nextComma == std::string::npos ) ? spec.size() : nextComma;

    // only a colon inside the current segment separates key from value;
    // a colon belonging to a later segment must not be used here
    const std::string::size_type nextColon = spec.find( ':', location );

    if( nextColon != std::string::npos && nextColon < segmentEnd ) {
      std::string key = spec.substr( location, nextColon - location );
      std::string value = spec.substr( nextColon + 1, segmentEnd - nextColon - 1 );
      converted.set( key, value );
    }

    // step past the comma (wherever it is), or finish if there was none
    location = ( nextComma == std::string::npos ) ? spec.size() : nextComma + 1;
  }
}
void _mergeFields() { if( repo.exists("field") ) { indri::api::Parameters result; indri::api::Parameters fields = repo["field"]; for (size_t i = 0; i < fields.size(); i++) { indri::api::Parameters oldField = fields[i]; std::string fieldName = oldField["name"]; bool keep = true; for (size_t j = 0; j < removeNames.size(); j++) { if (removeNames[j] == fieldName) { keep = false; break; } } if (keep) { bool isNumeric = oldField.get("numeric", false); bool isOrdinal = oldField.get("ordinal", false); bool isParental = oldField.get("parental", false); std::string parserName = oldField.get("parserName", isNumeric ? "NumericFieldAnnotator" : ""); indri::api::Parameters field = result.append("field"); field.set( "name", fieldName ); field.set( "numeric", isNumeric ); field.set( "parserName", parserName ); field.set( "ordinal", isOrdinal ); field.set( "parental", isParental ); } } if (adding) { indri::api::Parameters newFields = addFields["field"]; for (size_t i = 0; i < newFields.size(); i++) { indri::api::Parameters newField = newFields[i]; std::string fieldName = newField["name"]; bool dupe = false; for( size_t j = 0; j<fields.size(); j++ ) { std::string parameterFieldName = fields[j]["name"]; if( parameterFieldName == fieldName ) { // it's already in there, skip it... dupe = true; break; } } if (dupe) continue; bool isNumeric = newField.get("numeric", false); bool isOrdinal = newField.get("ordinal", false); bool isParental = newField.get("parental", false); std::string parserName = newField.get("parserName", isNumeric ? "OffsetAnnotationAnnotator" : ""); indri::api::Parameters field = result.append("field"); field.set( "name", fieldName ); field.set( "numeric", isNumeric ); field.set( "parserName", parserName ); field.set( "ordinal", isOrdinal ); field.set( "parental", isParental ); } } if (result.exists("field")) { repo.set("field"); repo["field"] = result["field"]; } else { // no fields in new index... 
if (repo.exists("field")) { repo.remove("field"); } } } else { // no fields in original, just insert // the new ones. (test they exist...) if (adding) { repo.set("field"); repo["field"] = addFields["field"]; } } }
void processFields( indri::api::Parameters ¶m ) { g_timer.start(); std::string index = param.get("index"); std::cout << "Opening: " << index << std::endl; // make sure this path doesn't exist. std::string idx2 = index + ".new"; // temp target index. // presumes a single input oa file for the entire collection. std::string offsetAnnotationsPath = param.get("annotations"); /// these need to be combined with existing. // fields to add // these need to supply numeric/parental/ordinal/etc... if (param.exists("addField")) addFields = param["addField"]; // fields to remove // these only need to be a list of names. if (param.exists("removeField")) { indri::api::Parameters slice = param["removeField"]; for (size_t i = 0; i < slice.size(); i++) { if( slice[i].exists("name") ) { removeNames.push_back( slice[i]["name"] ); } } } // need to know the file class environment to get the // conflations right. std::string className = param.get("fileclass", ""); indri::collection::Repository sourceRepo; indri::collection::Repository targetRepo; indri::parse::OffsetAnnotationAnnotator oa_annotator; indri::parse::FileClassEnvironmentFactory _fileClassFactory; // Open source repo sourceRepo.openRead(index); // Copy its parameters, create target repo, adding or removing // fields. repo.loadFile( indri::file::Path::combine( index, "manifest" ) ); int mem = param.get("memory", INT64(100*1024*1024)); repo.set("memory", mem); adding = addFields.exists("field"); _mergeFields(); // Create the offset annotator. fce = _fileClassFactory.get( className ); indri::parse::Conflater* conflater = 0; if( fce ) { conflater = fce->conflater; } if (adding) { oa_annotator.setConflater( conflater ); oa_annotator.open( offsetAnnotationsPath ); } targetRepo.create(idx2, &repo); // for each document in the source repo, fetch ParsedDocument // construct full rep, apply annotator, insert into // target repo. 
_index = sourceRepo.indexes()->front(); // presume 1 _docIter = _index->termListFileIterator(); _docIter->startIteration(); // ought to deal with deleted documents here... // if there are deleted documents, regular add to collection // if not, only rewrite the indexes, then rename the collection. indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList(); UINT64 delCount = deleted.deletedCount(); if (delCount > 0) { // either warn, compact and then process, or // do it the old way... FIXME! std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl; return; } for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) { if ((docid % 500) == 0) { g_timer.printElapsedSeconds(std::cout); std::cout << ": " << docid << "\r"; std::cout.flush(); } parsed = sourceRepo.collection()->retrieve(docid); // combine field and term data with parsed document _mergeData(); // apply annotator if (adding) parsed = oa_annotator.transform(parsed); targetRepo.addDocument(parsed, false); // TagList allocs memory for the tags... for (size_t i = 0; i < parsed->tags.size(); i++) delete(parsed->tags[i]); delete(parsed); _docIter->nextEntry(); } std::cout << std::endl; g_timer.printElapsedSeconds(std::cout); std::cout << ": " << _index->documentCount() << std::endl; g_timer.printElapsedSeconds(std::cout); std::cout << ": closing" << std::endl; targetRepo.close(); sourceRepo.close(); std::string oldcollectionPath = indri::file::Path::combine( index, "collection" ); std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" ); // clone the collection indri::file::Path::remove(newcollectionPath); indri::file::Path::rename(oldcollectionPath, newcollectionPath); // rename target repo to source repo. indri::file::Path::remove(index); indri::file::Path::rename(idx2, index); g_timer.printElapsedSeconds(std::cout); std::cout << ": done" << std::endl; }