void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) { // Create the directory for the output index _cleanAndCreateDirectory(path); std::string indexPath = indri::file::Path::combine(path, "index"); std::string collectionPath = indri::file::Path::combine(path, "collection"); // First, we're going to harvest information from the individual indexes. We want to // check a few things: // 1. do they all use the same stemmer? // 2. do they all have the same indexed fields? // 3. are they all merged (only have one disk index?) // 4. how many documents are in each one? // If no indexes are given, make an empty repository and return if (inputIndexes.size() == 0) { makeEmpty(path); return; } std::vector<lemur::api::DOCID_T> documentMaximums; // Open up the first repository and extract field information Repository firstRepository; try { firstRepository.openRead(inputIndexes[0]); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::vector<Field> indexFields = firstRepository.fields(); firstRepository.close(); // Open up the first manifest and check on stemming and fields indri::api::Parameters firstManifest; std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest"); try { firstManifest.loadFile(firstManifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::string stemmerName = _stemmerName(firstManifest); std::vector<std::string> fieldNames = _fieldNames(firstManifest); // Now, gather information about the indexes for(size_t i=0; i<inputIndexes.size(); i++) { indri::api::Parameters repositoryManifest; std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest"); try { repositoryManifest.loadFile(manifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]); } if (!repositoryManifest.exists("indexes.index")) { documentMaximums.push_back(0); continue; } // Check to make sure there's only one index in there size_t indexCount = repositoryManifest["indexes.index"].size(); if (indexCount > 1) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]); } // How many documents are in this one? indri::index::DiskIndex diskIndex; std::string basePath = indri::file::Path::combine(inputIndexes[i], "index"); std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]); diskIndex.open(basePath, relativePath); documentMaximums.push_back(diskIndex.documentMaximum()); diskIndex.close(); // Only check successive indexes against the first one if (i == 0) continue; // Verify that the same fields and stemmers are used if (stemmerName != _stemmerName(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]); } if (fieldNames != _fieldNames(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]); } } std::vector<std::string> usableIndexes = inputIndexes; // remove any repositories that have no documents for(size_t i=0; i<usableIndexes.size(); i++) { if (documentMaximums[i] == 0) { documentMaximums.erase(documentMaximums.begin() + i); usableIndexes.erase(usableIndexes.begin() + i); i--; } } // now that we've removed empty indexes, are there any left? if (usableIndexes.size() == 0) { makeEmpty(path); return; } // 2. merge the deleted bitmaps _mergeBitmaps(path, usableIndexes, documentMaximums); // 3. merge compressed collections _mergeCompressedCollections(path, usableIndexes, documentMaximums); // 4. merge the indexes _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums); // 5. write the manifest file _writeMergedManifest(path, firstManifest); }
void indri::collection::Repository::makeEmpty(const std::string& path) { Repository empty; empty.create(path); empty.close(); }