Exemplo n.º 1
0
void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) {
  // Create the directory for the output index
  _cleanAndCreateDirectory(path);

  std::string indexPath = indri::file::Path::combine(path, "index");
  std::string collectionPath = indri::file::Path::combine(path, "collection");

  // First, we're going to harvest information from the individual indexes.  We want to 
  // check a few things:
  //    1. do they all use the same stemmer?
  //    2. do they all have the same indexed fields?
  //    3. are they all merged (only have one disk index?)
  //    4. how many documents are in each one?

  // If no indexes are given, make an empty repository and return
  if (inputIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  std::vector<lemur::api::DOCID_T> documentMaximums;

  // Open up the first repository and extract field information
  Repository firstRepository;
  try {
    firstRepository.openRead(inputIndexes[0]);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }
  std::vector<Field> indexFields = firstRepository.fields();
  firstRepository.close();

  // Open up the first manifest and check on stemming and fields
  indri::api::Parameters firstManifest;
  std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest");
  try {
    firstManifest.loadFile(firstManifestPath);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }

  std::string stemmerName = _stemmerName(firstManifest);
  std::vector<std::string> fieldNames = _fieldNames(firstManifest);

  // Now, gather information about the indexes
  for(size_t i=0; i<inputIndexes.size(); i++) {
    indri::api::Parameters repositoryManifest;
    std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest");

    try {
      repositoryManifest.loadFile(manifestPath);
    } catch(lemur::api::Exception& e) {
      LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]);
    }

    if (!repositoryManifest.exists("indexes.index")) {
      documentMaximums.push_back(0);
      continue;
    }

    // Check to make sure there's only one index in there
    size_t indexCount = repositoryManifest["indexes.index"].size();

    if (indexCount > 1) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]);
    }

    // How many documents are in this one?
    indri::index::DiskIndex diskIndex;
    std::string basePath = indri::file::Path::combine(inputIndexes[i], "index");
    std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]);
    diskIndex.open(basePath, relativePath);

    documentMaximums.push_back(diskIndex.documentMaximum());
    diskIndex.close();

    // Only check successive indexes against the first one
    if (i == 0)
      continue;

    // Verify that the same fields and stemmers are used
    if (stemmerName != _stemmerName(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]);
    }

    if (fieldNames != _fieldNames(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]);
    }
  } 
  
  std::vector<std::string> usableIndexes = inputIndexes;
  
  // remove any repositories that have no documents
  for(size_t i=0; i<usableIndexes.size(); i++) {
    if (documentMaximums[i] == 0) {
        documentMaximums.erase(documentMaximums.begin() + i);
        usableIndexes.erase(usableIndexes.begin() + i);
        i--;
    }
  }      
  
  // now that we've removed empty indexes, are there any left?
  if (usableIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  // 2. merge the deleted bitmaps
  _mergeBitmaps(path, usableIndexes, documentMaximums);

  // 3. merge compressed collections
  _mergeCompressedCollections(path, usableIndexes, documentMaximums);

  // 4. merge the indexes
  _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums);

  // 5. write the manifest file
  _writeMergedManifest(path, firstManifest);
}
Exemplo n.º 2
0
void indri::collection::Repository::makeEmpty(const std::string& path) {
  Repository empty;
  empty.create(path);
  empty.close();
}