예제 #1
0
//
// _writeMergedManifest
//
// Rewrites the supplied manifest so that it describes exactly one index
// ("indexCount" = 1, "indexes"/"index" = 0) and saves it under the name
// "manifest" combined with <path>.  Note that firstManifest is modified
// in place.
//
void indri::collection::Repository::_writeMergedManifest(const std::string& path, indri::api::Parameters& firstManifest) {
  // a single index remains, and it is numbered 0
  firstManifest.set("indexCount", 1);
  firstManifest["indexes"].set("index", 0);

  // persist the updated manifest
  std::string mergedManifestFile( indri::file::Path::combine(path, "manifest") );
  firstManifest.writeFile(mergedManifestFile);
}
예제 #2
0
//
// termscorefunctionfactory_parse
//
// Splits a specification of the form "key:value,key:value,..." and stores
// every pair in <converted> via converted.set( key, value ).
//
//   converted - receives one entry per key found in <spec>
//   spec      - comma-separated list of key:value entries
//
// Details:
//   * the key ends at the first ':' inside an entry; any further ':'
//     characters belong to the value ("a:b:c" -> key "a", value "b:c")
//   * an entry without a ':' is stored as a key with an empty value
//   * empty entries (leading, trailing or doubled commas) are skipped
//
static void termscorefunctionfactory_parse( indri::api::Parameters& converted, const std::string& spec ) {
    // Positions are kept as size_type and compared against npos; storing the
    // result of find() in an int turns "not found" into -1 and makes the
    // substr() arithmetic depend on unsigned wrap-around.
    const std::string::size_type length = spec.size();
    std::string::size_type location = 0;

    while( location < length ) {
        // the current entry spans [location, entryEnd)
        std::string::size_type entryEnd = spec.find( ',', location );
        if( entryEnd == std::string::npos )
            entryEnd = length;

        if( entryEnd > location ) {
            std::string key;
            std::string value;

            // only a colon inside the current entry separates key from value;
            // a colon that belongs to a later entry must not be used here
            const std::string::size_type colon = spec.find( ':', location );

            if( colon == std::string::npos || colon >= entryEnd ) {
                key = spec.substr( location, entryEnd - location );
            } else {
                key = spec.substr( location, colon - location );
                value = spec.substr( colon + 1, entryEnd - colon - 1 );
            }

            converted.set( key, value );
        }

        // step over the comma (or past the end of the string)
        location = entryEnd + 1;
    }
}
예제 #3
0
  void _mergeFields() {
    if( repo.exists("field") ) {
      indri::api::Parameters result;
      indri::api::Parameters fields = repo["field"];
      for (size_t i = 0; i < fields.size(); i++) {
        indri::api::Parameters oldField = fields[i];
        std::string fieldName = oldField["name"];
        bool keep = true;
        for (size_t j = 0; j < removeNames.size(); j++) {
          if (removeNames[j] == fieldName) {
            keep = false;
            break;
          }
        }
        if (keep) {
          bool isNumeric = oldField.get("numeric", false);
          bool isOrdinal = oldField.get("ordinal", false);
          bool isParental = oldField.get("parental", false);
          std::string parserName = oldField.get("parserName", 
                                                isNumeric ? "NumericFieldAnnotator" : "");
          indri::api::Parameters field = result.append("field");
          field.set( "name", fieldName );
          field.set( "numeric", isNumeric );
          field.set( "parserName", parserName );
          field.set( "ordinal", isOrdinal );
          field.set( "parental", isParental );
        }
      }
      if (adding) {
        indri::api::Parameters newFields = addFields["field"];
        for (size_t i = 0; i < newFields.size(); i++) {
          indri::api::Parameters newField = newFields[i];
          std::string fieldName = newField["name"];
          bool dupe = false;
          for( size_t j = 0; j<fields.size(); j++ ) {
            std::string parameterFieldName = fields[j]["name"];
            if( parameterFieldName == fieldName ) {
              // it's already in there, skip it...
              dupe = true; 
              break;
            }
          }
          if (dupe) continue;

          bool isNumeric = newField.get("numeric", false);
          bool isOrdinal = newField.get("ordinal", false);
          bool isParental = newField.get("parental", false);
          std::string parserName = newField.get("parserName", 
                                                isNumeric ? "OffsetAnnotationAnnotator" : "");
          indri::api::Parameters field = result.append("field");
          field.set( "name", fieldName );
          field.set( "numeric", isNumeric );
          field.set( "parserName", parserName );
          field.set( "ordinal", isOrdinal );
          field.set( "parental", isParental );
        }
      }
      if (result.exists("field")) {
        repo.set("field");
        repo["field"] = result["field"];
      } else {
        // no fields in new index...
        if (repo.exists("field")) {
          repo.remove("field");
        }
      }
    } else {
      // no fields in original, just insert
      // the new ones. (test they exist...)
      if (adding) {
        repo.set("field");
        repo["field"] = addFields["field"];
      }
    }
  }
예제 #4
0
  /// Rebuild the index named by the "index" parameter with a modified set
  /// of fields, then move the rebuilt index into the place of the original.
  ///
  /// Parameters read from `param`:
  ///   index        - path of the repository to modify
  ///   annotations  - path of the offset annotation file (opened only when
  ///                  fields are being added)
  ///   addField     - optional; field definitions to add
  ///   removeField  - optional; entries whose "name" is a field to remove
  ///   fileclass    - optional; file class name used to look up a conflater
  ///   memory       - optional; defaults to 100*1024*1024
  ///
  /// The new index is built in "<index>.new".  If the source repository
  /// reports deleted documents, a message is written to std::cerr and the
  /// function returns before anything is created or modified on disk.
  void processFields( indri::api::Parameters &param ) {
    g_timer.start();
    std::string index = param.get("index");
    std::cout << "Opening: " << index << std::endl;
    // make sure this path doesn't exist.
    std::string idx2 = index + ".new"; // temp target index.

    // presumes a single input oa file for the entire collection.
    std::string offsetAnnotationsPath = param.get("annotations");
      
    /// these need to be combined with existing.
    // fields to add
    // these need to supply numeric/parental/ordinal/etc...
    if (param.exists("addField"))
      addFields = param["addField"];
      
    // fields to remove
    // these only need to be a list of names.
    if (param.exists("removeField")) {
      indri::api::Parameters slice = param["removeField"];
      for (size_t i = 0; i < slice.size(); i++) {
        if( slice[i].exists("name") ) {
          removeNames.push_back( slice[i]["name"] );
        }
      }
    }
      
    // need to know the file class environment to get the 
    // conflations right.
    std::string className = param.get("fileclass", "");

    indri::collection::Repository sourceRepo;
    indri::collection::Repository targetRepo;
    indri::parse::OffsetAnnotationAnnotator oa_annotator;
    indri::parse::FileClassEnvironmentFactory _fileClassFactory;
            
    // Open source repo
    sourceRepo.openRead(index);

    // Deleted documents are not handled by the rewrite loop below, so
    // refuse to continue when there are any.  This is checked before the
    // target repository is created, so the error path does not leave a
    // half-built "<index>.new" behind.
    // ought to deal with deleted documents here...
    // if there are deleted documents, regular add to collection
    // if not, only rewrite the indexes, then rename the collection.
    indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList();
    UINT64 delCount = deleted.deletedCount();
    if (delCount > 0) 
      {
        // either warn, compact and then process, or 
        // do it the old way... FIXME!
        std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl;
        sourceRepo.close();
        return;
      }

    // Copy its parameters, create target repo, adding or removing
    // fields.
    repo.loadFile( indri::file::Path::combine( index, "manifest" ) );
    int mem = param.get("memory", INT64(100*1024*1024));
      
    repo.set("memory", mem);
    adding = addFields.exists("field");
    _mergeFields();
    // Create the offset annotator.
    fce = _fileClassFactory.get( className );
    indri::parse::Conflater* conflater = 0;
    if( fce ) {
      conflater = fce->conflater;
    }
    // the annotator is only needed (and only opened) when adding fields
    if (adding) 
      {
        oa_annotator.setConflater( conflater );
        oa_annotator.open( offsetAnnotationsPath );
      }

    targetRepo.create(idx2, &repo);
      
    // for each document in the source repo, fetch ParsedDocument 
    // construct full rep, apply annotator, insert into
    // target repo.

    _index = sourceRepo.indexes()->front(); // presume 1
    _docIter = _index->termListFileIterator();
    _docIter->startIteration();
    
    // document ids run from 1 to documentCount() inclusive
    for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) 
      {
        // progress report every 500 documents
        if ((docid % 500) == 0)  {
          g_timer.printElapsedSeconds(std::cout);
          std::cout << ": " << docid << "\r";
          std::cout.flush();
        }

        parsed = sourceRepo.collection()->retrieve(docid);
        // combine field and term data with parsed document
        // NOTE(review): _mergeData is defined elsewhere; presumably it
        // reads the current _docIter entry, which is advanced below
        _mergeData();
        // apply annotator
        if (adding)
          parsed = oa_annotator.transform(parsed);
        targetRepo.addDocument(parsed, false);
        // TagList allocs memory for the tags...
        for (size_t i = 0; i < parsed->tags.size(); i++)
          delete(parsed->tags[i]);
        delete(parsed);
        // keep the term list iterator in step with docid
        _docIter->nextEntry();
      }
    std::cout << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": " << _index->documentCount() << std::endl;
    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": closing"  << std::endl;

    targetRepo.close();
    sourceRepo.close();
    std::string oldcollectionPath = indri::file::Path::combine( index, "collection" );
    std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" );
    // clone the collection: the "collection" entry of the new index is
    // replaced by the one from the source index
    indri::file::Path::remove(newcollectionPath);
    indri::file::Path::rename(oldcollectionPath, newcollectionPath);
    // rename target repo to source repo.
    indri::file::Path::remove(index);
    indri::file::Path::rename(idx2, index);

    g_timer.printElapsedSeconds(std::cout);
    std::cout << ": done"  << std::endl;
  }