Exemple #1
0
      // Opens a TCP listening socket bound to all local interfaces (INADDR_ANY)
      // on the given port, with a backlog of 8 pending connections.
      // Throws LEMUR_IO_ERROR if bind or listen fails; returns true on success.
      // NOTE(review): the ::socket() return value is not checked here — a failed
      // socket creation would only surface as a bind error below; confirm intended.
      bool listen( unsigned int port ) {
        int result;
 
        // Platform-specific network startup (e.g. Winsock initialization on Windows).
        lemur_compat::initializeNetwork();
        _socket = ::socket( AF_INET, SOCK_STREAM, 0 );

        // Build the local address to bind: any interface, requested port,
        // with the unused padding bytes zeroed.
        sockaddr_in sa;
        sa.sin_addr.s_addr = INADDR_ANY;
        sa.sin_port = htons(port);
        sa.sin_family = AF_INET;
        memset( &sa.sin_zero, 0, sizeof sa.sin_zero );

        result = ::bind( _socket, (const sockaddr*) &sa, sizeof sa );
        if( result ) {
          // close() releases _socket before reporting the failure.
          close();
          LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to bind port " + i64_to_string(port) );
        }
  
        result = ::listen( _socket, 8 );

        if( result ) {
          close();
          LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to listen on port " + i64_to_string(port) );
        }

        return true;
      }
// Sends a framed binary reply: a "BRPY <size> <name>\n" text header,
// followed by exactly <size> raw bytes from the supplied buffer.
void indri::net::NetworkMessageStream::reply( const std::string& name, const void* buffer, unsigned int size ) {
  std::string header( "BRPY " );
  header.append( i64_to_string( size ) );
  header.append( " " );
  header.append( name );
  header.append( "\n" );

  // Header first, then the payload; blockingWrite ensures the full body is sent.
  _stream->write( header.c_str(), header.length() );
  _stream->blockingWrite( buffer, size );
}
// Sends a framed XML reply: the node is serialized to text, then written as
// an "XRPY <length>\n" header followed by the serialized body.
void indri::net::NetworkMessageStream::reply( indri::xml::XMLNode* replyNode ) {
  std::string serialized;
  indri::xml::XMLWriter writer(replyNode);
  writer.write(serialized);

  std::string header( "XRPY " );
  header += i64_to_string( serialized.size() );
  header += "\n";

  _stream->write( header.c_str(), header.size() );
  _stream->blockingWrite( serialized.c_str(), serialized.length() );
}
Exemple #4
0
// Handles a documentIDsFromMetadata request: decodes the attribute name and
// candidate values from the XML request, queries the server, and replies with
// a <documentIDs> node containing one <documentID> child per matching document.
void indri::net::NetworkServerStub::_handleDocumentIDsFromMetadata( indri::xml::XMLNode* request ) {
  // Pull the metadata attribute name and value list out of the request.
  std::string attribute;
  std::vector<std::string> values;
  _decodeMetadataRequest( request, attribute, values );

  // Ask the server for the matching document IDs.
  indri::server::QueryServerDocumentIDsResponse* idResponse =
    _server->documentIDsFromMetadata( attribute, values );
  const std::vector<lemur::api::DOCID_T>& ids = idResponse->getResults();

  // Build the reply node before releasing the server response.
  indri::xml::XMLNode* response = new indri::xml::XMLNode( "documentIDs" );
  for( size_t i=0; i<ids.size(); i++ )
    response->addChild( new indri::xml::XMLNode( "documentID", i64_to_string( ids[i] ) ) );

  delete idResponse;

  _stream->reply( response );
  _stream->replyDone();
  delete response;
}
Exemple #5
0
// Serializes a ParsedDocument into a <document> XML node for transmission.
// Optional children (<metadata>, <text>, <content>/<contentLength>, <positions>)
// are only added when the corresponding document data is present.
// The caller owns the returned node; child nodes are owned by their parents
// via addChild, so deleting the returned node releases the whole tree.
indri::xml::XMLNode* indri::net::NetworkServerStub::_encodeDocument( const struct indri::api::ParsedDocument* document ) {
  indri::xml::XMLNode* docNode = new indri::xml::XMLNode( "document" );

  indri::xml::XMLNode* metadata = 0; 
  indri::xml::XMLNode* textNode = 0;
  indri::xml::XMLNode* contentNode = 0;
  indri::xml::XMLNode* contentLengthNode = 0;
  indri::xml::XMLNode* positions = 0;

  if( document->metadata.size() ) {
    metadata = new indri::xml::XMLNode( "metadata" );

    // Each metadata pair becomes <datum><key>...</key><value>...</value></datum>.
    // Values are base64-encoded since they may contain arbitrary bytes.
    for( size_t j=0; j<document->metadata.size(); j++ ) {
      indri::xml::XMLNode* keyNode = new indri::xml::XMLNode( "key", document->metadata[j].key );
      std::string value = base64_encode( document->metadata[j].value, document->metadata[j].valueLength );
      indri::xml::XMLNode* valNode = new indri::xml::XMLNode( "value", value );

      indri::xml::XMLNode* datum = new indri::xml::XMLNode( "datum" );
      datum->addChild( keyNode );
      datum->addChild( valNode );

      metadata->addChild( datum );
    }
  }

  if( document->text ) {
    // Raw text is base64-encoded so it travels safely inside XML.
    std::string text = base64_encode( document->text, (int)document->textLength );
    textNode = new indri::xml::XMLNode( "text", text );
  }

  if( document->content && document->text ) {
    // content points into the text buffer; send it as an offset so the
    // receiver can reconstruct the pointer after decoding the text.
    INT64 contentOffset = document->content - document->text;
    contentNode = new indri::xml::XMLNode( "content", i64_to_string(contentOffset) );
    contentLengthNode = new indri::xml::XMLNode( "contentLength", i64_to_string(document->contentLength) );
  }

  if( document->positions.size() ) {
    positions = new indri::xml::XMLNode( "positions" );

    // Each term position becomes <position><begin>..</begin><end>..</end></position>.
    for( size_t j=0; j<document->positions.size(); j++ ) {
      indri::xml::XMLNode* position = new indri::xml::XMLNode( "position" );
      indri::xml::XMLNode* begin = new indri::xml::XMLNode( "begin", i64_to_string( document->positions[j].begin ) );
      indri::xml::XMLNode* end = new indri::xml::XMLNode( "end", i64_to_string( document->positions[j].end ) );
      
      position->addChild( begin );
      position->addChild( end );

      positions->addChild( position );
    }
  }

  // Attach only the sections that were actually built.
  if( metadata )
    docNode->addChild( metadata );

  if( textNode )
    docNode->addChild( textNode );

  if( contentNode ) {
    // contentLengthNode is always created alongside contentNode above.
    docNode->addChild( contentNode );
    docNode->addChild( contentLengthNode );
  }

  if( positions )
    docNode->addChild( positions );

  return docNode;
}
Exemple #6
0
void indri::net::NetworkServerStub::_sendNumericResponse( const char* responseName, UINT64 number ) {
  indri::xml::XMLNode* response = new indri::xml::XMLNode( responseName, i64_to_string(number) );
  _stream->reply( response );
  _stream->replyDone();
  delete response;
}
Exemple #7
0
// Handles a documentVectors request: the request's children each carry one
// document ID; the server is asked for the corresponding document vectors,
// and the reply is a <document-vector> node with one <document> child per
// vector, each containing <stems>, <positions>, and <fields> sections.
// Fix: the `children` reference was previously bound but the decode loop
// re-fetched request->getChildren() each iteration; likewise `positionsVector`
// was bound but the loop condition re-called docVector->positions().
void indri::net::NetworkServerStub::_handleDocumentVectors( indri::xml::XMLNode* request ) {
  const std::vector<indri::xml::XMLNode*>& children = request->getChildren();
  indri::xml::XMLNode* response = new indri::xml::XMLNode( "document-vector" );

  // convert doc IDs into an array
  std::vector<lemur::api::DOCID_T> documentIDs;
  for( size_t i=0; i<children.size(); i++ ) {
    documentIDs.push_back( (lemur::api::DOCID_T) string_to_i64( children[i]->getValue() ) );
  }

  // get the document vectors from the index
  indri::server::QueryServerVectorsResponse* vectorsResponse = _server->documentVectors( documentIDs );

  for( size_t i=0; i<vectorsResponse->getResults().size(); i++ ) {
    indri::api::DocumentVector* docVector = vectorsResponse->getResults()[i];

    indri::xml::XMLNode* docResponse = new indri::xml::XMLNode( "document" );
    indri::xml::XMLNode* stems = new indri::xml::XMLNode( "stems" );
    indri::xml::XMLNode* positions = new indri::xml::XMLNode( "positions" );
    indri::xml::XMLNode* fields = new indri::xml::XMLNode( "fields" );

    // Stems are base64-encoded: they may contain bytes unsafe in XML text.
    const std::vector<std::string>& stemsVector = docVector->stems();

    for( size_t j=0; j<stemsVector.size(); j++ ) {
      const std::string& stem = stemsVector[j];
      std::string encoded = base64_encode( stem.c_str(), (int)stem.length() );
      stems->addChild( new indri::xml::XMLNode( "stem", encoded ) );
    }

    const std::vector<int>& positionsVector = docVector->positions();

    for( size_t j=0; j<positionsVector.size(); j++ ) {
      positions->addChild( new indri::xml::XMLNode( "position", i64_to_string( positionsVector[j] ) ) );
    }

    // Each field extent is serialized with its name, extent numbers, and
    // ordinal/parentOrdinal links into the field tree.
    for( size_t j=0; j<docVector->fields().size(); j++ ) {
      indri::xml::XMLNode* field = new indri::xml::XMLNode( "field" );

      std::string number = i64_to_string( docVector->fields()[j].number );
      std::string begin = i64_to_string( docVector->fields()[j].begin );
      std::string end = i64_to_string( docVector->fields()[j].end );
      std::string ordinal = i64_to_string( docVector->fields()[j].ordinal );
      std::string pOrdinal = i64_to_string( docVector->fields()[j].parentOrdinal );

      field->addChild( new indri::xml::XMLNode( "name", docVector->fields()[j].name ) );
      field->addChild( new indri::xml::XMLNode( "number", number ) );
      field->addChild( new indri::xml::XMLNode( "begin", begin ) );
      field->addChild( new indri::xml::XMLNode( "end", end ) );
      field->addChild( new indri::xml::XMLNode( "ordinal", ordinal ) );
      field->addChild( new indri::xml::XMLNode( "parentOrdinal", pOrdinal ) );

      fields->addChild( field );
    }

    docResponse->addChild(stems);
    docResponse->addChild(positions);
    docResponse->addChild(fields);

    response->addChild( docResponse );

    delete docVector;
  }

  _stream->reply( response );
  _stream->replyDone();

  delete response;
  delete vectorsResponse;
}
// Stores an integer parameter by forwarding its decimal string form
// to the string overload of set().
void indri::api::Parameters::set( const std::string& key, INT64 value ) {
  std::string asText = i64_to_string(value);
  set( key, asText );
}
Exemple #9
0
// Merges the repositories listed in inputIndexes into a new repository at path.
// Preconditions checked per input: same stemmer, same indexed fields, and at
// most one internal disk index (already internally merged). Empty inputs (or
// all-empty inputs) produce an empty repository. Throws via LEMUR_THROW /
// LEMUR_RETHROW on missing repositories or incompatible inputs.
void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) {
  // Create the directory for the output index
  _cleanAndCreateDirectory(path);

  std::string indexPath = indri::file::Path::combine(path, "index");
  std::string collectionPath = indri::file::Path::combine(path, "collection");

  // First, we're going to harvest information from the individual indexes.  We want to 
  // check a few things:
  //    1. do they all use the same stemmer?
  //    2. do they all have the same indexed fields?
  //    3. are they all merged (only have one disk index?)
  //    4. how many documents are in each one?

  // If no indexes are given, make an empty repository and return
  if (inputIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  // documentMaximums[i] holds the document count ceiling for inputIndexes[i];
  // the two vectors are kept index-aligned throughout.
  std::vector<lemur::api::DOCID_T> documentMaximums;

  // Open up the first repository and extract field information
  Repository firstRepository;
  try {
    firstRepository.openRead(inputIndexes[0]);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }
  std::vector<Field> indexFields = firstRepository.fields();
  firstRepository.close();

  // Open up the first manifest and check on stemming and fields
  indri::api::Parameters firstManifest;
  std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest");
  try {
    firstManifest.loadFile(firstManifestPath);
  } catch(lemur::api::Exception& e) {
    LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]);
  }

  // The first repository's stemmer and field list are the reference that
  // every other input must match.
  std::string stemmerName = _stemmerName(firstManifest);
  std::vector<std::string> fieldNames = _fieldNames(firstManifest);

  // Now, gather information about the indexes
  for(size_t i=0; i<inputIndexes.size(); i++) {
    indri::api::Parameters repositoryManifest;
    std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest");

    try {
      repositoryManifest.loadFile(manifestPath);
    } catch(lemur::api::Exception& e) {
      LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]);
    }

    // No internal index at all: record a zero so the alignment with
    // inputIndexes is preserved; it will be filtered out below.
    if (!repositoryManifest.exists("indexes.index")) {
      documentMaximums.push_back(0);
      continue;
    }

    // Check to make sure there's only one index in there
    size_t indexCount = repositoryManifest["indexes.index"].size();

    if (indexCount > 1) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]);
    }

    // How many documents are in this one?
    indri::index::DiskIndex diskIndex;
    std::string basePath = indri::file::Path::combine(inputIndexes[i], "index");
    std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]);
    diskIndex.open(basePath, relativePath);

    documentMaximums.push_back(diskIndex.documentMaximum());
    diskIndex.close();

    // Only check successive indexes against the first one
    if (i == 0)
      continue;

    // Verify that the same fields and stemmers are used
    if (stemmerName != _stemmerName(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]);
    }

    if (fieldNames != _fieldNames(repositoryManifest)) {
      LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]);
    }
  } 
  
  std::vector<std::string> usableIndexes = inputIndexes;
  
  // remove any repositories that have no documents
  // (erase from both vectors at once to keep them index-aligned; i-- so
  // the element shifted into slot i is examined on the next iteration)
  for(size_t i=0; i<usableIndexes.size(); i++) {
    if (documentMaximums[i] == 0) {
        documentMaximums.erase(documentMaximums.begin() + i);
        usableIndexes.erase(usableIndexes.begin() + i);
        i--;
    }
  }      
  
  // now that we've removed empty indexes, are there any left?
  if (usableIndexes.size() == 0) {
      makeEmpty(path);
      return;
  }

  // 2. merge the deleted bitmaps
  _mergeBitmaps(path, usableIndexes, documentMaximums);

  // 3. merge compressed collections
  _mergeCompressedCollections(path, usableIndexes, documentMaximums);

  // 4. merge the indexes
  _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums);

  // 5. write the manifest file
  _writeMergedManifest(path, firstManifest);
}
// Retrieves a document by ID from the compressed collection: looks up its
// file offset, inflates the stored record into a buffer, and rebuilds a
// ParsedDocument in place over that buffer (placement new), wiring up its
// text/content/metadata pointers into the decompressed data.
// The caller takes ownership of the returned buffer (output.detach()).
// Throws LEMUR_IO_ERROR if the document ID is not in the lookup table.
// Fix: z_stream.opaque was left uninitialized; zlib requires zalloc, zfree
// AND opaque to be set before inflateInit, and passes opaque to the
// allocation callbacks.
indri::api::ParsedDocument* indri::collection::CompressedCollection::retrieve( int documentID ) {
  indri::thread::ScopedLock l( _lock );

  UINT64 offset;
  int actual;
  
  // Find the on-disk offset of this document's compressed record.
  if( !_lookup.get( documentID, &offset, actual, sizeof offset ) ) {
    LEMUR_THROW( LEMUR_IO_ERROR, "Unable to find document " + i64_to_string(documentID) + " in the collection." );
  }

  // flush output buffer; make sure all data is on disk
  if( _output )
    _output->flush();

  // decompress the data
  indri::utility::Buffer output;
  z_stream_s stream;
  stream.zalloc = zlib_alloc;
  stream.zfree = zlib_free;
  stream.opaque = Z_NULL;

  inflateInit( &stream );

  zlib_read_document( stream, _storage, offset, output );
  int decompressedSize = stream.total_out;

  // initialize the buffer as a ParsedDocument
  // (the front of the buffer is reserved for the ParsedDocument object itself;
  // the decompressed record follows it)
  indri::api::ParsedDocument* document = (indri::api::ParsedDocument*) output.front();
  new(document) indri::api::ParsedDocument;

  document->text = 0;
  document->textLength = 0;
  document->content = 0;
  document->contentLength = 0;

  // get the number of fields (it's the last byte)
  char* dataStart = output.front() + sizeof(indri::api::ParsedDocument);
  int fieldCount = copy_quad( dataStart + decompressedSize - 4 );
  // The record ends with a (keyStart, valueStart) offset pair per field,
  // followed by the 4-byte field count; endOffset marks where that
  // trailer begins, i.e. where the key/value data ends.
  int endOffset = decompressedSize - 4 - 2*fieldCount*sizeof(UINT32);
  char* arrayStart = dataStart + endOffset;

  const char* positionData = 0;
  int positionDataLength = 0;

  // store metadata
  for( int i=0; i<fieldCount; i++ ) {
    int keyStart = copy_quad( arrayStart + 2*i*sizeof(UINT32) );
    int valueStart = copy_quad( arrayStart + (2*i+1)*sizeof(UINT32) );
    int valueEnd;

    // A value runs until the next field's key begins; the last value runs
    // to the end of the key/value region.
    if( i==(fieldCount-1) ) {
      valueEnd = endOffset;
    } else {
      valueEnd = copy_quad( arrayStart + 2*(i+1)*sizeof(UINT32) );
    }

    indri::parse::MetadataPair pair;
    pair.key = dataStart + keyStart;
    pair.value = dataStart + valueStart;
    pair.valueLength = valueEnd - valueStart;

    // extract text
    if( !strcmp( pair.key, TEXT_KEY ) ) {
      document->text = (char*) pair.value;
      document->textLength = pair.valueLength;
    }

    // extract content (stored as an offset into the text buffer)
    if( !strcmp( pair.key, CONTENT_KEY ) ) {
      document->content = document->text + copy_quad( (char*) pair.value );
    }

    // extract content length
    if( !strcmp( pair.key, CONTENTLENGTH_KEY ) ) {
      document->contentLength = copy_quad( (char *)pair.value );
    }

    // remember where the compressed term positions live; decoded below
    if( !strcmp( pair.key, POSITIONS_KEY ) ) {
      positionData = (char*) pair.value;
      positionDataLength = pair.valueLength;
    }

    document->metadata.push_back( pair );
  }

  // decompress positions
  _readPositions( document, positionData, positionDataLength );

  // hand the buffer's memory to the caller
  output.detach();
  return document;
}