// Opens a listening TCP socket on the given port (any local interface).
// Throws LEMUR_IO_ERROR if bind or listen fails; returns true on success.
// The socket handle is stored in the member _socket.
bool listen( unsigned int port ) {
  int result;

  // Platform setup (e.g. Winsock initialization on Windows).
  lemur_compat::initializeNetwork();

  // NOTE(review): the return value of ::socket is not checked here; a failed
  // socket creation would surface later as a bind failure — confirm whether
  // an explicit check is wanted (the portable failure value depends on the
  // platform type of _socket).
  _socket = ::socket( AF_INET, SOCK_STREAM, 0 );

  // Bind to INADDR_ANY:port.
  sockaddr_in sa;
  sa.sin_addr.s_addr = INADDR_ANY;
  sa.sin_port = htons(port);
  sa.sin_family = AF_INET;
  memset( &sa.sin_zero, 0, sizeof sa.sin_zero );

  result = ::bind( _socket, (const sockaddr*) &sa, sizeof sa );
  if( result ) {
    close();
    LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to bind port " + i64_to_string(port) );
  }

  // Accept queue depth of 8 pending connections.
  result = ::listen( _socket, 8 );
  if( result ) {
    close();
    LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to listen on port " + i64_to_string(port) );
  }

  return true;
}
// Sends a binary reply: a header line of the form "BRPY <size> <name>\n"
// followed by exactly <size> raw bytes from `buffer`.
void indri::net::NetworkMessageStream::reply( const std::string& name, const void* buffer, unsigned int size ) {
  std::string headerLine("BRPY ");
  headerLine.append( i64_to_string( size ) );
  headerLine.append( " " );
  headerLine.append( name );
  headerLine.append( "\n" );

  _stream->write( headerLine.c_str(), headerLine.length() );
  _stream->blockingWrite( buffer, size );
}
// Sends an XML reply: serializes `replyNode`, then writes a header line of
// the form "XRPY <bodyLength>\n" followed by the serialized body.
void indri::net::NetworkMessageStream::reply( indri::xml::XMLNode* replyNode ) {
  indri::xml::XMLWriter writer(replyNode);
  std::string body;
  writer.write(body);

  std::string header("XRPY ");
  header.append( i64_to_string( body.size() ) );
  header.append( "\n" );

  _stream->write( header.c_str(), header.size() );
  _stream->blockingWrite( body.c_str(), body.length() );
}
// Handles a documentIDsFromMetadata request: decodes the attribute name and
// candidate values, asks the server for matching document IDs, and replies
// with a <documentIDs> node containing one <documentID> child per match.
void indri::net::NetworkServerStub::_handleDocumentIDsFromMetadata( indri::xml::XMLNode* request ) {
  // Unpack the metadata constraints from the request.
  std::string attributeName;
  std::vector<std::string> attributeValues;
  _decodeMetadataRequest( request, attributeName, attributeValues );

  // Ask the server for the matching document IDs.
  indri::server::QueryServerDocumentIDsResponse* documentIDresponse =
    _server->documentIDsFromMetadata( attributeName, attributeValues );
  const std::vector<lemur::api::DOCID_T>& documentIDs = documentIDresponse->getResults();

  // Encode every ID as a child node of the response.
  indri::xml::XMLNode* response = new indri::xml::XMLNode( "documentIDs" );
  std::vector<lemur::api::DOCID_T>::const_iterator iter;
  for( iter = documentIDs.begin(); iter != documentIDs.end(); ++iter ) {
    response->addChild( new indri::xml::XMLNode( "documentID", i64_to_string( *iter ) ) );
  }
  delete documentIDresponse;

  _stream->reply( response );
  _stream->replyDone();
  delete response;
}
// Serializes a ParsedDocument into an XML <document> tree for network
// transport. Optional sections (<metadata>, <text>, <content>/<contentLength>,
// <positions>) are emitted only when the corresponding document data exists.
// Returns a newly allocated XMLNode; the caller owns it.
indri::xml::XMLNode* indri::net::NetworkServerStub::_encodeDocument( const struct indri::api::ParsedDocument* document ) {
  indri::xml::XMLNode* docNode = new indri::xml::XMLNode( "document" );
  // Section nodes stay null unless the document provides that data.
  indri::xml::XMLNode* metadata = 0;
  indri::xml::XMLNode* textNode = 0;
  indri::xml::XMLNode* contentNode = 0;
  indri::xml::XMLNode* contentLengthNode = 0;
  indri::xml::XMLNode* positions = 0;

  // <metadata>: one <datum> per key/value pair; values are base64-encoded
  // since they may contain arbitrary bytes.
  if( document->metadata.size() ) {
    metadata = new indri::xml::XMLNode( "metadata" );

    for( size_t j=0; j<document->metadata.size(); j++ ) {
      indri::xml::XMLNode* keyNode = new indri::xml::XMLNode( "key", document->metadata[j].key );
      std::string value = base64_encode( document->metadata[j].value, document->metadata[j].valueLength );
      indri::xml::XMLNode* valNode = new indri::xml::XMLNode( "value", value );
      indri::xml::XMLNode* datum = new indri::xml::XMLNode( "datum" );
      datum->addChild( keyNode );
      datum->addChild( valNode );
      metadata->addChild( datum );
    }
  }

  // <text>: raw document text, base64-encoded.
  if( document->text ) {
    std::string text = base64_encode( document->text, (int)document->textLength );
    textNode = new indri::xml::XMLNode( "text", text );
  }

  // <content>: sent as a byte offset into the text rather than a copy, so the
  // receiver can reconstruct the content pointer. Requires both pointers.
  if( document->content && document->text ) {
    INT64 contentOffset = document->content - document->text;
    contentNode = new indri::xml::XMLNode( "content", i64_to_string(contentOffset) );
    contentLengthNode = new indri::xml::XMLNode( "contentLength", i64_to_string(document->contentLength) );
  }

  // <positions>: term extents as <begin>/<end> pairs.
  if( document->positions.size() ) {
    positions = new indri::xml::XMLNode( "positions" );

    for( size_t j=0; j<document->positions.size(); j++ ) {
      indri::xml::XMLNode* position = new indri::xml::XMLNode( "position" );
      indri::xml::XMLNode* begin = new indri::xml::XMLNode( "begin", i64_to_string( document->positions[j].begin ) );
      indri::xml::XMLNode* end = new indri::xml::XMLNode( "end", i64_to_string( document->positions[j].end ) );
      position->addChild( begin );
      position->addChild( end );
      positions->addChild( position );
    }
  }

  // Attach only the sections that were built; contentLengthNode always
  // accompanies contentNode (both are created in the same branch above).
  if( metadata )
    docNode->addChild( metadata );
  if( textNode )
    docNode->addChild( textNode );
  if( contentNode ) {
    docNode->addChild( contentNode );
    docNode->addChild( contentLengthNode );
  }
  if( positions )
    docNode->addChild( positions );

  return docNode;
}
void indri::net::NetworkServerStub::_sendNumericResponse( const char* responseName, UINT64 number ) { indri::xml::XMLNode* response = new indri::xml::XMLNode( responseName, i64_to_string(number) ); _stream->reply( response ); _stream->replyDone(); delete response; }
// Handles a documentVectors request: each child of the request holds a
// document ID; for each ID the server's document vector is encoded as a
// <document> node (with <stems>, <positions>, <fields> sections) under a
// single <document-vector> response.
//
// Fixes: the `children` reference was computed but never used while the
// decode loop re-called request->getChildren(); the positions loop bound now
// uses the cached `positionsVector` for consistency with its body.
void indri::net::NetworkServerStub::_handleDocumentVectors( indri::xml::XMLNode* request ) {
  const std::vector<indri::xml::XMLNode*>& children = request->getChildren();
  indri::xml::XMLNode* response = new indri::xml::XMLNode( "document-vector" );

  // convert doc IDs into an array
  std::vector<lemur::api::DOCID_T> documentIDs;
  for( size_t i=0; i<children.size(); i++ ) {
    documentIDs.push_back( (lemur::api::DOCID_T) string_to_i64( children[i]->getValue() ) );
  }

  // get the document vectors from the index
  indri::server::QueryServerVectorsResponse* vectorsResponse = _server->documentVectors( documentIDs );

  for( size_t i=0; i<vectorsResponse->getResults().size(); i++ ) {
    indri::api::DocumentVector* docVector = vectorsResponse->getResults()[i];
    indri::xml::XMLNode* docResponse = new indri::xml::XMLNode( "document" );
    indri::xml::XMLNode* stems = new indri::xml::XMLNode( "stems" );
    indri::xml::XMLNode* positions = new indri::xml::XMLNode( "positions" );
    indri::xml::XMLNode* fields = new indri::xml::XMLNode( "fields" );

    // <stems>: base64-encode each stem so arbitrary bytes survive transport.
    const std::vector<std::string>& stemsVector = docVector->stems();
    for( size_t j=0; j<stemsVector.size(); j++ ) {
      const std::string& stem = stemsVector[j];
      std::string encoded = base64_encode( stem.c_str(), (int)stem.length() );
      stems->addChild( new indri::xml::XMLNode( "stem", encoded ) );
    }

    // <positions>: term position list.
    const std::vector<int>& positionsVector = docVector->positions();
    for( size_t j=0; j<positionsVector.size(); j++ ) {
      int position = positionsVector[j];
      positions->addChild( new indri::xml::XMLNode( "position", i64_to_string(position) ) );
    }

    // <fields>: one <field> per extent, including ordinal/parentOrdinal links.
    for( size_t j=0; j<docVector->fields().size(); j++ ) {
      indri::xml::XMLNode* field = new indri::xml::XMLNode( "field" );
      std::string number = i64_to_string( docVector->fields()[j].number );
      std::string begin = i64_to_string( docVector->fields()[j].begin );
      std::string end = i64_to_string( docVector->fields()[j].end );
      std::string ordinal = i64_to_string( docVector->fields()[j].ordinal );
      std::string pOrdinal = i64_to_string( docVector->fields()[j].parentOrdinal );

      field->addChild( new indri::xml::XMLNode( "name", docVector->fields()[j].name ) );
      field->addChild( new indri::xml::XMLNode( "number", number ) );
      field->addChild( new indri::xml::XMLNode( "begin", begin ) );
      field->addChild( new indri::xml::XMLNode( "end", end ) );
      field->addChild( new indri::xml::XMLNode( "ordinal", ordinal ) );
      field->addChild( new indri::xml::XMLNode( "parentOrdinal", pOrdinal ) );
      fields->addChild( field );
    }

    docResponse->addChild(stems);
    docResponse->addChild(positions);
    docResponse->addChild(fields);
    response->addChild( docResponse );
    delete docVector;
  }

  _stream->reply( response );
  _stream->replyDone();
  delete response;
  delete vectorsResponse;
}
// Stores an integer parameter by delegating to the string overload after
// converting `value` to its decimal text form.
void indri::api::Parameters::set( const std::string& key, INT64 value ) {
  std::string textValue = i64_to_string(value);
  set( key, textValue );
}
void indri::collection::Repository::merge(const std::string& path, const std::vector<std::string>& inputIndexes) { // Create the directory for the output index _cleanAndCreateDirectory(path); std::string indexPath = indri::file::Path::combine(path, "index"); std::string collectionPath = indri::file::Path::combine(path, "collection"); // First, we're going to harvest information from the individual indexes. We want to // check a few things: // 1. do they all use the same stemmer? // 2. do they all have the same indexed fields? // 3. are they all merged (only have one disk index?) // 4. how many documents are in each one? // If no indexes are given, make an empty repository and return if (inputIndexes.size() == 0) { makeEmpty(path); return; } std::vector<lemur::api::DOCID_T> documentMaximums; // Open up the first repository and extract field information Repository firstRepository; try { firstRepository.openRead(inputIndexes[0]); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::vector<Field> indexFields = firstRepository.fields(); firstRepository.close(); // Open up the first manifest and check on stemming and fields indri::api::Parameters firstManifest; std::string firstManifestPath = indri::file::Path::combine(inputIndexes[0], "manifest"); try { firstManifest.loadFile(firstManifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Merge failed, couldn't find repository: " + inputIndexes[0]); } std::string stemmerName = _stemmerName(firstManifest); std::vector<std::string> fieldNames = _fieldNames(firstManifest); // Now, gather information about the indexes for(size_t i=0; i<inputIndexes.size(); i++) { indri::api::Parameters repositoryManifest; std::string manifestPath = indri::file::Path::combine(inputIndexes[i], "manifest"); try { repositoryManifest.loadFile(manifestPath); } catch(lemur::api::Exception& e) { LEMUR_RETHROW(e, "Couldn't find repository: " + inputIndexes[i]); } if 
(!repositoryManifest.exists("indexes.index")) { documentMaximums.push_back(0); continue; } // Check to make sure there's only one index in there size_t indexCount = repositoryManifest["indexes.index"].size(); if (indexCount > 1) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that have unmerged internal indexes: " + inputIndexes[i]); } // How many documents are in this one? indri::index::DiskIndex diskIndex; std::string basePath = indri::file::Path::combine(inputIndexes[i], "index"); std::string relativePath = i64_to_string((INT64)repositoryManifest["indexes.index"]); diskIndex.open(basePath, relativePath); documentMaximums.push_back(diskIndex.documentMaximum()); diskIndex.close(); // Only check successive indexes against the first one if (i == 0) continue; // Verify that the same fields and stemmers are used if (stemmerName != _stemmerName(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different stemmers: " + inputIndexes[i]); } if (fieldNames != _fieldNames(repositoryManifest)) { LEMUR_THROW(LEMUR_RUNTIME_ERROR, "Cannot merge repositories that use different fields: " + inputIndexes[i]); } } std::vector<std::string> usableIndexes = inputIndexes; // remove any repositories that have no documents for(size_t i=0; i<usableIndexes.size(); i++) { if (documentMaximums[i] == 0) { documentMaximums.erase(documentMaximums.begin() + i); usableIndexes.erase(usableIndexes.begin() + i); i--; } } // now that we've removed empty indexes, are there any left? if (usableIndexes.size() == 0) { makeEmpty(path); return; } // 2. merge the deleted bitmaps _mergeBitmaps(path, usableIndexes, documentMaximums); // 3. merge compressed collections _mergeCompressedCollections(path, usableIndexes, documentMaximums); // 4. merge the indexes _mergeClosedIndexes(path, usableIndexes, indexFields, documentMaximums); // 5. write the manifest file _writeMergedManifest(path, firstManifest); }
// Fetches and decompresses a stored document by ID, reconstructing it as a
// ParsedDocument laid out inside a single decompressed buffer. The returned
// pointer aliases that buffer (which is detached from the local Buffer so it
// survives this call); ownership passes to the caller. Throws LEMUR_IO_ERROR
// if the document ID is not in the lookup table.
indri::api::ParsedDocument* indri::collection::CompressedCollection::retrieve( int documentID ) {
  // Serialize access to the keyfile/storage while we read.
  indri::thread::ScopedLock l( _lock );

  UINT64 offset;
  int actual;
  // Map documentID -> byte offset of its compressed record in _storage.
  if( !_lookup.get( documentID, &offset, actual, sizeof offset ) ) {
    LEMUR_THROW( LEMUR_IO_ERROR, "Unable to find document " + i64_to_string(documentID) + " in the collection." );
  }

  // flush output buffer; make sure all data is on disk
  if( _output )
    _output->flush();

  // decompress the data
  indri::utility::Buffer output;
  z_stream_s stream;
  stream.zalloc = zlib_alloc;
  stream.zfree = zlib_free;
  // NOTE(review): inflateEnd is not called here — presumably
  // zlib_read_document tears the stream down; confirm, or this leaks zlib
  // internal state per retrieve.
  inflateInit( &stream );

  zlib_read_document( stream, _storage, offset, output );
  int decompressedSize = stream.total_out;

  // initialize the buffer as a ParsedDocument: the decompressed record begins
  // with space reserved for the ParsedDocument struct itself, so placement-new
  // one there and null out the fields we fill in below.
  indri::api::ParsedDocument* document = (indri::api::ParsedDocument*) output.front();
  new(document) indri::api::ParsedDocument;
  document->text = 0;
  document->textLength = 0;
  document->content = 0;
  document->contentLength = 0;

  // Record layout after the ParsedDocument header:
  //   [key/value byte data][(keyStart,valueStart) UINT32 pairs][fieldCount quad]
  // get the number of fields (it's stored in the last 4 bytes)
  char* dataStart = output.front() + sizeof(indri::api::ParsedDocument);
  int fieldCount = copy_quad( dataStart + decompressedSize - 4 );
  // endOffset: where the offset-pair array starts (= where key/value data ends).
  int endOffset = decompressedSize - 4 - 2*fieldCount*sizeof(UINT32);
  char* arrayStart = dataStart + endOffset;

  const char* positionData = 0;
  int positionDataLength = 0;

  // store metadata: each entry is a (keyStart, valueStart) pair of offsets
  // into dataStart; a value ends where the next key begins (or at endOffset
  // for the last entry).
  for( int i=0; i<fieldCount; i++ ) {
    int keyStart = copy_quad( arrayStart + 2*i*sizeof(UINT32) );
    int valueStart = copy_quad( arrayStart + (2*i+1)*sizeof(UINT32) );
    int valueEnd;

    if( i==(fieldCount-1) ) {
      valueEnd = endOffset;
    } else {
      valueEnd = copy_quad( arrayStart + 2*(i+1)*sizeof(UINT32) );
    }

    indri::parse::MetadataPair pair;
    pair.key = dataStart + keyStart;
    pair.value = dataStart + valueStart;
    pair.valueLength = valueEnd - valueStart;

    // extract text: the text field points directly into the buffer.
    if( !strcmp( pair.key, TEXT_KEY ) ) {
      document->text = (char*) pair.value;
      document->textLength = pair.valueLength;
    }

    // extract content: stored as a byte offset relative to the text pointer
    // (relies on the TEXT_KEY entry having been seen already).
    if( !strcmp( pair.key, CONTENT_KEY ) ) {
      document->content = document->text + copy_quad( (char*) pair.value );
    }

    // extract content length
    if( !strcmp( pair.key, CONTENTLENGTH_KEY ) ) {
      document->contentLength = copy_quad( (char *)pair.value );
    }

    // remember compressed term-position data for _readPositions below.
    if( !strcmp( pair.key, POSITIONS_KEY ) ) {
      positionData = (char*) pair.value;
      positionDataLength = pair.valueLength;
    }

    document->metadata.push_back( pair );
  }

  // decompress positions
  _readPositions( document, positionData, positionDataLength );

  // Detach so the buffer memory (which `document` points into) outlives this
  // function; the caller is responsible for freeing it.
  output.detach();
  return document;
}