void indri::file::Path::remove( const std::string& path ) { std::stack<indri::file::DirectoryIterator*> iterators; indri::utility::StackDeleter<indri::file::DirectoryIterator> sd( iterators ); iterators.push( new indri::file::DirectoryIterator( path ) ); while( iterators.size() ) { indri::file::DirectoryIterator* top = iterators.top(); // all done, so go up a level if( (*top) == indri::file::DirectoryIterator::end() ) { // release any search handles that may point // to this directory top->close(); int result = rmdir( top->base().c_str() ); if( result != 0 ) LEMUR_THROW( LEMUR_IO_ERROR, "indri::file::Path::remove couldn't remove directory '" + top->base() + "'." ); delete top; iterators.pop(); continue; } std::string path = **top; (*top)++; if( indri::file::Path::isFile( path ) ) { int result = lemur_compat::remove( path.c_str() ); if( result != 0 ) LEMUR_THROW( LEMUR_IO_ERROR, "indri::file::Path::remove couldn't remove file '" + path + "'." ); } else { iterators.push( new indri::file::DirectoryIterator( path ) ); } } }
void indri::file::Path::rename( const std::string& oldName, const std::string& newName ) { #ifndef WIN32 int result = ::rename( oldName.c_str(), newName.c_str() ); if( result != 0 ) { if( errno == EEXIST ) { LEMUR_THROW( LEMUR_IO_ERROR, "The destination file already exists: " + oldName ); } else if( errno == EACCES || errno == EPERM ) { LEMUR_THROW( LEMUR_IO_ERROR, "Insufficient permissions to rename: '" + oldName + "' to '" + newName + "'." ); } else { LEMUR_THROW( LEMUR_IO_ERROR, "Unable to rename: '" + oldName + "' to '" + newName + "'." ); } } #else BOOL result; if( Path::exists( newName ) ) { result = ReplaceFile( newName.c_str(), oldName.c_str(), NULL, REPLACEFILE_IGNORE_MERGE_ERRORS, NULL, NULL ); } else { result = MoveFile( oldName.c_str(), newName.c_str() ); } if( !result ) { LEMUR_THROW( LEMUR_IO_ERROR, "Unable to rename: '" + oldName + "' to '" + newName + "'." ); } #endif }
bool listen( unsigned int port ) { int result; lemur_compat::initializeNetwork(); _socket = ::socket( AF_INET, SOCK_STREAM, 0 ); sockaddr_in sa; sa.sin_addr.s_addr = INADDR_ANY; sa.sin_port = htons(port); sa.sin_family = AF_INET; memset( &sa.sin_zero, 0, sizeof sa.sin_zero ); result = ::bind( _socket, (const sockaddr*) &sa, sizeof sa ); if( result ) { close(); LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to bind port " + i64_to_string(port) ); } result = ::listen( _socket, 8 ); if( result ) { close(); LEMUR_THROW( LEMUR_IO_ERROR, "Wasn't able to listen on port " + i64_to_string(port) ); } return true; }
/**
 * Parses the tags found in buffer[start, end) into XMLNode objects.
 *
 * When *parent is non-null every node found at this level is added to it
 * as a child. When *parent is null the first node found becomes *parent
 * and parsing stops.
 *
 * @param parent in/out pointer to the node that receives the parsed nodes.
 * @param buffer text being parsed.
 * @param start  offset at which to start looking for tags.
 * @param end    offset one past the last character to consider.
 * @throws LEMUR_GENERIC_ERROR (via LEMUR_THROW) on an unexpected close tag
 *         or when an open tag has no matching close tag.
 */
void indri::xml::XMLReader::_read( indri::xml::XMLNode** parent, const char* buffer, int start, int end ) {
  int tagType;
  int position = _tryFindBeginTag( buffer, start, end );

  while( position >= 0 ) {
    std::string tagName;
    std::map<std::string, std::string> attributes;
    indri::xml::XMLNode* node;
    int resumeAt;

    int afterTag = _readTag( buffer, position, end, &tagName, &attributes, &tagType );

    if( tagType == TAG_CLOSE_TYPE ) {
      LEMUR_THROW( LEMUR_GENERIC_ERROR, "Found a close tag for '" + tagName + "' while looking for an open tag." );
    }

    if( tagType == TAG_OPEN_TYPE ) {
      bool hasInnerTags;
      int closeStart = _findClosingTag( buffer, afterTag, end, tagName, &hasInnerTags );

      if( closeStart == -1 ) {
        LEMUR_THROW( LEMUR_GENERIC_ERROR, "Could not find a close tag for '" + tagName + "'");
      }

      if( hasInnerTags ) {
        // the element contains tags of its own: parse them as children
        node = new indri::xml::XMLNode( tagName, attributes );
        _read( &node, buffer, afterTag, closeStart );
      } else {
        // a leaf element: everything between the tags is its value
        std::string text( &buffer[afterTag], &buffer[closeStart] );

        // munch any CDATA markers in the element's value.
        // NOTE(review): standard XML spells the opener "<![CDATA[" — confirm
        // that "<!CDATA[" is the intended marker. The "]]>" search also
        // restarts from the beginning of the value each time.
        for( std::string::size_type opener = text.find("<!CDATA[");
             opener != std::string::npos;
             opener = text.find("<!CDATA[") ) {
          text.erase( opener, 8 );

          std::string::size_type closer = text.find("]]>");
          // an unterminated marker is silently accepted; arguably this
          // should throw
          if( closer != std::string::npos ) {
            text.erase( closer, 3 );
          }
        }

        node = new indri::xml::XMLNode( tagName, attributes, text );
      }

      resumeAt = _findEndTag( buffer, closeStart, end ) + 1;
    } else {
      assert( tagType == TAG_OPEN_CLOSE_TYPE );
      node = new indri::xml::XMLNode( tagName, attributes );
      resumeAt = afterTag;
    }

    if( !*parent ) {
      // no parent was supplied: this node is the result, so stop here
      *parent = node;
      break;
    }

    (*parent)->addChild( node );
    position = _tryFindBeginTag( buffer, resumeAt, end );
  }
}
/**
 * Creates the directory `path` with mode 0777.
 *
 * @param path directory to create.
 * @throws LEMUR_IO_ERROR (via LEMUR_THROW) whenever mkdir fails. EACCES,
 *         ENOENT and EEXIST get specific messages; any other errno is
 *         reported with a generic message instead of being ignored.
 */
void indri::file::Path::create( const std::string& path ) {
  if( lemur_compat::mkdir( path.c_str(), 0777 ) < 0 ) {
    if( errno == EACCES ) {
      LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't create directory: '" + path + "' because of inadequate permissions." );
    } else if( errno == ENOENT ) {
      LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't create directory: '" + path + "' because at least one of the parent directories does not exist." );
    } else if( errno == EEXIST ) {
      LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't create directory: '" + path + "' because something already exists there." );
    } else {
      // previously any other failure returned as if the directory had
      // been created
      LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't create directory: '" + path + "'." );
    }
  }
}
int indri::xml::XMLReader::_findNotName( const char* buffer, int start, int finish ) { int i; for( i=start; i<finish; i++ ) { // this isn't unicode-safe, but it should be good for now #ifndef WIN32 if( !isalpha(buffer[i]) && !isdigit(buffer[i]) && #else if( (buffer[i] >= 0 && !isalpha(buffer[i])) && (buffer[i] >= 0 && !isdigit(buffer[i])) && #endif buffer[i] != '-' && buffer[i] != '_' && buffer[i] != ':' && buffer[i] != '.' ) { break; } } if( i==finish ) LEMUR_THROW( LEMUR_PARSE_ERROR, "Was looking for the end of a tag name, but couldn't find it." ); return i; }
/**
 * Throwing wrapper around _tryFindText().
 *
 * @param buffer text being scanned.
 * @param start  offset at which to start.
 * @param finish end offset of the scan.
 * @return the offset reported by _tryFindText().
 * @throws LEMUR_GENERIC_ERROR (via LEMUR_THROW) when _tryFindText()
 *         returns `finish`, which is treated as "no text found".
 */
int indri::xml::XMLReader::_findText( const char* buffer, int start, int finish ) {
  const int position = _tryFindText( buffer, start, finish );

  if( position == finish ) {
    LEMUR_THROW( LEMUR_GENERIC_ERROR, "Was looking for text, but couldn't find any" );
  }

  return position;
}
/**
 * Feeds the pending input in `stream` through deflate(), requesting output
 * space from `outfile` in OUTPUT_BUFFER_SIZE pieces until all input has
 * been consumed.
 *
 * @param stream  zlib stream whose next_in/avail_in describe the input.
 * @param outfile buffer that supplies the output space.
 * @throws LEMUR_IO_ERROR (via LEMUR_THROW) if deflate() returns anything
 *         other than Z_OK.
 */
static void zlib_deflate( z_stream_s& stream, indri::file::SequentialWriteBuffer* outfile ) {
  // nothing to do...
  if( stream.avail_in == 0 )
    return;

  if( stream.avail_out == 0 ) {
    stream.next_out = (Bytef*) outfile->write( OUTPUT_BUFFER_SIZE );
    stream.avail_out = OUTPUT_BUFFER_SIZE;
  }

  // Z_NO_FLUSH is the named form of the flush value 0
  for( int status = deflate( &stream, Z_NO_FLUSH ); ; status = deflate( &stream, Z_NO_FLUSH ) ) {
    if( status != Z_OK ) {
      LEMUR_THROW( LEMUR_IO_ERROR, "Tried to add a document to the collection, but zlib returned an error" );
    }

    // all input consumed without error: done (common case on first pass)
    if( stream.avail_in == 0 )
      break;

    // input remains, so get more space and go around again
    stream.next_out = (Bytef*) outfile->write( OUTPUT_BUFFER_SIZE );
    stream.avail_out = OUTPUT_BUFFER_SIZE;
  }
}
/**
 * Tokenizes, parses and indexes a document held in memory.
 *
 * @param documentString full text of the document.
 * @param fileClass      file class used to select the parsing context.
 * @param metadata       metadata pairs attached to the document.
 * @return the document ID returned by the repository.
 * @throws LEMUR_RUNTIME_ERROR (via LEMUR_THROW) if no parser is available
 *         for `fileClass`.
 */
lemur::api::DOCID_T indri::api::IndexEnvironment::addString( const std::string& documentString, const std::string& fileClass, const std::vector<indri::parse::MetadataPair>& metadata ) {
  indri::parse::UnparsedDocument document;

  _documentsSeen++;

  // the document borrows the caller's string storage; textLength includes
  // the terminating null, contentLength does not
  document.text = documentString.c_str();
  document.textLength = documentString.length() + 1;
  document.metadata = metadata;
  document.content = document.text;
  document.contentLength = document.textLength - 1;

  indri::parse::Parser* parser;
  indri::parse::Tokenizer* tokenizer;
  indri::parse::DocumentIterator* iterator;
  indri::parse::Conflater* conflater;
  _getParsingContext( &parser, &tokenizer, &iterator, &conflater, fileClass );

  if( parser == 0 ) {
    LEMUR_THROW( LEMUR_RUNTIME_ERROR, "File class '" + fileClass + "' wasn't recognized." );
  }

  indri::parse::TokenizedDocument* tokenized = tokenizer->tokenize( &document );
  ParsedDocument* parsed = parser->parse( tokenized );
  lemur::api::DOCID_T documentID = _repository.addDocument( parsed );

  _documentsIndexed++;

  if( _callback ) {
    // the status callback is given an empty string for this event
    std::string nothing;
    (*_callback)( indri::api::IndexStatus::DocumentCount, nothing, _error, _documentsIndexed, _documentsSeen );
  }

  return documentID;
}
void indri::api::Parameters::loadFile( const std::string& filename ) { std::ifstream input; indri::xml::XMLReader reader; input.open( filename.c_str(), std::ifstream::in ); if( input.rdstate() & std::ios::failbit ) LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open parameter file '" + filename + "' for reading." ); input.seekg( 0, std::ios::end ); size_t length = input.tellg(); input.seekg( 0, std::ios::beg ); // null terminate it to make a string in the XML reader for comment strip char* buffer = new char[length + 1]; buffer[length] = '\0'; try { input.read( buffer, length ); std::auto_ptr<indri::xml::XMLNode> result( reader.read( buffer, length ) ); _loadXML( result.get() ); } catch( lemur::api::Exception& e ) { LEMUR_RETHROW( e, "Had trouble parsing parameter file '" + filename + "'" ); } delete[] buffer; input.close(); }
/**
 * Adds a parsed document to the repository.
 *
 * The document is run through every registered transformation, added to
 * the last index of the current index state and, if requested, also
 * stored in the collection.
 *
 * @param document     the parsed document; may be replaced by the
 *                     transformations.
 * @param inCollection when true the document is also added to _collection.
 * @return the document ID assigned by the memory index.
 * @throws LEMUR_RUNTIME_ERROR (via LEMUR_THROW) if the repository is
 *         read-only.
 */
int indri::collection::Repository::addDocument(indri::api::ParsedDocument* document, bool inCollection) {
  if (_readOnly) {
    LEMUR_THROW(LEMUR_RUNTIME_ERROR, "addDocument: Cannot add documents to a repository that is opened for read-only access.");
  }

  // hold off while the repository reports that it is thrashing
  while (_thrashing) {
    indri::thread::Thread::sleep(100);
  }

  indri::thread::ScopedLock addGuard(_addLock);

  for (size_t t = 0; t < _transformations.size(); t++) {
    document = _transformations[t]->transform(document);
  }

  // take a copy of the current index state while holding the state lock
  index_state state;
  {
    indri::thread::ScopedLock stateGuard(_stateLock);
    state = _active;
  }

  // the last index in the state is expected to be the in-memory one
  indri::index::MemoryIndex* memoryIndex = dynamic_cast<indri::index::MemoryIndex*>(state->back());
  int documentID = memoryIndex->addDocument(*document);

  if (inCollection) {
    _collection->addDocument(documentID, document);
  }

  _countDocumentAdd();
  return documentID;
}
/**
 * Re-targets the inference network at a new index.
 *
 * Opens term, field and prior iterators for every registered name, then
 * notifies all list-iterator, belief and evaluator nodes (and the document
 * structure holder, if there is one) of the new index.
 *
 * @param index the index that is now current.
 * @throws LEMUR_RUNTIME_ERROR (via LEMUR_THROW) if a named prior does not
 *         exist in the repository.
 */
void indri::infnet::InferenceNetwork::_indexChanged( indri::index::Index& index ) {
  _closeIterators.clear();
  _closeIteratorBound = -1;

  // doc iterators (a null iterator is still recorded, to keep positions
  // aligned with _termNames)
  for( size_t t = 0; t < _termNames.size(); t++ ) {
    indri::index::DocListIterator* docs = index.docListIterator( _termNames[t] );
    if( docs ) {
      docs->startIteration();
    }
    _docIterators.push_back( docs );
  }

  // field iterators
  for( size_t f = 0; f < _fieldNames.size(); f++ ) {
    indri::index::DocExtentListIterator* extents = index.fieldListIterator( _fieldNames[f] );
    if( extents ) {
      extents->startIteration();
    }
    _fieldIterators.push_back( extents );
  }

  // prior iterators
  // TODO: this is wasteful, since the prior is associated with the whole
  // collection, there's no need to fetch it for each index. but, it's just
  // easier to code it like this for now
  for( size_t p = 0; p < _priorNames.size(); p++ ) {
    indri::collection::PriorListIterator* prior = _repository.priorListIterator( _priorNames[p] );

    if( !prior ) {
      // if the named prior doesn't exist in the Repository, throw an Exception
      LEMUR_THROW( LEMUR_RUNTIME_ERROR, "named prior: " + _priorNames[p] + " not found in Repository. Unable to process query." );
    }

    prior->startIteration();
    _priorIterators.push_back( prior );
  }

  // extent iterator nodes
  for( std::vector<ListIteratorNode*>::iterator node = _listIteratorNodes.begin();
       node != _listIteratorNodes.end();
       ++node ) {
    (*node)->indexChanged( index );
  }

  // belief nodes
  for( std::vector<BeliefNode*>::iterator node = _beliefNodes.begin();
       node != _beliefNodes.end();
       ++node ) {
    (*node)->indexChanged( index );
  }

  // evaluator nodes
  for( std::vector<indri::infnet::EvaluatorNode*>::iterator node = _evaluators.begin();
       node != _evaluators.end();
       ++node ) {
    (*node)->indexChanged( index );
  }

  // document structure
  if( _documentStructureHolderNode != 0 ) {
    _documentStructureHolderNode->indexChanged( index );
  }
}
/**
 * Creates a DocumentIterator for the given type name.
 *
 * The name is first mapped through preferredName(). Tagged iterators are
 * configured with the supplied tags; the Word and PowerPoint extractors
 * are only available in WIN32 builds.
 *
 * @param type             iterator type name.
 * @param startDocTag      tag passed to TaggedDocumentIterator::setTags().
 * @param endDocTag        tag passed to TaggedDocumentIterator::setTags().
 * @param startMetadataTag tag passed to TaggedDocumentIterator::setTags().
 * @return a newly allocated iterator.
 * @throws LEMUR_RUNTIME_ERROR (via LEMUR_THROW) for an unknown type.
 */
indri::parse::DocumentIterator* indri::parse::DocumentIteratorFactory::get( const std::string& type, const char* startDocTag, const char* endDocTag, const char* startMetadataTag ) {
  const std::string preferred = preferredName( type );

  if( preferred == TYPE_TAGGED ) {
    indri::parse::TaggedDocumentIterator* tagged = new indri::parse::TaggedDocumentIterator();
    tagged->setTags( startDocTag, endDocTag, startMetadataTag );
    return tagged;
  }

  if( preferred == TYPE_WARC )
    return new indri::parse::WARCDocumentIterator();

  if( preferred == TYPE_PDF )
    return new indri::parse::PDFDocumentExtractor();

  if( preferred == TYPE_TEXT )
    return new indri::parse::TextDocumentExtractor();

  if( preferred == TYPE_MBOX )
    return new indri::parse::MboxDocumentIterator();

#ifdef WIN32
  if( preferred == TYPE_WORD )
    return new indri::parse::WordDocumentExtractor();

  if( preferred == TYPE_PPT )
    return new indri::parse::PowerPointDocumentExtractor();
#endif

  LEMUR_THROW( LEMUR_RUNTIME_ERROR, type + " is an unknown DocumentIterator type." );

  // not reached; the throw above always leaves the function
  return 0;
}
/**
 * Opens `filename` for reading through gzopen() and remembers its name.
 *
 * @param filename path of the file to open.
 * @throws LEMUR_IO_ERROR (via LEMUR_THROW) if gzopen() fails; _filename
 *         has already been updated at that point.
 */
void indri::parse::TextDocumentExtractor::open( const std::string& filename ) {
  _filename = filename;
  _in = gzopen( filename.c_str(), "rb" );

  if( _in )
    return;

  LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open file " + filename + "." );
}
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) { // read in data from the file until the stream ends // split up the data as necessary // decompress positional info // read some data char inputBuffer[INPUT_BUFFER_SIZE]; outputBuffer.grow( INPUT_BUFFER_SIZE ); outputBuffer.write( sizeof(indri::api::ParsedDocument) ); stream.avail_in = 0; stream.avail_out = 0; while(true) { if( !stream.avail_in ) { UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer ); offset += readSize; stream.avail_in = readSize; stream.next_in = (Bytef*) inputBuffer; } stream.avail_out = outputBuffer.size() - outputBuffer.position(); stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() ); int result = inflate( &stream, Z_NO_FLUSH ); outputBuffer.unwrite( stream.avail_out ); if( result == Z_STREAM_END ) { result = inflate( &stream, Z_FINISH ); if( result < 0 ) LEMUR_THROW( result, "Something bad happened while trying to finish decompressing a document." ); inflateEnd( &stream ); break; } if( result < 0 ) { LEMUR_THROW( result, "Something bad happened while trying to decompress a document." ); } if( stream.avail_out == 0 ) { outputBuffer.grow(); } } }
/**
 * Loads the contents of an XML node into this Parameters object.
 *
 * A node without children sets this object's value to the node's value.
 * Otherwise each child is loaded under its own name: names that occur more
 * than once, that already hold more than one value, or whose element
 * carries append="true" are appended; all others are set or overwritten.
 *
 * This method should only be called on table nodes.
 *
 * @param node XML node to load.
 * @throws LEMUR_BAD_PARAMETER_ERROR (via LEMUR_THROW) if `node` is NULL.
 */
void indri::api::Parameters::_loadXML( indri::xml::XMLNode* node ) {
  if (node == NULL) {
    LEMUR_THROW(LEMUR_BAD_PARAMETER_ERROR, "NO XML in parameter text" );
  }

  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();

  std::set<std::string> seen;     // every child name present in the XML
  std::set<std::string> arrays;   // names that appear more than once
  std::set<std::string> appends;  // names whose values must be appended

  // find out which ones are in the XML file, and which ones appear
  // multiple times
  for( size_t c = 0; c < children.size(); c++ ) {
    indri::xml::XMLNode* child = children[c];
    const std::string name = child->getName();

    // insert() reports whether the name was new; a repeat marks an array
    if( !seen.insert(name).second )
      arrays.insert(name);

    if( child->getAttribute( "append" ) == "true" )
      appends.insert(name);
  }

  // delete current items that don't need to be here
  for( std::set<std::string>::iterator it = seen.begin(); it != seen.end(); ++it ) {
    const std::string& name = *it;

    if( appends.find(name) != appends.end() )
      continue;

    const bool holdsSeveral = exists(name) && get(name).size() > 1;

    if( holdsSeveral || arrays.find(name) != arrays.end() ) {
      remove(name);
      appends.insert(name);
    }
  }

  if( children.empty() ) {
    // a leaf node: its text is this object's value
    set( node->getValue() );
    return;
  }

  for( size_t c = 0; c < children.size(); c++ ) {
    indri::xml::XMLNode* child = children[c];
    const std::string name = child->getName();

    if( appends.find(name) != appends.end() ) {
      append(name)._loadXML(child);
    } else {
      if( !exists(name) )
        set(name, "");

      get(name)._loadXML(child);
    }
  }
}
/**
 * Opens `filename` on the internal input stream and remembers its name.
 *
 * The stream's state flags are cleared before opening.
 *
 * @param filename path of the file to open.
 * @throws LEMUR_IO_ERROR (via LEMUR_THROW) if the stream is not good()
 *         after opening; _filename has already been updated at that point.
 */
void indri::parse::MboxDocumentIterator::open( const std::string& filename ) {
  _in.clear();
  _in.open( filename.c_str() );
  _filename = filename;

  if( _in.good() )
    return;

  LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open file " + filename + "." );
}
/**
 * Reads from `stream` until the reply is done, the stream is no longer
 * alive, or an exception message has been recorded.
 *
 * @param stream network stream to read messages from.
 * @throws LEMUR_NETWORK_ERROR (via LEMUR_THROW) carrying the recorded
 *         exception text, if there is any.
 */
void indri::net::XMLReplyReceiver::wait( indri::net::NetworkMessageStream* stream ) {
  for(;;) {
    if( done() || !stream->alive() || _exception.size() )
      break;

    stream->read(*this);
  }

  if( _exception.size() ) {
    LEMUR_THROW( LEMUR_NETWORK_ERROR, _exception );
  }
}
/**
 * Throwing wrapper around _tryFindChar().
 *
 * @param ch     character to look for.
 * @param buffer text being scanned.
 * @param start  offset at which to start.
 * @param finish end offset of the scan.
 * @return the offset reported by _tryFindChar().
 * @throws LEMUR_PARSE_ERROR (via LEMUR_THROW) when _tryFindChar()
 *         returns -1.
 */
int indri::xml::XMLReader::_findChar( char ch, const char* buffer, int start, int finish ) {
  const int position = _tryFindChar( ch, buffer, start, finish );

  if( position == -1 ) {
    // start from a std::string so that "+ ch" appends the character
    LEMUR_THROW( LEMUR_PARSE_ERROR, std::string("Was looking for '") + ch + "', but couldn't find it." );
  }

  return position;
}
/**
 * Throwing wrapper around _tryFindBeginTag().
 *
 * @param buffer text being scanned.
 * @param start  offset at which to start.
 * @param finish end offset of the scan.
 * @return the offset reported by _tryFindBeginTag().
 * @throws LEMUR_PARSE_ERROR (via LEMUR_THROW) when _tryFindBeginTag()
 *         returns -1.
 */
int indri::xml::XMLReader::_findBeginTag( const char* buffer, int start, int finish ) {
  const int position = _tryFindBeginTag( buffer, start, finish );

  if( position == -1 ) {
    LEMUR_THROW( LEMUR_PARSE_ERROR, "Ran off the end of a buffer while looking for a begin tag" );
  }

  return position;
}
/**
 * Reads from the stream until the response is complete, the stream is no
 * longer alive, or an exception message has been recorded, then returns
 * the collected results.
 *
 * @return reference to the accumulated results.
 * @throws LEMUR_RUNTIME_ERROR (via LEMUR_THROW) carrying the recorded
 *         exception text, if there is any.
 */
indri::infnet::InferenceNetwork::MAllResults& getResults() {
  for(;;) {
    if( _done || !_stream->alive() || _exception.length() )
      break;

    _stream->read(*this);
  }

  if( _exception.length() ) {
    LEMUR_THROW( LEMUR_RUNTIME_ERROR, _exception );
  }

  return _results;
}
/**
 * Builds the keyfile handle and creates a new keyfile.
 *
 * @param filename  path of the keyfile to create.
 * @param cacheSize value passed to _buildHandle().
 * @throws LEMUR_KEYFILE_IO_ERROR (via LEMUR_THROW) if create_key()
 *         returns a non-zero error.
 */
void lemur::file::Keyfile::create( const char* filename, int cacheSize ) {
  _buildHandle( cacheSize );

  // create_key() takes a non-const char*, hence the cast
  if( create_key( _handle, const_cast<char*>(filename), _handleSize ) == 0 )
    return;

  LEMUR_THROW(LEMUR_KEYFILE_IO_ERROR, std::string("Unable to create '") + filename + "'");
}
/**
 * Builds the keyfile handle and opens an existing keyfile.
 *
 * @param filename  path of the keyfile to open.
 * @param cacheSize value passed to _buildHandle().
 * @param readOnly  passed to open_key() as 1 when true, 0 otherwise.
 * @throws LEMUR_KEYFILE_IO_ERROR (via LEMUR_THROW) if open_key() returns
 *         a non-zero error.
 */
void lemur::file::Keyfile::open( const char* filename, int cacheSize, bool readOnly) {
  _buildHandle( cacheSize );

  const int readOnlyFlag = readOnly ? 1 : 0;

  // open_key() takes a non-const char*, hence the cast
  if( open_key( _handle, const_cast<char*>(filename), _handleSize, readOnlyFlag ) == 0 )
    return;

  LEMUR_THROW(LEMUR_KEYFILE_IO_ERROR, std::string("Unable to open '") + filename + "'");
}
void lemur::file::Keyfile::remove( const char* key ) { assert( key && "key cannot be null" ); assert( _handle && "call open() or create() first" ); int len = strlen(key); // fix for UTF-8 int error = delete_rec( _handle, const_cast<char*>(key), len ); if( error ) LEMUR_THROW( LEMUR_KEYFILE_IO_ERROR, "Unable to delete record for key: " + key ); }
/**
 * Returns the sub-parameter stored under `name`.
 *
 * @param name name (path) of the required parameter.
 * @return a Parameters object wrapping the value found at that path.
 * @throws LEMUR_IO_ERROR (via LEMUR_THROW) if the parameter does not
 *         exist. Builds with assertions enabled hit the assert first.
 */
indri::api::Parameters indri::api::Parameters::get( const std::string& name ) {
  assert( exists(name) );

  if( !exists(name) ) {
    LEMUR_THROW( LEMUR_IO_ERROR, "Required parameter '" + name + "' was not specified." );
  }

  return indri::api::Parameters( _getPath( name, _getRoot() ) );
}
void indri::parse::WARCDocumentIterator::open( const std::string& filename ) { _gzin = gzopen(filename.c_str(), "rb"); if( !_gzin) LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open file " + filename + "." ); _record = new WARCRecord(_gzin, _buffer); // Consume the first WARC record (type warcinfo) // verify the WARC-Type is warcinfo // if not, it's a partial file or broken... bleah. bool read = _record->readRecord(); if ( ! read ) { LEMUR_THROW(LEMUR_IO_ERROR, "Bad WARC file." ); } std::string warcType = _record->getWarcType(); if ( warcType != "warcinfo" ) { LEMUR_THROW(LEMUR_IO_ERROR, "Bad WARC file." ); } _warcUUID = _record->getUUID(); // file pointer is now positioned to read the first response record. }
void lemur::file::SortMergeTextFiles::_doSingleFileMergesort(std::string &inputFile, std::string &outputFile, std::vector<std::string> &chunkList, int chunkRecordSize) { // our in-memory chunks std::vector<std::string> inMemoryChunk; inMemoryChunk.reserve(chunkRecordSize); int currentChunkRecord=0; // clear the input buffer _inputBuffer.clear(); FILE* _in; _in = fopen( inputFile.c_str(), "rb" ); if( !_in ) { LEMUR_THROW( LEMUR_IO_ERROR, "Couldn't open file " + inputFile + "." ); } // reset the buffer size // setvbuf(_in, NULL, _IOFBF, 65536); setvbuf(_in, NULL, _IOFBF, 5*1024*1024); std::vector<std::string> outputChunks; int countInputRecords=0; char* thisLine; size_t lineLength; while (_readLine(_in, thisLine, lineLength, _inputBuffer)) { if (currentChunkRecord==chunkRecordSize) { chunkList.push_back(_flushChunks(outputFile, &inMemoryChunk, chunkList.size())); inMemoryChunk.clear(); currentChunkRecord=0; } // straight fill-up the buffer if ((lineLength > 0) && (thisLine)) { inMemoryChunk.push_back(std::string(thisLine)); // increment our counters ++currentChunkRecord; ++countInputRecords; } } if (currentChunkRecord > 0) { chunkList.push_back(_flushChunks(outputFile, &inMemoryChunk, chunkList.size())); } // close the input file, we're done with it fclose(_in); // now, merge sort the chunks // _doFinalMergesortFiles(outputChunks, outputFile); }
void indri::index::DeletedDocumentList::read( const std::string& filename ) { indri::file::File file; if( !file.openRead( filename ) ) LEMUR_THROW( LEMUR_IO_ERROR, "Unable to open file: " + filename ); UINT64 fileSize = file.size(); _bitmap.clear(); file.read( _bitmap.write( fileSize ), 0, fileSize ); file.close(); }
void indri::index::DeletedDocumentList::write( const std::string& filename ) { indri::file::File file; if( indri::file::Path::exists( filename ) ) indri::file::Path::remove( filename ); if( !file.create( filename ) ) LEMUR_THROW( LEMUR_IO_ERROR, "Unable to create file: " + filename ); file.write( _bitmap.front(), 0, _bitmap.position() ); file.close(); }
void lemur::file::Keyfile::put( const char* key, const void* value, int valueSize ) { assert( key && "key cannot be null" ); assert( value && "value cannot be null" ); int len = strlen(key); // fix for UTF-8 int error = put_rec( _handle, const_cast<char*>(key), len, static_cast<char*>(const_cast<void*>(value)), valueSize ); if( error ) LEMUR_THROW( LEMUR_KEYFILE_IO_ERROR, "Caught an internal error while putting record for key: " + key ); }