void _mergeData() {
  // fill in tags from the document vector fields; there is no need to
  // filter out fields slated for removal, as they are ignored when indexing.
  indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields();

  tl.clear();
  for( size_t i = 0; i < fields.size(); i++ ) {
    const indri::index::FieldExtent& field = fields[i];
    std::string fieldName = _index->field( field.id );
    tl.addTag( fieldName.c_str(), fieldName.c_str(), field.begin );
    tl.endTag( fieldName.c_str(), fieldName.c_str(), field.end );
  }

  // stuff the tags into the parsed document
  tl.writeTagList( parsed->tags );

  // fill in terms from the document text so that they will stop/stem
  // correctly when added to the new repository.
  // There are potential issues with url injection here, so it is
  // probably best not to do this with trecweb/html docs.
  // TODO: test this
  termBuffer.clear();
  termBuffer.grow( parsed->textLength * 2 );

  for( size_t i = 0; i < parsed->positions.size(); i++ ) {
    int start = parsed->positions[i].begin;
    int end = parsed->positions[i].end;
    int token_len = end - start;
    const char* token = parsed->text + start;

    // copy the token into the term buffer and null-terminate it
    char* write_loc = termBuffer.write( token_len + 1 );
    strncpy( write_loc, token, token_len );
    write_loc[token_len] = '\0';
    parsed->terms.push_back( write_loc );
  }
}
void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  assert( length < 100*1000*1000 );
  _writeSkip( buffer, document, length );

  if( data.position() != 0 ) {
    buffer->write( data.front(), data.position() );
    data.clear();
  }
}
void open( const std::string& anchorFile ) {
  _in.close();
  _in.clear();
  _in.open( anchorFile.c_str() );
  _buffer.clear();
  _readDocumentHeader();
}
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) {
  // read in data from the file until the stream ends,
  // splitting up the data as necessary and decompressing it
  // into outputBuffer

  char inputBuffer[INPUT_BUFFER_SIZE];
  outputBuffer.grow( INPUT_BUFFER_SIZE );
  // reserve space at the front of the buffer for the ParsedDocument structure
  outputBuffer.write( sizeof(indri::api::ParsedDocument) );

  stream.avail_in = 0;
  stream.avail_out = 0;

  while( true ) {
    // read some data if the input side of the stream is empty
    if( !stream.avail_in ) {
      UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer );
      offset += readSize;

      stream.avail_in = readSize;
      stream.next_in = (Bytef*) inputBuffer;
    }

    stream.avail_out = outputBuffer.size() - outputBuffer.position();
    stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() );

    int result = inflate( &stream, Z_NO_FLUSH );
    outputBuffer.unwrite( stream.avail_out );

    if( result == Z_STREAM_END ) {
      result = inflate( &stream, Z_FINISH );

      if( result < 0 )
        LEMUR_THROW( result, "Something bad happened while trying to finish decompressing a document." );

      inflateEnd( &stream );
      break;
    }

    if( result < 0 ) {
      LEMUR_THROW( result, "Something bad happened while trying to decompress a document." );
    }

    if( stream.avail_out == 0 ) {
      outputBuffer.grow();
    }
  }
}
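For context, this helper assumes the caller has already set up the z_stream for decompression. A minimal caller-side sketch, hypothetical rather than taken from the original source (the function name readCompressedDocument is illustrative), could look like:

#include <cassert>
#include <zlib.h>

// hypothetical caller: initialize a raw zlib stream, then let
// zlib_read_document drive inflate() until Z_STREAM_END.
static void readCompressedDocument( indri::file::File& infile, UINT64 offset,
                                    indri::utility::Buffer& output ) {
  z_stream_s stream;
  stream.zalloc = Z_NULL;     // use zlib's default allocator
  stream.zfree = Z_NULL;
  stream.opaque = Z_NULL;
  stream.next_in = Z_NULL;
  stream.avail_in = 0;

  int result = inflateInit( &stream );   // must succeed before any inflate() call
  assert( result == Z_OK );

  zlib_read_document( stream, infile, offset, output );  // calls inflateEnd() on completion
}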
indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
  // find the url metadata pair
  indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter;
  iter = std::find_if( document->metadata.begin(), document->metadata.end(),
                       indri::parse::MetadataPair::key_equal( "url" ) );

  // no need to stick around if there is no url
  if( iter == document->metadata.end() )
    return document;

  // need to copy this into the buffer and parse it:
  _buffer.clear();
  _buffer.grow( iter->valueLength + 1 );

  // write() advances the buffer pointer; urlText holds garbage until the memcpy
  char* urlText = _buffer.write( iter->valueLength );
  memcpy( urlText, iter->value, iter->valueLength );
  *_buffer.write(1) = '\0';

  // now we're pointing at the copied urlText, so we can start parsing
  int urlStart = (int)document->terms.size();
  char* c = urlText;
  bool lastSkipped = true;
  bool foundSlash = false;
  int remainingStart = -1;

  // skip the protocol prefix (e.g. http://)
  for( c = urlText; *c; c++ ) {
    if( *c == '/' && c[1] == '/' ) {
      urlText = c + 2;
    }
  }

  int cnt = 0;
  // now, tokenize the rest of the url: alphanumeric runs become terms,
  // and the first '/' marks the end of the domain portion
  for( c = urlText; *c; c++ ) {
    if( (*c >= 'A' && *c <= 'Z') ||
        (*c >= 'a' && *c <= 'z') ||
        (*c >= '0' && *c <= '9') ) {
      if( lastSkipped ) {
        lastSkipped = false;
        document->terms.push_back( c );
        cnt++;
      }
    } else if( *c == '/' && remainingStart < 0 ) {
      *c = 0;
      lastSkipped = true;
      remainingStart = document->terms.size();
    } else {
      lastSkipped = true;
      *c = 0;
    }
  }

  // put in phony positions entries for the url terms
  int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0;
  for( size_t n = document->terms.size()-cnt; n < document->terms.size(); n++ ) {
    TermExtent extent;
    extent.begin = tokEnd++;    // hope this doesn't run off the end
    extent.end = tokEnd;
    document->positions.push_back( extent );
  }

  // the URL text is now parsed and stored in the document;
  // all we need to do now is put some tags around the text.
  TagExtent* url = new TagExtent;
  url->begin = urlStart;
  url->end = document->terms.size();
  url->name = "url";
  url->number = 0;
  document->tags.push_back( url );

  TagExtent* domain = new TagExtent;
  domain->begin = urlStart;
  domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size();
  domain->name = "urldomain";
  domain->number = 0;
  document->tags.push_back( domain );

  if( remainingStart > 0 ) {
    indri::parse::TagExtent* urlpath = new TagExtent;
    urlpath->begin = remainingStart;
    urlpath->end = document->terms.size();
    urlpath->name = "urlpath";
    urlpath->number = 0;
    document->tags.push_back( urlpath );
  }

  return document;
}
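As a concrete illustration of what the tokenization loop above produces, here is a small standalone sketch. The sample URL, variable names, and the use of isalnum in place of the explicit character ranges are illustrative only; it simply mimics the scheme-skip, term, and domain/path logic and prints which terms would fall under the urldomain and urlpath tags.

#include <cctype>
#include <cstdio>
#include <vector>

// prints: www example com (urldomain), then docs index html (urlpath)
int main() {
  char url[] = "http://www.example.com/docs/index.html";
  char* text = url;

  // skip the protocol prefix, as in the transform above
  for( char* c = url; *c; c++ )
    if( c[0] == '/' && c[1] == '/' )
      text = c + 2;

  std::vector<char*> terms;
  int domainEnd = -1;        // index of the first term past the domain
  bool lastSkipped = true;

  for( char* c = text; *c; c++ ) {
    if( isalnum( (unsigned char)*c ) ) {
      if( lastSkipped ) { lastSkipped = false; terms.push_back( c ); }
    } else {
      if( *c == '/' && domainEnd < 0 ) domainEnd = (int)terms.size();
      *c = 0;                // terminate the current term in place
      lastSkipped = true;
    }
  }
  if( domainEnd < 0 ) domainEnd = (int)terms.size();   // no path: everything is domain

  for( size_t i = 0; i < terms.size(); i++ )
    printf( "%s%s\n", terms[i], (int)i < domainEnd ? " (urldomain)" : " (urlpath)" );
  return 0;
}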
bool lemur::file::SortMergeTextFiles::_readLine( FILE* _in, char*& beginLine, size_t& lineLength, indri::utility::Buffer& _buffer ) {
  lineLength = 0;
  size_t actual;

  // make a buffer of a reasonable size so we're not always allocating
  if( _buffer.size() < 1024*1024 )
    _buffer.grow( 1024*1024 );

  // if we're running out of room, add 25MB
  if( (_buffer.size() - _buffer.position()) < 512*1024 ) {
    _buffer.grow( _buffer.size() + 1024*1024*25 );
  }

  if( _buffer.position() ) {
    // get rid of the null terminator from the previous call
    _buffer.unwrite(1);
  }

  size_t readAmount = _buffer.size() - _buffer.position() - 2;

  // fetch the next document line
  char* buffer = _buffer.write( readAmount );
  char* result = fgets( buffer, (int)readAmount, _in );

  if( !result ) {
    return false;
  }

  actual = strlen( buffer );
  lineLength += actual;
  _buffer.unwrite( readAmount - actual );

  // all finished reading
  *_buffer.write(1) = 0;
  beginLine = _buffer.front() + _buffer.position() - lineLength - 1;

  // strip the \n off the end
  if( beginLine[lineLength-1] == '\n' ) {
    beginLine[lineLength-1] = 0;
  }

  return true;
}
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k
  int documentsWritten = 0;

  const float topdocsFraction = 0.01f;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * 0.01) : 0;
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents
  std::priority_queue<DocListIterator::TopDocument,
                      std::vector<DocListIterator::TopDocument>,
                      DocListIterator::TopDocument::greater> topdocs;

  double threshold = 0;
  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();

      // add to document counter
      docs++;
      listDocs++;

      // update the topdocs list
      if( hasTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = documentData->positions.size();

        // compute DocListIterator::TopDocument::greater( current, top() );
        // if false, there is no reason to insert this entry.
        // note that the test is inverted:
        // int(length * threshold) <= count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length,
        // but we use < to force breaking a tie in favor of keeping
        // the first seen document.
        if( int(length * threshold) < count || topdocs.size() < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document, count, length );

          topdocs.push( topDocument );
          while( topdocs.size() > topdocsCount )
            topdocs.pop();

          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }

      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // documents are delta encoded by batch, so reset the base
        lastDocument = 0;
      }

      assert( documentData->document > lastDocument );

      // write this entry out to the list
      stream << documentData->document - lastDocument;
      stream << (int) documentData->positions.size();
      lastDocument = documentData->document;

      // positions are delta encoded within the document
      int lastPosition = 0;
      for( size_t i = 0; i < documentData->positions.size(); i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++;
        listPositions++;
      }

      iterator->nextEntry();
    }

    indri::index::TermData* td = iterator->termData();
    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( topdocs.size() == topdocsCount );

    // write these into the topdocs list in order from smallest fraction
    // to largest fraction, where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();
      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }

    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}
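The posting data written above is delta encoded: document numbers are stored as gaps from the previous document in the current batch (reset to zero after each skip), and positions are stored as gaps within each document, before RVLCompressStream packs each value into a variable-length byte sequence. A minimal sketch of the gap idea on plain vectors, independent of the Indri classes and with illustrative function names:

#include <vector>

// encode a strictly increasing list of document numbers as gaps from the previous value
std::vector<int> deltaEncode( const std::vector<int>& docs ) {
  std::vector<int> gaps;
  int last = 0;
  for( size_t i = 0; i < docs.size(); i++ ) {
    gaps.push_back( docs[i] - last );   // always > 0 for strictly increasing input
    last = docs[i];
  }
  return gaps;
}

// recover the original document numbers from the gap list
std::vector<int> deltaDecode( const std::vector<int>& gaps ) {
  std::vector<int> docs;
  int last = 0;
  for( size_t i = 0; i < gaps.size(); i++ ) {
    last += gaps[i];
    docs.push_back( last );
  }
  return docs;
}

// e.g. {3, 7, 8, 15} encodes to {3, 4, 1, 7}; small gaps compress well
// under a variable-byte scheme like the one RVLCompressStream applies.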
void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
  // now, fetch the additional terms
  char line[65536];
  _buffer.clear();

  for( int i=0; i<_count; i++ ) {
    // LINK
    _in.getline( line, sizeof line-1 );

    // LINKDOCNO
    _in.getline( line, sizeof line-1 );

    // TEXT="..." (line+6 skips the TEXT=" prefix)
    _in.getline( line, sizeof line-1 );

    int textLen = strlen( line+6 );
    strcpy( _buffer.write(textLen+1), line+6 );
    _buffer.unwrite(1);

    assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
  }
  *(_buffer.write(1)) = 0;

  // now there's a bunch of text in _buffer, space separated, with each
  // link separated by a " symbol

  char* beginWord = 0;
  int beginIndex = 0;
  char* buffer = _buffer.front();

  for( unsigned int i=0; i<_buffer.position(); i++ ) {
    if( isalnum(buffer[i]) && !beginWord ) {
      beginWord = buffer+i;
      if( !beginIndex )
        beginIndex = terms.size();
    } else if( isspace(buffer[i]) ) {
      buffer[i] = 0;
      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;
    } else if( buffer[i] == '\"' ) {
      buffer[i] = 0;
      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;

      // close out this link's inlink tag
      TagExtent* extent = new TagExtent;
      extent->name = "inlink";
      extent->begin = beginIndex;
      extent->end = terms.size();
      extent->number = 0;
      extent->parent = 0;
      assert( extent->begin <= extent->end );

      if( beginIndex ) {
        tags.push_back( extent );
        // stop if the anchor text has grown unreasonably large
        if( terms.size() > 125000 )
          break;
      }

      beginIndex = 0;
    }
  }
}
FieldModifier() : fce(NULL), parsed(NULL), _docIter(NULL) {
  termBuffer.grow( 1024*1024 );
}