void _mergeData() {
  // fill in tags from the document vector fields; it is unnecessary to
  // filter out any fields to remove, as they will be ignored when indexing.
  indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields();
  tl.clear();

  for (size_t i = 0; i < fields.size(); i++) {
    const indri::index::FieldExtent& field = fields[i];
    std::string fieldName = _index->field(field.id);
    tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin);
    tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end);
  }

  // stuff the tags into the parsed document
  tl.writeTagList(parsed->tags);

  // fill in terms from the document text so that they will stop/stem
  // correctly when added to the new repository.
  // Potential issues with url injection here; probably best not to do
  // this with trecweb/html docs.
  // TODO: test this
  termBuffer.clear();
  termBuffer.grow( parsed->textLength * 2 );

  for (size_t i = 0; i < parsed->positions.size(); i++) {
    int start = parsed->positions[i].begin;
    int end = parsed->positions[i].end;
    int token_len = end - start;
    const char* token = parsed->text + start;

    // copy the token and null-terminate it before storing it in terms
    char* write_loc = termBuffer.write( token_len + 1 );
    strncpy( write_loc, token, token_len );
    write_loc[token_len] = '\0';
    parsed->terms.push_back( write_loc );
  }
}
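// The loop above relies on ParsedDocument positions being [begin, end)
// byte offsets into the raw text, so each token must be copied out and
// null-terminated before it can be stored as a C string in parsed->terms.
// Below is a minimal standalone sketch of that pattern; Span and
// copyTerms are hypothetical names, not repository code.
#include <string>
#include <vector>

struct Span {
  int begin;
  int end;
};

std::vector<std::string> copyTerms( const char* text,
                                    const std::vector<Span>& positions ) {
  std::vector<std::string> terms;
  terms.reserve( positions.size() );
  for( size_t i = 0; i < positions.size(); i++ ) {
    // the substring [begin, end) becomes its own terminated token
    terms.push_back( std::string( text + positions[i].begin,
                                  positions[i].end - positions[i].begin ) );
  }
  return terms;
}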
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile,
                                UINT64 offset, indri::utility::Buffer& outputBuffer ) {
  // read in data from the file until the stream ends,
  // splitting up and decompressing the data as necessary

  // read some data
  char inputBuffer[INPUT_BUFFER_SIZE];

  // reserve space for the ParsedDocument header at the front of the buffer
  outputBuffer.grow( INPUT_BUFFER_SIZE );
  outputBuffer.write( sizeof(indri::api::ParsedDocument) );

  stream.avail_in = 0;
  stream.avail_out = 0;

  while( true ) {
    if( !stream.avail_in ) {
      UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer );
      offset += readSize;

      stream.avail_in = readSize;
      stream.next_in = (Bytef*) inputBuffer;
    }

    stream.avail_out = outputBuffer.size() - outputBuffer.position();
    stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() );

    int result = inflate( &stream, Z_NO_FLUSH );
    outputBuffer.unwrite( stream.avail_out );

    if( result == Z_STREAM_END ) {
      result = inflate( &stream, Z_FINISH );

      if( result < 0 )
        LEMUR_THROW( result, "Something bad happened while trying to finish decompressing a document." );

      inflateEnd( &stream );
      break;
    }

    if( result < 0 ) {
      LEMUR_THROW( result, "Something bad happened while trying to decompress a document." );
    }

    if( stream.avail_out == 0 ) {
      outputBuffer.grow();
    }
  }
}
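// A usage sketch for zlib_read_document (readOneDocument is a hypothetical
// caller, not repository code). The caller owns the z_stream and must
// initialize it with inflateInit() before the call; zlib_read_document
// calls inflateEnd() itself once it sees Z_STREAM_END, so no cleanup is
// needed afterwards. This assumes <zlib.h> and the indri File/Buffer
// headers are already included by this file, as the surrounding code suggests.
void readOneDocument( indri::file::File& infile, UINT64 offset,
                      indri::utility::Buffer& outputBuffer ) {
  z_stream_s stream;
  stream.zalloc = Z_NULL;
  stream.zfree = Z_NULL;
  stream.opaque = Z_NULL;
  stream.next_in = Z_NULL;
  stream.avail_in = 0;
  inflateInit( &stream );

  outputBuffer.clear();
  zlib_read_document( stream, infile, offset, outputBuffer );
  // outputBuffer now holds sizeof(indri::api::ParsedDocument) reserved
  // bytes followed by the decompressed document data.
}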
bool lemur::file::SortMergeTextFiles::_readLine( FILE* _in, char*& beginLine,
                                                 size_t& lineLength,
                                                 indri::utility::Buffer& _buffer ) {
  lineLength = 0;
  size_t actual;

  // make a buffer of a reasonable size so we're not always allocating
  if( _buffer.size() < 1024*1024 )
    _buffer.grow( 1024*1024 );

  // if we're running out of room, add 25MB
  if( (_buffer.size() - _buffer.position()) < 512*1024 ) {
    _buffer.grow( _buffer.size() + 1024*1024*25 );
  }

  if( _buffer.position() ) {
    // get rid of the null terminator from the previous call
    _buffer.unwrite(1);
  }

  size_t readAmount = _buffer.size() - _buffer.position() - 2;

  // fetch next document line
  char* buffer = _buffer.write( readAmount );
  char* result = fgets( buffer, (int)readAmount, _in );

  if( !result ) {
    return false;
  }

  actual = strlen(buffer);
  lineLength += actual;
  _buffer.unwrite( readAmount - actual );

  // all finished reading; null-terminate the line
  *_buffer.write(1) = 0;
  beginLine = _buffer.front() + _buffer.position() - lineLength - 1;

  // strip the \n off the end
  if( beginLine[lineLength-1] == '\n' ) {
    beginLine[lineLength-1] = 0;
  }

  return true;
}
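// A sketch of how _readLine is typically driven (assumed usage, not taken
// from the repository, and assuming _readLine is reachable from the caller;
// the leading underscore suggests it is an implementation detail of
// SortMergeTextFiles). One Buffer is kept alive across calls; each returned
// beginLine points into that buffer, and a later call may grow (reallocate)
// the buffer, so the pointer should be consumed before the next call.
void printAllLines( FILE* in, lemur::file::SortMergeTextFiles& files ) {
  indri::utility::Buffer buffer;
  char* line = 0;
  size_t length = 0;
  while( files._readLine( in, line, length, buffer ) ) {
    printf( "%s\n", line ); // line is null-terminated with the \n stripped
  }
}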
indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
  // find the url metadata pair
  indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter;
  iter = std::find_if( document->metadata.begin(), document->metadata.end(),
                       indri::parse::MetadataPair::key_equal( "url" ) );

  // no need to stick around if there is no url
  if( iter == document->metadata.end() )
    return document;

  // need to copy the url into the buffer so we can parse it in place
  _buffer.clear();
  _buffer.grow( iter->valueLength + 1 );
  char* urlText = _buffer.write( iter->valueLength );
  memcpy( urlText, iter->value, iter->valueLength );
  *_buffer.write(1) = '\0';

  // now we're pointing at the copied urlText, so we can start parsing
  int urlStart = (int)document->terms.size();
  char* c = urlText;
  bool lastSkipped = true;
  int remainingStart = -1;

  // skip the beginning stuff (http://)
  for( c = urlText; *c; c++ ) {
    if( *c == '/' && c[1] && c[1] == '/' ) {
      urlText = c + 2;
    }
  }

  int cnt = 0;

  // now, split the rest of the url into alphanumeric terms;
  // the first '/' marks the end of the domain
  for( c = urlText; *c; c++ ) {
    if( (*c >= 'A' && *c <= 'Z') ||
        (*c >= 'a' && *c <= 'z') ||
        (*c >= '0' && *c <= '9') ) {
      if( lastSkipped ) {
        lastSkipped = false;
        document->terms.push_back( c );
        cnt++;
      }
    } else if( *c == '/' && remainingStart < 0 ) {
      *c = 0;
      lastSkipped = true;
      remainingStart = document->terms.size();
    } else {
      lastSkipped = true;
      *c = 0;
    }
  }

  // put in phony positions entries for the url terms
  int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0;

  for( size_t n = document->terms.size() - cnt; n < document->terms.size(); n++ ) {
    TermExtent extent;
    extent.begin = tokEnd++; // hope this doesn't run off the end
    extent.end = tokEnd;
    document->positions.push_back( extent );
  }

  // the url text is now parsed and stored in the document;
  // all we need to do now is put some tags around the text.
  TagExtent* url = new TagExtent;
  url->begin = urlStart;
  url->end = document->terms.size();
  url->name = "url";
  url->number = 0;
  document->tags.push_back( url );

  TagExtent* domain = new TagExtent;
  domain->begin = urlStart;
  domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size();
  domain->name = "urldomain";
  domain->number = 0;
  document->tags.push_back( domain );

  if( remainingStart > 0 ) {
    TagExtent* urlpath = new TagExtent;
    urlpath->begin = remainingStart;
    urlpath->end = document->terms.size();
    urlpath->name = "urlpath";
    urlpath->number = 0;
    document->tags.push_back( urlpath );
  }

  return document;
}
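// A minimal standalone sketch of the splitting rule used in transform
// above (splitUrl is a hypothetical helper, not repository code, and
// <cctype>, <string>, and <vector> are assumed to be available):
// alphanumeric runs become terms, everything else is a separator, and
// the first '/' after the scheme marks the domain/path boundary. For
// example, "http://www.example.com/path/index.html" yields the terms
// www, example, com, path, index, html, with domainEnd == 3 (the domain
// terms end before "path").
std::vector<std::string> splitUrl( std::string url, size_t& domainEnd ) {
  // skip past the last "//" (e.g. "http://"), as transform does
  size_t start = url.rfind( "//" );
  start = (start == std::string::npos) ? 0 : start + 2;

  std::vector<std::string> terms;
  std::string current;
  bool inPath = false;
  domainEnd = 0;

  // iterate one past the end so the final term is flushed
  for( size_t i = start; i <= url.size(); i++ ) {
    char c = (i < url.size()) ? url[i] : '\0';
    if( isalnum( (unsigned char)c ) ) {
      current += c;
    } else {
      if( !current.empty() ) {
        terms.push_back( current );
        current.clear();
      }
      if( c == '/' && !inPath ) {
        inPath = true;
        domainEnd = terms.size();
      }
    }
  }

  // a url with no path is all domain
  if( !inPath )
    domainEnd = terms.size();
  return terms;
}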
FieldModifier() : fce(NULL), parsed(NULL), _docIter(NULL) { termBuffer.grow(1024*1024); }