void open( const std::string& anchorFile ) {
  _in.close();
  _in.clear();
  _in.open( anchorFile.c_str() );
  _buffer.clear();
  _readDocumentHeader();
}
void _mergeData() {
  // fill in tags from the document vector fields.
  // unnecessary to filter out any fields to remove, as they will
  // be ignored when indexing.
  indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields();
  tl.clear();

  for( size_t i = 0; i < fields.size(); i++ ) {
    const indri::index::FieldExtent& field = fields[i];
    std::string fieldName = _index->field( field.id );
    tl.addTag( fieldName.c_str(), fieldName.c_str(), field.begin );
    tl.endTag( fieldName.c_str(), fieldName.c_str(), field.end );
  }
  // stuff it into the parsed doc
  tl.writeTagList( parsed->tags );

  // fill in terms from the document text so that they will stop/stem
  // correctly when added to the new repository.
  // potential issues with url injection here...
  // probably best not to do this with trecweb/html docs.
  // TODO: test this
  termBuffer.clear();
  termBuffer.grow( parsed->textLength * 2 );

  for( size_t i = 0; i < parsed->positions.size(); i++ ) {
    int start = parsed->positions[i].begin;
    int end = parsed->positions[i].end;
    int token_len = end - start;
    const char* token = parsed->text + start;

    // copy the token into termBuffer and null-terminate it;
    // parsed->terms keeps pointers into termBuffer, so the buffer
    // must outlive the parsed document.
    char* write_loc = termBuffer.write( token_len + 1 );
    strncpy( write_loc, token, token_len );
    write_loc[token_len] = '\0';
    parsed->terms.push_back( write_loc );
  }
}
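// Illustrative trace of the tag-merging step above (the field name and
// extents are hypothetical, chosen only to show the call pattern): a
// FieldExtent such as { id: <id of "title">, begin: 0, end: 5 } turns into
// a matched open/close pair on the tag list,
//
//   tl.addTag( "title", "title", 0 );   // open <title> at token 0
//   tl.endTag( "title", "title", 5 );   // close it at token 5
//
// and tl.writeTagList( parsed->tags ) then flattens the collected
// open/close events into the tag extents stored on the parsed document.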
void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  assert( length < 100*1000*1000 );
  _writeSkip( buffer, document, length );

  if( data.position() != 0 ) {
    buffer->write( data.front(), data.position() );
    data.clear();
  }
}
indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
  // find the url metadata pair
  indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter;
  iter = std::find_if( document->metadata.begin(),
                       document->metadata.end(),
                       indri::parse::MetadataPair::key_equal( "url" ) );

  // no need to stick around if there is no url
  if( iter == document->metadata.end() )
    return document;

  // need to copy this into the buffer and parse it:
  _buffer.clear();
  _buffer.grow( iter->valueLength + 1 );

  // write() advances the buffer pointer; urlText holds garbage until
  // the memcpy below fills it in
  char* urlText = _buffer.write( iter->valueLength );
  memcpy( urlText, iter->value, iter->valueLength );
  *_buffer.write(1) = '\0';

  // now we're pointing at the copied urlText, so we can start parsing
  int urlStart = (int)document->terms.size();
  char* c = urlText;
  bool lastSkipped = true;
  int remainingStart = -1;

  // skip past the scheme separator (e.g. the "//" in "http://")
  for( c = urlText; *c; c++ ) {
    if( *c == '/' && c[1] == '/' ) {
      urlText = c + 2;
    }
  }

  int cnt = 0;
  // now, tokenize the rest of the url: each alphanumeric run becomes a
  // term, and the first '/' marks where the domain ends and the path begins
  for( c = urlText; *c; c++ ) {
    if( (*c >= 'A' && *c <= 'Z') ||
        (*c >= 'a' && *c <= 'z') ||
        (*c >= '0' && *c <= '9') ) {
      if( lastSkipped ) {
        lastSkipped = false;
        document->terms.push_back( c );
        cnt++;
      }
    } else if( *c == '/' && remainingStart < 0 ) {
      *c = 0;
      lastSkipped = true;
      remainingStart = document->terms.size();
    } else {
      lastSkipped = true;
      *c = 0;
    }
  }

  // put in phony positions entries for the url terms, starting just past
  // the last real token position
  int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0;

  for( size_t n = document->terms.size()-cnt; n < document->terms.size(); n++ ) {
    TermExtent extent;
    extent.begin = tokEnd++; // hope this doesn't run off the end
    extent.end = tokEnd;
    document->positions.push_back( extent );
  }

  // the URL text is now parsed and stored in the document;
  // all we need to do now is put some tags around the text.
  TagExtent* url = new TagExtent;
  url->begin = urlStart;
  url->end = document->terms.size();
  url->name = "url";
  url->number = 0;
  document->tags.push_back( url );

  TagExtent* domain = new TagExtent;
  domain->begin = urlStart;
  domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size();
  domain->name = "urldomain";
  domain->number = 0;
  document->tags.push_back( domain );

  if( remainingStart > 0 ) {
    TagExtent* urlpath = new TagExtent;
    urlpath->begin = remainingStart;
    urlpath->end = document->terms.size();
    urlpath->name = "urlpath";
    urlpath->number = 0;
    document->tags.push_back( urlpath );
  }

  return document;
}
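// A worked example of the transform above (the url is hypothetical, chosen
// only to illustrate the control flow). Given the metadata value
// "http://www.example.com/images/pic.jpg", and writing t for the value of
// urlStart (document->terms.size() before the url is tokenized):
//
//   after scheme skipping, urlText -> "www.example.com/images/pic.jpg"
//   terms pushed:  "www" "example" "com"  |  "images" "pic" "jpg"
//   remainingStart = t+3  (set at the first '/')
//
//   tags added:
//     url       [t,   t+6)   -- the whole tokenized url
//     urldomain [t,   t+3)   -- everything before the first '/'
//     urlpath   [t+3, t+6)   -- everything after it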
void _fetchText( indri::utility::greedy_vector<TagExtent*>& tags,
                 indri::utility::greedy_vector<char*>& terms ) {
  // now, fetch the additional terms
  char line[65536];
  _buffer.clear();

  for( int i=0; i<_count; i++ ) {
    // LINK
    _in.getline( line, sizeof line-1 );
    // LINKDOCNO
    _in.getline( line, sizeof line-1 );
    // TEXT= -- line+6 skips the 6-character TEXT=" prefix
    _in.getline( line, sizeof line-1 );

    int textLen = strlen( line+6 );
    strcpy( _buffer.write( textLen+1 ), line+6 );
    // drop the trailing '\0' from strcpy; the closing quote stays in the
    // buffer and acts as the link separator for the tokenizer below
    _buffer.unwrite(1);

    assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
  }
  *(_buffer.write(1)) = 0;

  // now there's a bunch of text in _buffer, space separated, with each
  // link separated by a " symbol
  char* beginWord = 0;
  int beginIndex = 0;
  char* buffer = _buffer.front();

  for( unsigned int i=0; i<_buffer.position(); i++ ) {
    if( isalnum(buffer[i]) && !beginWord ) {
      beginWord = buffer+i;
      if( !beginIndex )
        beginIndex = terms.size();
    } else if( isspace(buffer[i]) ) {
      buffer[i] = 0;
      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;
    } else if( buffer[i] == '\"' ) {
      buffer[i] = 0;
      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;

      // only allocate the extent when it will actually be kept; the
      // original allocated unconditionally and leaked it otherwise
      if( beginIndex ) {
        TagExtent* extent = new TagExtent;
        extent->name = "inlink";
        extent->begin = beginIndex;
        extent->end = terms.size();
        extent->number = 0;
        extent->parent = 0;
        assert( extent->begin <= extent->end );
        tags.push_back( extent );

        // cap the amount of anchor text collected for one document
        if( terms.size() > 125000 )
          break;
      }
      beginIndex = 0;
    }
  }
}
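// Illustrative sketch of the input this loop assumes, inferred from the
// comments in the read loop above (the values are hypothetical):
//
//   LINK=http://example.com/page.html
//   LINKDOCNO=WEB-0001234
//   TEXT="click here for more"
//
// The LINK and LINKDOCNO lines are read and discarded; only the quoted
// anchor text is accumulated. After _count records, _buffer holds
//   click here for more"click again"...
// and the tokenizer emits one "inlink" tag per closing quote, covering
// the terms of that link's anchor text.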