//
// _fetchMatchingInvertedLists
//
// Pops the top context off <queue>, then keeps popping for as long as the
// new top refers to the same term string (compared with strcmp).  Every
// popped context is appended to <lists>, which is emptied first.
//
// The queue must not be empty on entry: top() is called unconditionally.
// NOTE(review): this presumably relies on the queue ordering contexts by
// term, so that equal terms come off the queue consecutively -- confirm
// against the invertedlist_pqueue comparator.
//

void IndexWriter::_fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  lists.clear();

  // the first context off the queue decides which term we are collecting
  WriterIndexContext* leader = queue.top();
  lists.push_back( leader );
  const char* leaderTerm = leader->iterator->currentEntry()->termData->term;
  queue.pop();

  for( ;; ) {
    if( queue.size() == 0 )
      break;

    WriterIndexContext* candidate = queue.top();
    const char* candidateTerm = candidate->iterator->currentEntry()->termData->term;

    // stop at the first context positioned on a different term
    if( strcmp( leaderTerm, candidateTerm ) != 0 )
      break;

    lists.push_back( candidate );
    queue.pop();
  }
}
void indri::index::MemoryIndex::_addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags, indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags, indri::utility::greedy_vector<indri::parse::TagExtent *>& extents, unsigned int& extentIndex, unsigned int position ) { for( ; extentIndex < extents.size(); extentIndex++ ) { indri::parse::TagExtent* extent = extents[extentIndex]; if( extent->begin > (int)position ) break; int tagId = _fieldID( extent->name ); if( tagId == 0 ) continue; openTags.push_back( extent ); indexedTags.push_back( extent ); } }
// Moved work of identifying matches of an extent from ExtentRestriction to here. // This allows the computation for the list of matches for an extent to be // overridden. ExtentParent, ExtentChild, and ExtentDescendant are among the // classes that need to do this, as the parent/child/descedant relationships // are not based on extent containment - these relationships may be among // arbitrary fields in a document. virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) { int begin = extent.begin; int end = extent.end; const indri::utility::greedy_vector<indri::index::Extent>& exts = extents(); _matches.clear(); for( size_t i = 0 ; i < exts.size(); i++ ) { if ( begin <= exts[i].begin && end >= exts[i].end ) { _matches.push_back( exts[i] ); } else if ( exts[i].begin > end ) { break; } } return _matches; }
// Moved work of identifying matches of an extent from ExtentRestriction to here. // This allows the computation for the list of matches for an extent to be // overridden. ExtentParent, ExtentChild, and ExtentDescendant are among the // classes that need to do this, as the parent/child/descedant relationships // are not based on extent containment - these relationships may be among // arbitrary fields in a document. virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) { int begin = extent.begin; int end = extent.end; _matches.clear(); const indri::utility::greedy_vector<indri::index::Extent>& exts = extents(); // if there's no extents or we have no length - just return if (begin == end || exts.size()==0) return _matches; // if we are dealing with child extents, we need to reverse the // list pointer to the last good position while((_lastpos > 0) && (exts[_lastpos-1].begin >= begin)){ _lastpos--; } // now, we make sure we're in the correct position // after this loop, _lastpos->begin >= begin while((_lastpos < exts.size()) && (exts[_lastpos].begin < begin)){ _lastpos++; } // for default DocListIteratorNode, any extent: begin+1 == end. while((_lastpos < exts.size()) && (exts[_lastpos].begin < end)) { if(exts[_lastpos].end <= end) { indri::index::Extent ext(exts[_lastpos]); _matches.push_back(ext); } // end if(_exts[_lastpos].end<=end) _lastpos++; } // end while(_lastpos<_exts.size()&&_exts[_lastpos].begin<end) /*** *** old method of matching child extents - deprecated * * for( size_t i = 0 ; i < exts.size(); i++ ) { * if ( begin <= exts[i].begin && end >= exts[i].end ) { * _matches.push_back( exts[i] ); * } else if ( exts[i].begin > end ) { * break; * } * } **/ return _matches; }
//
// _and
//
// Computes the intersection of two extent lists and stores it in <out>.
//
//   out -- receives the result; it is cleared first
//   one -- first extent list
//   two -- second extent list
//
// Both lists are walked in parallel.  At each step the overlap of the two
// current extents is computed; an overlap that touches or overlaps the one
// being accumulated is merged into it, otherwise the accumulated extent is
// emitted (if it has non-zero length) and a new one is started.  The
// iterator(s) whose extent ends at the end of the overlap are then advanced.
//
// The last accumulated extent is written to <out>, like every other result.
// It used to be pushed onto the _extents member instead, which silently
// dropped the final extent whenever <out> was a different vector and added
// a stray entry to _extents.  Callers that pass _extents as <out> see no
// change.
//

void indri::infnet::ExtentAndNode::_and( indri::utility::greedy_vector<indri::index::Extent>& out,
                                         const indri::utility::greedy_vector<indri::index::Extent>& one,
                                         const indri::utility::greedy_vector<indri::index::Extent>& two ) {
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator oneIter = one.begin();
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator twoIter = two.begin();

  out.clear();

  // the intersection currently being accumulated; zero length means "none yet"
  indri::index::Extent current;
  current.begin = 0;
  current.end = 0;

  while( oneIter != one.end() && twoIter != two.end() ) {
    indri::index::Extent intersection;

    // compute the intersection (may be 0 length)
    intersection.begin = lemur_compat::max( oneIter->begin, twoIter->begin );
    intersection.end = lemur_compat::min( oneIter->end, twoIter->end );
    // clamp begin so that a disjoint pair yields an empty extent, not an inverted one
    intersection.begin = lemur_compat::min( intersection.begin, intersection.end );

    if( current.end < intersection.begin ) {
      // there is a gap: if the last intersection had non-zero length,
      // put it out in the vector, then start accumulating a new one
      if( current.begin < current.end )
        out.push_back( current );

      current = intersection;
    } else {
      // this intersection touches the last intersection,
      // so we'll just put them together
      current.end = intersection.end;
    }

    // intersection.end is the smaller of the two ends, so at least one of
    // these tests succeeds and the loop always makes progress
    if( oneIter->end == intersection.end ) {
      ++oneIter;
    }

    if( twoIter->end == intersection.end ) {
      ++twoIter;
    }
  }

  // flush the final accumulated intersection into the caller's vector
  if( current.begin != current.end )
    out.push_back( current );
}
void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) { // now, fetch the additional terms char line[65536]; _buffer.clear(); for( int i=0; i<_count; i++ ) { // LINK _in.getline( line, sizeof line-1 ); // LINKDOCNO _in.getline( line, sizeof line-1 ); // TEXT= _in.getline( line, sizeof line-1 ); int textLen = strlen(line+6); strcpy( _buffer.write(textLen+1), line+6 ); _buffer.unwrite(1); assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" ); } *(_buffer.write(1)) = 0; // now there's a bunch of text in _buffer, space separated, with each // link separated by a " symbol char* beginWord = 0; int beginIndex = 0; char* buffer = _buffer.front(); for( unsigned int i=0; i<_buffer.position(); i++ ) { if( isalnum(buffer[i]) && !beginWord ) { beginWord = buffer+i; if(!beginIndex) beginIndex = terms.size(); } else if( isspace(buffer[i]) ) { buffer[i] = 0; if( beginWord ) terms.push_back( beginWord ); beginWord = 0; } else if( buffer[i] == '\"' ) { buffer[i] = 0; if( beginWord ) terms.push_back( beginWord ); beginWord = 0; TagExtent * extent = new TagExtent; extent->name = "inlink"; extent->begin = beginIndex; extent->end = terms.size(); extent->number = 0; extent->parent = 0; assert( extent->begin <= extent->end ); if( beginIndex ) { tags.push_back(extent); if( terms.size() > 125000 ) break; } beginIndex = 0; } } }