Пример #1
0
/**
 * Collects every index context at the head of the queue that shares the
 * same current term as the queue's top entry.
 *
 * The top entry is always taken; further entries are popped as long as
 * strcmp() reports their current term equal to the first one.  Popping
 * stops at the first entry whose term differs, or when the queue is empty.
 *
 * @param lists  output; cleared first, then filled with the popped contexts
 *               (the first popped context is lists[0])
 * @param queue  source queue; every context placed in lists is removed from it
 *
 * If the queue is empty on entry, lists is left empty and the queue untouched.
 */
void IndexWriter::_fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  lists.clear();

  // nothing to fetch: top()/pop() must not be called on an empty queue
  if( queue.size() == 0 )
    return;

  WriterIndexContext* first = queue.top();
  lists.push_back( first );
  const char* firstTerm = first->iterator->currentEntry()->termData->term;
  queue.pop();

  while( queue.size() != 0 ) {
    WriterIndexContext* candidate = queue.top();

    // stop at the first context positioned on a different term
    if( strcmp( firstTerm, candidate->iterator->currentEntry()->termData->term ) != 0 )
      break;

    lists.push_back( candidate );
    queue.pop();
  }
}
Пример #2
0
/**
 * Walks the extent list from extentIndex onward and registers every tag
 * that has started at or before the given position.
 *
 * A tag whose name maps to field id 0 (via _fieldID) is skipped but still
 * consumed.  The walk stops, without consuming, at the first tag whose
 * begin is greater than position.
 *
 * @param indexedTags  receives each accepted tag
 * @param openTags     receives each accepted tag
 * @param extents      tag extents to scan (presumably ordered by begin -- confirm with caller)
 * @param extentIndex  in/out cursor into extents; left at the first unconsumed tag
 * @param position     current position that a tag's begin is compared against
 */
void indri::index::MemoryIndex::_addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& extents,
                                              unsigned int& extentIndex, 
                                              unsigned int position ) {
  while( extentIndex < extents.size() ) {
    indri::parse::TagExtent* candidate = extents[extentIndex];

    // this tag has not started yet; leave the cursor on it
    if( candidate->begin > static_cast<int>( position ) )
      return;

    int fieldNumber = _fieldID( candidate->name );

    if( fieldNumber != 0 ) {
      openTags.push_back( candidate );
      indexedTags.push_back( candidate );
    }

    extentIndex++;
  }
}
 // Computes the list of this node's extents that match the given extent.
 // This work lives here (rather than in ExtentRestriction) so that it can be
 // overridden: ExtentParent, ExtentChild, and ExtentDescendant need to, since
 // parent/child/descendant relationships are not based on extent containment
 // and may hold among arbitrary fields in a document.
 //
 // This default version matches by containment: an extent from extents() is
 // reported when it lies entirely inside [extent.begin, extent.end].  The scan
 // stops at the first non-contained extent that begins after extent.end.
 // The returned reference is to _matches, which is rebuilt on every call.
 virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
   // copy the bounds up front, before _matches is cleared
   int lower = extent.begin;
   int upper = extent.end;
   const indri::utility::greedy_vector<indri::index::Extent>& candidates = extents();
   _matches.clear();

   for( size_t idx = 0; idx < candidates.size(); idx++ ) {
     bool contained = ( lower <= candidates[idx].begin ) && ( upper >= candidates[idx].end );

     if( contained ) {
       _matches.push_back( candidates[idx] );
       continue;
     }

     if( candidates[idx].begin > upper )
       break;
   }

   return _matches;
 }
Пример #4
0
      // Computes the list of this node's extents that match the given extent.
      // This work lives here (rather than in ExtentRestriction) so that it can
      // be overridden: ExtentParent, ExtentChild, and ExtentDescendant need to,
      // since parent/child/descendant relationships are not based on extent
      // containment and may hold among arbitrary fields in a document.
      //
      // This default version reports every extent from extents() that begins
      // in [extent.begin, extent.end) and ends at or before extent.end.  It
      // uses _lastpos as a cursor into the extent list instead of scanning
      // from index 0; _lastpos is not reset here, so its value carries over
      // between calls.  The returned reference is to _matches, which is
      // rebuilt on every call.
      virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
        int lower = extent.begin;
        int upper = extent.end;
        _matches.clear();
        const indri::utility::greedy_vector<indri::index::Extent>& candidates = extents();

        // an empty query extent or an empty extent list can match nothing
        if( lower != upper && candidates.size() != 0 ) {
          // rewind: when called for child extents the cursor may already be
          // past the first candidate that begins at or after `lower`
          for( ; ( _lastpos > 0 ) && ( candidates[_lastpos-1].begin >= lower ); _lastpos-- ) {
          }

          // fast-forward over candidates that begin before `lower`
          for( ; ( _lastpos < candidates.size() ) && ( candidates[_lastpos].begin < lower ); _lastpos++ ) {
          }

          // collect candidates that begin before `upper` and do not run past it
          // (for the default DocListIteratorNode, any extent has begin+1 == end)
          for( ; ( _lastpos < candidates.size() ) && ( candidates[_lastpos].begin < upper ); _lastpos++ ) {
            if( candidates[_lastpos].end > upper )
              continue;

            indri::index::Extent matched( candidates[_lastpos] );
            _matches.push_back( matched );
          }
        }

        return _matches;
      }
Пример #5
0
/**
 * Intersects two extent lists and writes the result to out.
 *
 * Both lists are walked in parallel.  At each step the overlap of the two
 * current extents is computed; overlaps that touch or overlap the previous
 * one are merged into a single extent, and only extents of non-zero length
 * are emitted.
 *
 * @param out  receives the intersection; cleared first
 * @param one  first input list (presumably ordered by position -- confirm with caller)
 * @param two  second input list (same assumption)
 */
void indri::infnet::ExtentAndNode::_and( indri::utility::greedy_vector<indri::index::Extent>& out, const indri::utility::greedy_vector<indri::index::Extent>& one, const indri::utility::greedy_vector<indri::index::Extent>& two ) {
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator oneIter = one.begin();
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator twoIter = two.begin();

  out.clear();

  // the intersection being accumulated; zero length means "nothing pending"
  indri::index::Extent current;
  current.begin = 0;
  current.end = 0;

  while( oneIter != one.end() && twoIter != two.end() ) {
    indri::index::Extent intersection;

    // compute the intersection (may be 0 length); when the two extents do
    // not overlap, begin is clamped down to end so that begin <= end holds
    intersection.begin = lemur_compat::max( oneIter->begin, twoIter->begin );
    intersection.end = lemur_compat::min( oneIter->end, twoIter->end );
    intersection.begin = lemur_compat::min( intersection.begin, intersection.end );

    if( current.end < intersection.begin ) {
      // if last intersection had non-zero length, put it out in the vector
      if( current.begin < current.end )
        out.push_back( current );

      current = intersection;
    } else {
      // this intersection touches the last intersection,
      // so we'll just put them together
      current.end = intersection.end;
    }

    // advance whichever input extent(s) end where the intersection ends;
    // intersection.end is the smaller of the two ends, so at least one moves
    if( oneIter->end == intersection.end ) {
      ++oneIter;
    }

    if( twoIter->end == intersection.end ) {
      ++twoIter;
    }
  }
  
  // flush the pending intersection into the caller's output vector
  // (not into the member _extents, which may be a different vector)
  if( current.begin != current.end )
    out.push_back( current );
}
Пример #6
0
      /**
       * Reads the text of _count link records from _in, splits it into
       * terms, and records one "inlink" tag per link.
       *
       * Each record is three lines (labelled LINK, LINKDOCNO and TEXT= below);
       * only the third is kept, minus its first six characters.  The kept
       * text of every link is expected to end in a double quote, which serves
       * as the link separator during tokenization.
       *
       * @param tags   receives a newly allocated TagExtent for every link that
       *               produced at least one term; begin/end are indexes into
       *               terms.  The extents are allocated with new and are not
       *               freed here.
       * @param terms  receives pointers to the NUL-terminated terms; these
       *               point into _buffer's storage.
       *
       * Tokenization stops once a link has been recorded and terms holds more
       * than 125000 entries.
       */
      void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
        // now, fetch the additional terms
        char line[65536];
        _buffer.clear();

        for( int i=0; i<_count; i++ ) {
          // LINK
          _in.getline( line, sizeof line-1 );

          // LINKDOCNO 
          _in.getline( line, sizeof line-1 );
          
          // TEXT=
          _in.getline( line, sizeof line-1 );
          int textLen = strlen(line+6);
          strcpy( _buffer.write(textLen+1), line+6 );
          // drop the terminating NUL so the next link's text follows directly
          _buffer.unwrite(1);
          
          assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
        }
        *(_buffer.write(1)) = 0;

        // now there's a bunch of text in _buffer, space separated, with each
        // link separated by a " symbol

        char* beginWord = 0;
        // index in terms of the current link's first term; -1 while the link
        // has no term yet (0 is a valid index, so it cannot be the sentinel)
        int beginIndex = -1;
        char* buffer = _buffer.front();

        for( unsigned int i=0; i<_buffer.position(); i++ ) {
          // the <cctype> classifiers require a value representable as unsigned char
          const unsigned char c = (unsigned char) buffer[i];

          if( isalnum(c) && !beginWord ) {
            beginWord = buffer+i;

            if( beginIndex < 0 )
              beginIndex = terms.size();
          } else if( isspace(c) ) {
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;
          } else if( c == '\"' ) {
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;

            // only allocate an extent for links that actually produced terms,
            // so nothing is allocated and then abandoned
            if( beginIndex >= 0 ) {
              TagExtent * extent = new TagExtent;
              extent->name = "inlink";
              extent->begin = beginIndex;
              extent->end = terms.size();
              extent->number = 0;
              extent->parent = 0;

              assert( extent->begin <= extent->end );

              tags.push_back(extent);
              if( terms.size() > 125000 )
                break;
            }

            beginIndex = -1;
          }

        }

      }