示例#1
0
// returns a new instance of FieldInfoList which represents field entities in a document index for a specific field, you must delete the instance later. @see FieldInfoList 
// Note that not all index types support fields - those that do should override this method.
lemur::api::FieldInfoList *lemur::index::LemurIndriIndex::fieldInfoList(lemur::api::DOCID_T docID, int fieldID) const {
  // get the index for this document
  indri::collection::Repository::index_state indexes = _repository->indexes();
  indri::index::Index* index = _indexWithDocument( indexes, docID );

  // ensure we do have an index (i.e. if the docID was invalid...)
  if (!index) return NULL;
  
  // and the indri term-list
  const indri::index::TermList *tList=index->termList((int)docID);

  // ensure we have a term list!
  if (!tList) return NULL;

  // create a blank field info list object
  lemur::api::IndriFieldInfoList *retVal=new lemur::api::IndriFieldInfoList();

  // loop through our fields and insert those that match the field ID
  const indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=tList->fields();
  int numFields=fieldVec.size();
  for (int i=0; i < numFields; i++) {
    indri::index::FieldExtent thisField=fieldVec[i];
    if (thisField.id==fieldID) {
      retVal->add(thisField);
    }
  }
  delete tList;
  // and return
  return retVal;
}
示例#2
0
void indri::index::MemoryIndex::_removeClosedTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& tags, unsigned int position ) {
  for( size_t i=0; i<tags.size(); ) {
    if( tags[i]->end <= int(position) ) {
      tags.erase( tags.begin() + i );
    } else {
      i++;
    }
  }
}
示例#3
0
void indri::index::MemoryIndex::_writeFieldExtents( lemur::api::DOCID_T documentID, indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags ) {
  indri::utility::HashTable< indri::parse::TagExtent *, int> tagIdMap;
  
  // sort fields
  std::sort( indexedTags.begin(), indexedTags.end(), indri::parse::LessTagExtent() );
  
  // we'll add to the end of the fields greedy_vector
  indri::utility::greedy_vector<indri::index::FieldExtent> & fields = _termList.fields();
  // this is used to set the parentOrdinals
  int offset = fields.size();
  
  // convert to field extents, set ids, and create the node map
  for( size_t i=0; i<indexedTags.size(); i++ ) {
    indri::parse::TagExtent * extent = indexedTags[i];
    
    int ordinal = int(i) + 1;

    // this is the id for the field type
    int tagId = _fieldID( extent->name );
    
    // convert the field
    indri::index::FieldExtent converted( tagId, extent->begin, extent->end, extent->number, ordinal);

    // add this node to the map;
    tagIdMap.insert( extent, ordinal );

    // add his field to the field list for the document
    fields.push_back( converted );

    // add this location to the inverted list for fields - deferred to below - 
    // _fieldLists[tagId - 1]->addLocation( documentID, extent->begin, extent->end, extent->number, ordinal );
  }

  // set the parent ordinals
  for( size_t j=0; j<indexedTags.size(); j++ ) {
    indri::parse::TagExtent * extent = indexedTags[j];
    
    // look up the parent 
    int parentOrdinal = 0;
    int * parentIter;
    if ( extent->parent != 0 ) {
      parentIter = tagIdMap.find( extent->parent );
      if( parentIter != 0 ) {
        parentOrdinal = *parentIter;
      } else {
        parentOrdinal = 0;
      }
    }
    // set the parent
    int ordinal = fields[ offset + j ].ordinal;
    int tagId = fields[ offset + j ].id;
    fields[ offset + j ].parentOrdinal = parentOrdinal;

    // add this location to the inverted list for fields
    _fieldLists[tagId - 1]->addLocation( documentID, extent->begin, extent->end, extent->number, ordinal, parentOrdinal );
  }
}
示例#4
0
void IndexWriter::_fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  lists.clear();

  WriterIndexContext* first = queue.top();
  lists.push_back( first );
  const char* firstTerm = first->iterator->currentEntry()->termData->term;
  queue.pop();

  while( queue.size() && !strcmp( firstTerm, queue.top()->iterator->currentEntry()->termData->term ) ) {
    lists.push_back( queue.top() );
    queue.pop();
  }
}
 // Moved work of identifying matches of an extent from ExtentRestriction to here.
 // This allows the computation for the list of matches for an extent to be 
 // overridden.  ExtentParent, ExtentChild, and ExtentDescendant are among the
 // classes that need to do this, as the parent/child/descedant relationships
 // are not based on extent containment - these relationships may be among
 // arbitrary fields in a document.
 virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
   int begin = extent.begin;
   int end = extent.end;
   const indri::utility::greedy_vector<indri::index::Extent>& exts = extents();
   _matches.clear();
   for( size_t i = 0 ; i < exts.size(); i++ ) {
     if ( begin <= exts[i].begin && end >= exts[i].end ) {
       _matches.push_back( exts[i] );
     } else if ( exts[i].begin > end ) {
       break;
     }
   }
   return _matches;
 }
示例#6
0
void indri::infnet::FixedPassageNode::_buildSubextents( const indri::utility::greedy_vector<indri::index::Extent>& extents ) {
  _subextents.clear();

  for( size_t i=0; i<extents.size(); i++ ) {
    _addSubextents( extents[i] );
  }
}
示例#7
0
void IndexWriter::_pushInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  for( int i=0; i<lists.size(); i++ ) {
    lists[i]->iterator->nextEntry();

    if( !lists[i]->iterator->finished() )
      queue.push( lists[i] );
  }
}
      // Moved work of identifying matches of an extent from ExtentRestriction to here.
      // This allows the computation for the list of matches for an extent to be 
      // overridden.  ExtentParent, ExtentChild, and ExtentDescendant are among the
      // classes that need to do this, as the parent/child/descedant relationships
      // are not based on extent containment - these relationships may be among
      // arbitrary fields in a document.
      virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
        int begin = extent.begin;
        int end = extent.end;
        _matches.clear();
        const indri::utility::greedy_vector<indri::index::Extent>& exts = extents();

        // if there's no extents or we have no length - just return
        if (begin == end || exts.size()==0) return _matches;

        // if we are dealing with child extents, we need to reverse the
        // list pointer to the last good position
        while((_lastpos > 0) && (exts[_lastpos-1].begin >= begin)){
          _lastpos--;
        }

        // now, we make sure we're in the correct position
        // after this loop, _lastpos->begin >= begin
        while((_lastpos < exts.size()) && (exts[_lastpos].begin < begin)){
          _lastpos++;
        }

        // for default DocListIteratorNode, any extent: begin+1 == end.
        while((_lastpos < exts.size()) && (exts[_lastpos].begin < end)) { 
          if(exts[_lastpos].end <= end) {
            indri::index::Extent ext(exts[_lastpos]);
            _matches.push_back(ext);
          } // end if(_exts[_lastpos].end<=end)
          _lastpos++;
        } // end while(_lastpos<_exts.size()&&_exts[_lastpos].begin<end)

/***
 *** old method of matching child extents - deprecated 
 *
 *      for( size_t i = 0 ; i < exts.size(); i++ ) {
 *        if ( begin <= exts[i].begin && end >= exts[i].end ) {
 *          _matches.push_back( exts[i] );
 *        } else if ( exts[i].begin > end ) {
 *          break;
 *        }
 *      }
 **/
        return _matches;
      }
示例#9
0
void indri::index::MemoryIndex::_addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& extents,
                                              unsigned int& extentIndex, 
                                              unsigned int position ) {
  for( ; extentIndex < extents.size(); extentIndex++ ) {
    indri::parse::TagExtent* extent = extents[extentIndex];
    
    if( extent->begin > (int)position )
      break;
    
    int tagId = _fieldID( extent->name );
    
    if( tagId == 0 )
      continue;
     
    openTags.push_back( extent );
    indexedTags.push_back( extent );
  }
}
示例#10
0
void IndexWriter::_storeMatchInformation( indri::utility::greedy_vector<WriterIndexContext*>& lists, int sequence, indri::index::TermData* termData, UINT64 startOffset, UINT64 endOffset ) {
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;

  if( isFrequent )
    _isFrequentCount++;

  for( int i=0; i<lists.size(); i++ ) {
    WriterIndexContext* list = lists[i];
    indri::index::DiskDocListIterator* iterator = dynamic_cast<DiskDocListIterator*>(lists[i]->iterator->currentEntry()->iterator);
    bool isMemoryIndex = (iterator == 0);
    bool wasFrequent = (isMemoryIndex || iterator->isFrequent());

    if( !wasFrequent )
      list->wasInfrequentCount++;

    if( wasFrequent )
      list->wasFrequentCount++;

    list->sequenceCount++;

    if( !wasFrequent ) {
      if( !isFrequent ) {
        // common case--remaining infrequent
        assert( sequence - _isFrequentCount  - 1 >= 0 );
        assert( ((sequence -_isFrequentCount  - 1) + _isFrequentCount + 1) <= _corpus.uniqueTerms );
        list->bitmap->add( list->wasInfrequentCount - 1, sequence - _isFrequentCount - 1 );
      } else if( isFrequent ) {
        // becoming frequent
        list->newlyFrequent->add( list->wasInfrequentCount - 1, termData->term );
      }
    }
  }

  if( isFrequent ) {
    indri::index::DiskTermData* diskTermData = disktermdata_create( _fields.size() );
    ::termdata_merge( diskTermData->termData, termData, _fields.size() );
    diskTermData->startOffset = startOffset;
    diskTermData->length = endOffset - startOffset;
    strcpy( const_cast<char*>(diskTermData->termData->term), termData->term );

    _topTerms.push_back( diskTermData );
  } else {
    indri::index::DiskTermData diskTermData;

    diskTermData.termData = termData;
    diskTermData.startOffset = startOffset;
    diskTermData.length = endOffset - startOffset;
    diskTermData.termID = sequence - _topTerms.size();

    _storeTermEntry( _infrequentTerms, &diskTermData );
  }
}
示例#11
0
void IndexWriter::_writeStatistics( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, UINT64& startOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  ::termdata_clear( termData, _fields.size() );

  // find out what term we're writing
  strcpy( const_cast<char*>(termData->term), lists[0]->iterator->currentEntry()->termData->term );

  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    ::termdata_merge( termData, listData->termData,  _fields.size() );
  }

  _termDataBuffer.clear();
  indri::utility::RVLCompressStream stream( _termDataBuffer );

  stream << termData->term;
  ::termdata_compress( stream, termData,  _fields.size() );

  startOffset = _invertedOutput->tell();

  int dataSize = stream.dataSize();
  _invertedOutput->write( &dataSize, sizeof(UINT32) );
  _invertedOutput->write( stream.data(), stream.dataSize() );
}
 /// sorts the extents by the beginning extent position (if available)
 void sortbegin(indri::utility::greedy_vector<indri::index::Extent>& extents){
   int sorted=0;
   int lastbegin = 0; int lastend = extents.size();
   while(lastbegin<lastend){
     int i=lastbegin;
     int end=lastend-1;
     lastbegin=lastend;
     lastend=i;
     for (;i<end;i++){
       if (extents[i].begin > extents[i+1].begin){
         indri::index::Extent x(extents[i]);
         extents[i]=extents[i+1];extents[i+1]=x;
         if(lastbegin>i)lastbegin=i;
         if(lastend<i+1)lastend=i+1;
       }
     }
   }
 }
示例#13
0
void indri::infnet::ExtentAndNode::_and( indri::utility::greedy_vector<indri::index::Extent>& out, const indri::utility::greedy_vector<indri::index::Extent>& one, const indri::utility::greedy_vector<indri::index::Extent>& two ) {
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator oneIter = one.begin();
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator twoIter = two.begin();

  out.clear();

  indri::index::Extent current;
  current.begin = 0;
  current.end = 0;

  while( oneIter != one.end() && twoIter != two.end() ) {
    indri::index::Extent intersection;

    // compute the intersection (may be 0 length)
    intersection.begin = lemur_compat::max( oneIter->begin, twoIter->begin );
    intersection.end = lemur_compat::min( oneIter->end, twoIter->end );
    intersection.begin = lemur_compat::min( intersection.begin, intersection.end );

    if( current.end < intersection.begin ) {
      // if last intersection had non-zero length, put it out in the vector
      if( current.begin < current.end )
        out.push_back( current );

      current = intersection;
    } else {
      // this intersection touches the last intersection,
      // so we'll just put them together
      current.end = intersection.end;
    }

    if( oneIter->end == intersection.end ) {
      oneIter++;
    }

    if( twoIter->end == intersection.end ) {
      twoIter++;
    }
  }
  
  if( current.begin != current.end )
    _extents.push_back( current );
}
示例#14
0
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k
  int documentsWritten = 0;

  const float topdocsFraction = 0.01f;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * 0.01) : 0;
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents
  std::priority_queue<DocListIterator::TopDocument,
    std::vector<DocListIterator::TopDocument>,
    DocListIterator::TopDocument::greater> topdocs;

  double threshold = 0;

  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();

      // add to document counter
      docs++; listDocs++;

      // update the topdocs list
      if( hasTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = documentData->positions.size();

        // compute DocListIterator::TopDocument::greater (current, top())
        // if false, no reason to insert this entry.
        // note that the test is inverted. 
        //  int(length * threshold) <= count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length
        // but we use < to force breaking a tie in favor of keeping
        // the first seen document.
        if( int(length * threshold) < count || topdocs.size() < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document,
                                                    count,
                                                    length );
            topdocs.push( topDocument );
            while( topdocs.size() > topdocsCount )
              topdocs.pop();

          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }
      
      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // delta encode documents by batch
        lastDocument = 0;
      }

      assert( documentData->document > lastDocument );

      // write this entry out to the list
      stream << documentData->document - lastDocument;
      stream << (int) documentData->positions.size();
      lastDocument = documentData->document;

      int lastPosition = 0;

      for( int i=0; i<documentData->positions.size(); i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++; listPositions++;
      }

      iterator->nextEntry();
    }

    indri::index::TermData* td = iterator->termData();

    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( topdocs.size() == topdocsCount );

    // write these into the topdocs list in order from smallest fraction to largest fraction,
    // where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();
      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }
    
    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}
示例#15
0
      void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
        // now, fetch the additional terms
        char line[65536];
        _buffer.clear();

        for( int i=0; i<_count; i++ ) {
          // LINK
          _in.getline( line, sizeof line-1 );

          // LINKDOCNO 
          _in.getline( line, sizeof line-1 );
          
          // TEXT=
          _in.getline( line, sizeof line-1 );
          int textLen = strlen(line+6);
          strcpy( _buffer.write(textLen+1), line+6 );
          _buffer.unwrite(1);
          
          assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
        }
        *(_buffer.write(1)) = 0;

        // now there's a bunch of text in _buffer, space separated, with each
        // link separated by a " symbol

        char* beginWord = 0;
        int beginIndex = 0;
        char* buffer = _buffer.front();

        for( unsigned int i=0; i<_buffer.position(); i++ ) {
          if( isalnum(buffer[i]) && !beginWord ) {
            beginWord = buffer+i;

            if(!beginIndex)
              beginIndex = terms.size();
          } else if( isspace(buffer[i]) ) {
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;
          } else if( buffer[i] == '\"' ) {
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;
        
            TagExtent * extent = new TagExtent;
            extent->name = "inlink";
            extent->begin = beginIndex;
            extent->end = terms.size();
            extent->number = 0;
            extent->parent = 0;

            assert( extent->begin <= extent->end );

            if( beginIndex ) {
              tags.push_back(extent);
              if( terms.size() > 125000 )
                break;
            }


            beginIndex = 0;
          }

        }

      }