// Returns a new instance of FieldInfoList which represents field entities in a
// document index for a specific field; the caller must delete the instance later.
// @see FieldInfoList
// Note that not all index types support fields - those that do should override this method.
lemur::api::FieldInfoList *lemur::index::LemurIndriIndex::fieldInfoList( lemur::api::DOCID_T docID, int fieldID ) const {
  // get the index for this document
  indri::collection::Repository::index_state indexes = _repository->indexes();
  indri::index::Index* index = _indexWithDocument( indexes, docID );

  // ensure we do have an index (i.e. if the docID was invalid...)
  if( !index ) return NULL;

  // and the indri term-list
  const indri::index::TermList *tList = index->termList( (int)docID );

  // ensure we have a term list!
  if( !tList ) return NULL;

  // create a blank field info list object
  lemur::api::IndriFieldInfoList *retVal = new lemur::api::IndriFieldInfoList();

  // loop through the document's fields and insert those that match the field ID
  // (use a const reference to avoid copying the extent vector)
  const indri::utility::greedy_vector< indri::index::FieldExtent >& fieldVec = tList->fields();
  size_t numFields = fieldVec.size();
  for( size_t i=0; i < numFields; i++ ) {
    const indri::index::FieldExtent& thisField = fieldVec[i];
    if( thisField.id == fieldID ) {
      retVal->add( thisField );
    }
  }

  delete tList;

  // and return
  return retVal;
}
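// A minimal caller-side sketch: fieldInfoList() returns a heap-allocated list
// (or NULL for an unknown docID), so the caller owns the result and must
// delete it. The index and ID variables here are hypothetical.
//
//   lemur::api::FieldInfoList* fields = myIndex->fieldInfoList( docID, fieldID );
//   if( fields ) {
//     // ... iterate over the matching field extents ...
//     delete fields;
//   }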
void indri::index::MemoryIndex::_removeClosedTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& tags, unsigned int position ) {
  for( size_t i=0; i<tags.size(); ) {
    if( tags[i]->end <= int(position) ) {
      tags.erase( tags.begin() + i );
    } else {
      i++;
    }
  }
}
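// A standalone sketch (plain std::vector and a simplified Tag struct, not the
// Indri types) of the same "drop tags that have closed" pass written with the
// erase/remove_if idiom; this variant compacts the vector in a single pass
// instead of paying for a separate O(n) erase per removed element.
#include <algorithm>
#include <vector>

struct Tag { int end; };

void removeClosedTags( std::vector<Tag*>& tags, unsigned int position ) {
  tags.erase( std::remove_if( tags.begin(), tags.end(),
                              [position]( const Tag* t ) {
                                return t->end <= int(position);
                              } ),
              tags.end() );
}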
void indri::index::MemoryIndex::_writeFieldExtents( lemur::api::DOCID_T documentID, indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags ) {
  indri::utility::HashTable< indri::parse::TagExtent *, int > tagIdMap;

  // sort fields
  std::sort( indexedTags.begin(), indexedTags.end(), indri::parse::LessTagExtent() );

  // we'll add to the end of the fields greedy_vector
  indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _termList.fields();

  // this is used to set the parentOrdinals
  int offset = fields.size();

  // convert to field extents, set ids, and create the node map
  for( size_t i=0; i<indexedTags.size(); i++ ) {
    indri::parse::TagExtent * extent = indexedTags[i];
    int ordinal = int(i) + 1;

    // this is the id for the field type
    int tagId = _fieldID( extent->name );

    // convert the field
    indri::index::FieldExtent converted( tagId, extent->begin, extent->end, extent->number, ordinal );

    // add this node to the map
    tagIdMap.insert( extent, ordinal );

    // add this field to the field list for the document
    fields.push_back( converted );

    // adding this location to the inverted list for fields is deferred until
    // the parent ordinal is known (see the second loop below)
  }

  // set the parent ordinals
  for( size_t j=0; j<indexedTags.size(); j++ ) {
    indri::parse::TagExtent * extent = indexedTags[j];

    // look up the parent
    int parentOrdinal = 0;
    if( extent->parent != 0 ) {
      int * parentIter = tagIdMap.find( extent->parent );
      parentOrdinal = ( parentIter != 0 ) ? *parentIter : 0;
    }

    // set the parent
    int ordinal = fields[ offset + j ].ordinal;
    int tagId = fields[ offset + j ].id;
    fields[ offset + j ].parentOrdinal = parentOrdinal;

    // add this location to the inverted list for fields
    _fieldLists[tagId - 1]->addLocation( documentID, extent->begin, extent->end, extent->number, ordinal, parentOrdinal );
  }
}
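// A standalone sketch (hypothetical types, not the Indri API) of the ordinal
// scheme above: after sorting, each tag gets a 1-based ordinal, and each tag
// records its parent's ordinal, with 0 meaning "no parent". For <a><b>...</b></a>
// the "a" tag gets ordinal 1 / parent 0 and the nested "b" tag gets ordinal 2 / parent 1.
#include <cstdio>
#include <map>

struct SimpleTag { const char* name; SimpleTag* parent; };

int main() {
  SimpleTag a = { "a", 0 };
  SimpleTag b = { "b", &a };
  SimpleTag* sorted[] = { &a, &b };          // already in document order

  std::map<SimpleTag*, int> ordinals;        // stands in for tagIdMap
  for( int i = 0; i < 2; i++ )
    ordinals[ sorted[i] ] = i + 1;           // assign 1-based ordinals

  for( int i = 0; i < 2; i++ ) {
    int parentOrdinal = sorted[i]->parent ? ordinals[ sorted[i]->parent ] : 0;
    std::printf( "%s: ordinal=%d parent=%d\n", sorted[i]->name, i + 1, parentOrdinal );
  }
  return 0;
}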
void IndexWriter::_fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  lists.clear();

  WriterIndexContext* first = queue.top();
  lists.push_back( first );
  const char* firstTerm = first->iterator->currentEntry()->termData->term;
  queue.pop();

  while( queue.size() && !strcmp( firstTerm, queue.top()->iterator->currentEntry()->termData->term ) ) {
    lists.push_back( queue.top() );
    queue.pop();
  }
}
// Moved work of identifying matches of an extent from ExtentRestriction to here.
// This allows the computation for the list of matches for an extent to be
// overridden. ExtentParent, ExtentChild, and ExtentDescendant are among the
// classes that need to do this, as the parent/child/descendant relationships
// are not based on extent containment - these relationships may be among
// arbitrary fields in a document.
virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
  int begin = extent.begin;
  int end = extent.end;
  const indri::utility::greedy_vector<indri::index::Extent>& exts = extents();
  _matches.clear();

  for( size_t i = 0; i < exts.size(); i++ ) {
    if( begin <= exts[i].begin && end >= exts[i].end ) {
      // the inner extent lies entirely inside [begin, end]
      _matches.push_back( exts[i] );
    } else if( exts[i].begin > end ) {
      // extents are sorted by begin, so nothing later can match
      break;
    }
  }

  return _matches;
}
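// A standalone sketch (hypothetical Extent struct, not the Indri type) of the
// containment test above: an inner extent matches when it lies entirely inside
// the query extent, and the sorted order lets the scan stop early.
#include <cstdio>
#include <vector>

struct Ext { int begin, end; };

int main() {
  std::vector<Ext> exts = { {2, 4}, {5, 9}, {12, 14} };  // sorted by begin
  Ext query = { 1, 10 };

  for( size_t i = 0; i < exts.size(); i++ ) {
    if( query.begin <= exts[i].begin && query.end >= exts[i].end )
      std::printf( "[%d,%d) matches\n", exts[i].begin, exts[i].end );
    else if( exts[i].begin > query.end )
      break;  // sorted by begin: no later extent can be contained
  }
  return 0;  // prints [2,4) and [5,9); [12,14) starts after the query ends
}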
void indri::infnet::FixedPassageNode::_buildSubextents( const indri::utility::greedy_vector<indri::index::Extent>& extents ) {
  _subextents.clear();

  for( size_t i=0; i<extents.size(); i++ ) {
    _addSubextents( extents[i] );
  }
}
void IndexWriter::_pushInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue ) {
  for( size_t i=0; i<lists.size(); i++ ) {
    lists[i]->iterator->nextEntry();

    if( !lists[i]->iterator->finished() )
      queue.push( lists[i] );
  }
}
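// A minimal standalone sketch (simplified types, not the IndexWriter API) of
// the k-way merge that _fetchMatchingInvertedLists and _pushInvertedLists
// implement together: a min-heap keyed on each list's current term pops every
// list positioned at the smallest term, the merged entry is processed, and
// each popped list is advanced and re-pushed until it is exhausted.
#include <cstdio>
#include <queue>
#include <string>
#include <vector>

struct ListCursor {
  std::vector<std::string> terms;  // this list's terms, in sorted order
  size_t pos;
  bool finished() const { return pos >= terms.size(); }
  const std::string& current() const { return terms[pos]; }
};

struct CursorGreater {
  bool operator()( const ListCursor* a, const ListCursor* b ) const {
    return a->current() > b->current();  // min-heap on the current term
  }
};

int main() {
  ListCursor a{ { "apple", "cat" }, 0 }, b{ { "apple", "bird" }, 0 };
  std::priority_queue<ListCursor*, std::vector<ListCursor*>, CursorGreater> queue;
  queue.push( &a );
  queue.push( &b );

  while( !queue.empty() ) {
    // fetch: pop every list sitting at the same smallest term
    std::vector<ListCursor*> matching;
    std::string term = queue.top()->current();
    while( !queue.empty() && queue.top()->current() == term ) {
      matching.push_back( queue.top() );
      queue.pop();
    }
    std::printf( "%s from %zu list(s)\n", term.c_str(), matching.size() );

    // push: advance each matching list and re-queue the unfinished ones
    for( size_t i = 0; i < matching.size(); i++ ) {
      matching[i]->pos++;
      if( !matching[i]->finished() )
        queue.push( matching[i] );
    }
  }
  return 0;  // prints: apple from 2 list(s), bird from 1 list(s), cat from 1 list(s)
}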
// Moved work of identifying matches of an extent from ExtentRestriction to here.
// This allows the computation for the list of matches for an extent to be
// overridden. ExtentParent, ExtentChild, and ExtentDescendant are among the
// classes that need to do this, as the parent/child/descendant relationships
// are not based on extent containment - these relationships may be among
// arbitrary fields in a document.
virtual const indri::utility::greedy_vector<indri::index::Extent>& matches( indri::index::Extent &extent ) {
  int begin = extent.begin;
  int end = extent.end;

  _matches.clear();
  const indri::utility::greedy_vector<indri::index::Extent>& exts = extents();

  // if there are no extents or the query extent has no length - just return
  if( begin == end || exts.size() == 0 )
    return _matches;

  // if we are dealing with child extents, we need to reverse the
  // list pointer to the last good position
  while( (_lastpos > 0) && (exts[_lastpos-1].begin >= begin) ) {
    _lastpos--;
  }

  // now, we make sure we're in the correct position;
  // after this loop, exts[_lastpos].begin >= begin
  while( (_lastpos < exts.size()) && (exts[_lastpos].begin < begin) ) {
    _lastpos++;
  }

  // for the default DocListIteratorNode, any extent has begin+1 == end
  while( (_lastpos < exts.size()) && (exts[_lastpos].begin < end) ) {
    if( exts[_lastpos].end <= end ) {
      indri::index::Extent ext( exts[_lastpos] );
      _matches.push_back( ext );
    }
    _lastpos++;
  }

  /***
   *** old method of matching child extents - deprecated
   *
   * for( size_t i = 0; i < exts.size(); i++ ) {
   *   if( begin <= exts[i].begin && end >= exts[i].end ) {
   *     _matches.push_back( exts[i] );
   *   } else if( exts[i].begin > end ) {
   *     break;
   *   }
   * }
   **/

  return _matches;
}
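// A standalone sketch (simplified types, not the Indri classes) of the saved
// cursor used above: because successive query extents usually move forward
// through the document, keeping _lastpos between calls makes each call cheap
// instead of rescanning from the start, and the backward loop restores
// correctness when a query extent begins earlier than the previous one.
#include <cstdio>
#include <vector>

struct Ext { int begin, end; };

class Matcher {
  std::vector<Ext> exts;   // sorted by begin
  size_t lastpos;          // persists across calls, like _lastpos
public:
  explicit Matcher( std::vector<Ext> e ) : exts( e ), lastpos( 0 ) {}

  int countContained( int begin, int end ) {
    // back up if this query starts before where the cursor stopped
    while( lastpos > 0 && exts[lastpos-1].begin >= begin ) lastpos--;
    // skip extents that begin before the query
    while( lastpos < exts.size() && exts[lastpos].begin < begin ) lastpos++;
    // count extents fully inside [begin, end)
    int n = 0;
    while( lastpos < exts.size() && exts[lastpos].begin < end ) {
      if( exts[lastpos].end <= end ) n++;
      lastpos++;
    }
    return n;
  }
};

int main() {
  Matcher m( { {1,2}, {3,6}, {7,8}, {9,12} } );
  std::printf( "%d\n", m.countContained( 0, 8 ) );   // 3: [1,2) [3,6) [7,8)
  std::printf( "%d\n", m.countContained( 8, 12 ) );  // 1: [9,12)
  return 0;
}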
void indri::index::MemoryIndex::_addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags,
                                              indri::utility::greedy_vector<indri::parse::TagExtent *>& extents,
                                              unsigned int& extentIndex,
                                              unsigned int position ) {
  for( ; extentIndex < extents.size(); extentIndex++ ) {
    indri::parse::TagExtent* extent = extents[extentIndex];

    if( extent->begin > (int)position )
      break;

    int tagId = _fieldID( extent->name );

    if( tagId == 0 )
      continue;

    openTags.push_back( extent );
    indexedTags.push_back( extent );
  }
}
void IndexWriter::_storeMatchInformation( indri::utility::greedy_vector<WriterIndexContext*>& lists, int sequence, indri::index::TermData* termData, UINT64 startOffset, UINT64 endOffset ) {
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;

  if( isFrequent )
    _isFrequentCount++;

  for( size_t i=0; i<lists.size(); i++ ) {
    WriterIndexContext* list = lists[i];
    indri::index::DiskDocListIterator* iterator = dynamic_cast<DiskDocListIterator*>(lists[i]->iterator->currentEntry()->iterator);
    bool isMemoryIndex = (iterator == 0);
    bool wasFrequent = (isMemoryIndex || iterator->isFrequent());

    if( !wasFrequent )
      list->wasInfrequentCount++;

    if( wasFrequent )
      list->wasFrequentCount++;

    list->sequenceCount++;

    if( !wasFrequent ) {
      if( !isFrequent ) {
        // common case--the term remains infrequent
        assert( sequence - _isFrequentCount - 1 >= 0 );
        assert( ((sequence - _isFrequentCount - 1) + _isFrequentCount + 1) <= _corpus.uniqueTerms );
        list->bitmap->add( list->wasInfrequentCount - 1, sequence - _isFrequentCount - 1 );
      } else {
        // the term is becoming frequent
        list->newlyFrequent->add( list->wasInfrequentCount - 1, termData->term );
      }
    }
  }

  if( isFrequent ) {
    indri::index::DiskTermData* diskTermData = disktermdata_create( _fields.size() );
    ::termdata_merge( diskTermData->termData, termData, _fields.size() );
    diskTermData->startOffset = startOffset;
    diskTermData->length = endOffset - startOffset;
    strcpy( const_cast<char*>(diskTermData->termData->term), termData->term );

    _topTerms.push_back( diskTermData );
  } else {
    indri::index::DiskTermData diskTermData;

    diskTermData.termData = termData;
    diskTermData.startOffset = startOffset;
    diskTermData.length = endOffset - startOffset;
    diskTermData.termID = sequence - _topTerms.size();

    _storeTermEntry( _infrequentTerms, &diskTermData );
  }
}
void IndexWriter::_writeStatistics( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, UINT64& startOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  ::termdata_clear( termData, _fields.size() );

  // find out what term we're writing
  strcpy( const_cast<char*>(termData->term), lists[0]->iterator->currentEntry()->termData->term );

  // merge the term statistics from each matching list
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    ::termdata_merge( termData, listData->termData, _fields.size() );
  }

  _termDataBuffer.clear();
  indri::utility::RVLCompressStream stream( _termDataBuffer );

  stream << termData->term;
  ::termdata_compress( stream, termData, _fields.size() );

  startOffset = _invertedOutput->tell();

  int dataSize = stream.dataSize();
  _invertedOutput->write( &dataSize, sizeof(UINT32) );
  _invertedOutput->write( stream.data(), stream.dataSize() );
}
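// RVLCompressStream writes integers with a variable-byte scheme. As a
// standalone illustration (a sketch of the general technique, not necessarily
// Indri's exact byte layout): each byte carries 7 bits of payload, and the
// high bit marks the final byte of a value, so small numbers - the common
// case after delta encoding - take a single byte.
#include <cstdio>
#include <vector>

void vbyteEncode( unsigned int value, std::vector<unsigned char>& out ) {
  while( value >= 128 ) {
    out.push_back( (unsigned char)(value & 0x7F) );  // 7 payload bits, high bit clear
    value >>= 7;
  }
  out.push_back( (unsigned char)(value | 0x80) );    // high bit set: last byte
}

int main() {
  std::vector<unsigned char> bytes;
  vbyteEncode( 5, bytes );      // fits in 1 byte
  vbyteEncode( 1000, bytes );   // needs 2 bytes
  std::printf( "%zu bytes total\n", bytes.size() );  // prints "3 bytes total"
  return 0;
}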
/// sorts the extents by the beginning extent position (if available)
void sortbegin( indri::utility::greedy_vector<indri::index::Extent>& extents ) {
  // bubble sort that remembers the first and last swap positions, so each
  // pass only revisits the region that may still be unsorted
  int lastbegin = 0;
  int lastend = extents.size();

  while( lastbegin < lastend ) {
    int i = lastbegin;
    int end = lastend - 1;
    lastbegin = lastend;
    lastend = i;

    for( ; i < end; i++ ) {
      if( extents[i].begin > extents[i+1].begin ) {
        indri::index::Extent x( extents[i] );
        extents[i] = extents[i+1];
        extents[i+1] = x;
        // a swap at i can disorder the pair (i-1, i), so the next pass
        // must start one position earlier than the first swap
        if( lastbegin > i ) lastbegin = (i > 0) ? i - 1 : 0;
        if( lastend < i+1 ) lastend = i + 1;
      }
    }
  }
}
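// A standalone check of the window bubble sort above (plain ints instead of
// the Indri Extent type); the input [2,3,1] exercises the left-boundary case
// where the pair just before the first swap must be revisited on the next pass.
#include <cstdio>
#include <vector>

void sortbegin( std::vector<int>& v ) {
  int lastbegin = 0;
  int lastend = (int)v.size();
  while( lastbegin < lastend ) {
    int i = lastbegin;
    int end = lastend - 1;
    lastbegin = lastend;
    lastend = i;
    for( ; i < end; i++ ) {
      if( v[i] > v[i+1] ) {
        int x = v[i]; v[i] = v[i+1]; v[i+1] = x;
        if( lastbegin > i ) lastbegin = (i > 0) ? i - 1 : 0;
        if( lastend < i+1 ) lastend = i + 1;
      }
    }
  }
}

int main() {
  std::vector<int> v = { 2, 3, 1 };
  sortbegin( v );
  for( size_t i = 0; i < v.size(); i++ ) std::printf( "%d ", v[i] );  // 1 2 3
  std::printf( "\n" );
  return 0;
}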
void indri::infnet::ExtentAndNode::_and( indri::utility::greedy_vector<indri::index::Extent>& out, const indri::utility::greedy_vector<indri::index::Extent>& one, const indri::utility::greedy_vector<indri::index::Extent>& two ) {
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator oneIter = one.begin();
  indri::utility::greedy_vector<indri::index::Extent>::const_iterator twoIter = two.begin();
  out.clear();

  indri::index::Extent current;
  current.begin = 0;
  current.end = 0;

  while( oneIter != one.end() && twoIter != two.end() ) {
    indri::index::Extent intersection;

    // compute the intersection (may be 0 length)
    intersection.begin = lemur_compat::max( oneIter->begin, twoIter->begin );
    intersection.end = lemur_compat::min( oneIter->end, twoIter->end );
    intersection.begin = lemur_compat::min( intersection.begin, intersection.end );

    if( current.end < intersection.begin ) {
      // if the last intersection had non-zero length, put it out in the vector
      if( current.begin < current.end )
        out.push_back( current );

      current = intersection;
    } else {
      // this intersection touches the last intersection,
      // so we'll just put them together
      current.end = intersection.end;
    }

    if( oneIter->end == intersection.end ) {
      oneIter++;
    }

    if( twoIter->end == intersection.end ) {
      twoIter++;
    }
  }

  // flush the final intersection into the output vector
  // (push to out, not _extents, so the helper works for any output vector)
  if( current.begin != current.end )
    out.push_back( current );
}
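// A standalone sketch of the merge above (plain structs, not the Indri types):
// walk two sorted extent lists, intersect the current pair, fuse touching
// intersections, and advance whichever list's extent ends at the intersection.
#include <cstdio>
#include <vector>

struct Ext { int begin, end; };

std::vector<Ext> intersect( const std::vector<Ext>& one, const std::vector<Ext>& two ) {
  std::vector<Ext> out;
  size_t i = 0, j = 0;
  Ext current = { 0, 0 };
  while( i < one.size() && j < two.size() ) {
    Ext inter;
    inter.begin = one[i].begin > two[j].begin ? one[i].begin : two[j].begin;
    inter.end   = one[i].end   < two[j].end   ? one[i].end   : two[j].end;
    if( inter.begin > inter.end ) inter.begin = inter.end;  // clamp empty overlap

    if( current.end < inter.begin ) {
      if( current.begin < current.end ) out.push_back( current );
      current = inter;
    } else {
      current.end = inter.end;  // touching: fuse into one extent
    }
    // inter.end equals at least one list's current end, so we always advance
    if( one[i].end == inter.end ) i++;
    if( two[j].end == inter.end ) j++;
  }
  if( current.begin != current.end ) out.push_back( current );
  return out;
}

int main() {
  std::vector<Ext> a = { {1, 5}, {8, 12} };
  std::vector<Ext> b = { {3, 9} };
  std::vector<Ext> r = intersect( a, b );
  for( size_t k = 0; k < r.size(); k++ )
    std::printf( "[%d,%d) ", r[k].begin, r[k].end );  // [3,5) [8,9)
  std::printf( "\n" );
  return 0;
}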
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k

  const float topdocsFraction = 0.01f;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * topdocsFraction) : 0;
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents
  std::priority_queue<DocListIterator::TopDocument, std::vector<DocListIterator::TopDocument>, DocListIterator::TopDocument::greater> topdocs;
  double threshold = 0;

  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();

      // add to document counter
      docs++;
      listDocs++;

      // update the topdocs list
      if( hasTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = documentData->positions.size();

        // compute DocListIterator::TopDocument::greater (current, top());
        // if false, there is no reason to insert this entry.
        // note that the test is inverted:
        // int(length * threshold) <= count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length,
        // but we use < to force breaking a tie in favor of keeping
        // the first seen document.
        if( int(length * threshold) < count || (int)topdocs.size() < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document, count, length );

          topdocs.push( topDocument );
          while( (int)topdocs.size() > topdocsCount )
            topdocs.pop();

          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }

      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // delta encode documents by batch
        lastDocument = 0;
      }

      assert( documentData->document > lastDocument );

      // write this entry out to the list
      stream << documentData->document - lastDocument;
      stream << (int) documentData->positions.size();
      lastDocument = documentData->document;

      int lastPosition = 0;

      for( size_t i=0; i<documentData->positions.size(); i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++;
        listPositions++;
      }

      iterator->nextEntry();
    }

    indri::index::TermData* td = iterator->termData();
    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( (int)topdocs.size() == topdocsCount );

    // write these into the topdocs list in order from smallest fraction to
    // largest fraction, where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();

      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }

    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}
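// A standalone sketch of the delta (d-gap) encoding used when writing the
// postings above (plain ints, not the RVLCompressStream API): document IDs
// and within-document positions are stored as differences from the previous
// value, which keeps the numbers small so a variable-byte coder needs fewer
// bytes per entry.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> docIDs = { 3, 7, 8, 15 };

  int lastDocument = 0;
  std::vector<int> gaps;
  for( size_t i = 0; i < docIDs.size(); i++ ) {
    gaps.push_back( docIDs[i] - lastDocument );  // store the gap, not the ID
    lastDocument = docIDs[i];
  }

  // decoding is a running sum over the gaps
  int current = 0;
  for( size_t i = 0; i < gaps.size(); i++ ) {
    current += gaps[i];
    std::printf( "gap=%d doc=%d\n", gaps[i], current );  // gaps: 3 4 1 7
  }
  return 0;
}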
void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
  // now, fetch the additional terms
  char line[65536];
  _buffer.clear();

  for( int i=0; i<_count; i++ ) {
    // LINK
    _in.getline( line, sizeof line-1 );

    // LINKDOCNO
    _in.getline( line, sizeof line-1 );

    // TEXT=
    _in.getline( line, sizeof line-1 );

    int textLen = strlen( line+6 );
    strcpy( _buffer.write(textLen+1), line+6 );
    _buffer.unwrite(1);

    assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
  }
  *(_buffer.write(1)) = 0;

  // now there's a bunch of text in _buffer, space separated, with each
  // link separated by a " symbol

  char* beginWord = 0;
  int beginIndex = 0;
  char* buffer = _buffer.front();

  for( unsigned int i=0; i<_buffer.position(); i++ ) {
    if( isalnum(buffer[i]) && !beginWord ) {
      beginWord = buffer+i;

      if( !beginIndex )
        beginIndex = terms.size();
    } else if( isspace(buffer[i]) ) {
      buffer[i] = 0;

      if( beginWord )
        terms.push_back( beginWord );

      beginWord = 0;
    } else if( buffer[i] == '\"' ) {
      buffer[i] = 0;

      if( beginWord )
        terms.push_back( beginWord );

      beginWord = 0;

      // only build the inlink extent if this link produced any terms
      // (allocating after the check also avoids leaking the extent)
      if( beginIndex ) {
        TagExtent * extent = new TagExtent;
        extent->name = "inlink";
        extent->begin = beginIndex;
        extent->end = terms.size();
        extent->number = 0;
        extent->parent = 0;
        assert( extent->begin <= extent->end );

        tags.push_back( extent );

        if( terms.size() > 125000 )
          break;
      }

      beginIndex = 0;
    }
  }
}