void indri::infnet::InferenceNetwork::_indexChanged( indri::index::Index& index ) {
  _closeIterators.clear();
  _closeIteratorBound = -1;

  // doc iterators
  for( size_t i=0; i<_termNames.size(); i++ ) {
    indri::index::DocListIterator* iterator = index.docListIterator( _termNames[i] );
    if( iterator )
      iterator->startIteration();

    _docIterators.push_back( iterator );
  }

  // field iterators
  for( size_t i=0; i<_fieldNames.size(); i++ ) {
    indri::index::DocExtentListIterator* iterator = index.fieldListIterator( _fieldNames[i] );
    if( iterator )
      iterator->startIteration();

    _fieldIterators.push_back( iterator );
  }

  // prior iterators
  for( size_t i=0; i<_priorNames.size(); i++ ) {
    // TODO: this is wasteful, since the prior is associated with the whole collection;
    // there's no need to fetch it for each index, but it's easier to code it like this for now
    indri::collection::PriorListIterator* iterator = _repository.priorListIterator( _priorNames[i] );
    if( iterator )
      iterator->startIteration();
    else {
      // if the named prior doesn't exist in the Repository, throw an Exception
      LEMUR_THROW( LEMUR_RUNTIME_ERROR, "named prior: " + _priorNames[i] + " not found in Repository. Unable to process query." );
    }

    _priorIterators.push_back( iterator );
  }

  // extent iterator nodes
  std::vector<ListIteratorNode*>::iterator diter;
  for( diter = _listIteratorNodes.begin(); diter != _listIteratorNodes.end(); diter++ ) {
    (*diter)->indexChanged( index );
  }

  // belief nodes
  std::vector<BeliefNode*>::iterator biter;
  for( biter = _beliefNodes.begin(); biter != _beliefNodes.end(); biter++ ) {
    (*biter)->indexChanged( index );
  }

  // evaluator nodes
  std::vector<indri::infnet::EvaluatorNode*>::iterator eiter;
  for( eiter = _evaluators.begin(); eiter != _evaluators.end(); eiter++ ) {
    (*eiter)->indexChanged( index );
  }

  // document structure
  if( _documentStructureHolderNode != 0 ) {
    _documentStructureHolderNode->indexChanged( index );
  }
}
void indri::infnet::ContextSimpleCountAccumulator::_computeCounts( indri::index::Index& index ) {
  _size += index.termCount();
  _documentCount += index.documentCount();
  _documentOccurrences += index.documentCount( _term );

  if( _term.length() != 0 ) {
    _occurrences += index.termCount( _term );
  }
}
void indri::infnet::InferenceNetwork::_evaluateIndex( indri::index::Index& index ) {
  // don't need to do anything unless there are some
  // evaluators in the network that need full evaluation
  if( _complexEvaluators.size() ) {
    lemur::api::DOCID_T maximumDocument = index.documentMaximum();

    if( maximumDocument == index.documentBase() ) {
      // empty memory index, nothing to score.
      return;
    }

    lemur::api::DOCID_T lastCandidate = MAX_INT32; // 64
    int scoredDocuments = 0;
    lemur::api::DOCID_T candidate = 0;
    indri::index::DeletedDocumentList::read_transaction* deleted;
    deleted = _repository.deletedList().getReadTransaction();

    while(1) {
      // ask the root node for a candidate document;
      // this asks the whole inference network for the
      // first document that might possibly produce a
      // usable (above the max score threshold) score
      candidate = _nextCandidateDocument( deleted );

      if( candidate < index.documentBase() ) {
        std::cerr << candidate << " < index.documentBase()" << std::endl;
        break;
      }
      assert( candidate >= index.documentBase() );

      // if candidate is MAX_INT32, we're done
      if( candidate == MAX_INT32 || candidate > maximumDocument ) {
        break;
      }

      // move all the doc info lists to this new document
      // in preparation for scoring
      if( candidate != lastCandidate ) {
        _moveToDocument( candidate );
      }

      // ask all the evaluators to evaluate this document
      _evaluateDocument( index, candidate );
      scoredDocuments++;

      // if that was the last document, we can quit now
      if( candidate+1 > maximumDocument )
        break;

      // move all candidate iterators to candidate+1
      _moveToDocument( candidate+1 );
      lastCandidate = candidate+1;
      assert( candidate >= index.documentBase() );
    }

    delete deleted;
  }
}
void indri::infnet::InferenceNetwork::_evaluateDocument( indri::index::Index& index, lemur::api::DOCID_T document ) {
  int candidateLength = index.documentLength( document );

  for( size_t i=0; i<_complexEvaluators.size(); i++ ) {
    _complexEvaluators[i]->evaluate( document, candidateLength );
  }
}
void _mergeData() {
  // fill in tags from the document vector fields;
  // unnecessary to filter out any fields to remove, as they will
  // be ignored when indexing.
  indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields();
  tl.clear();
  for (size_t i = 0; i < fields.size(); i++) {
    const indri::index::FieldExtent& field = fields[i];
    std::string fieldName = _index->field(field.id);
    tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin);
    tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end);
  }
  // stuff it into the parsed doc
  tl.writeTagList(parsed->tags);

  // fill in terms from the document text so that they will stop/stem
  // correctly when added to the new repository.
  // Potential issues with url injection here...
  // probably best not to do this with trecweb/html docs....
  // TODO: test this
  termBuffer.clear();
  // grow the buffer up front so that later write() calls should not reallocate
  // (which would invalidate the pointers already pushed into parsed->terms)
  termBuffer.grow( parsed->textLength * 2 );

  for (size_t i = 0; i < parsed->positions.size(); i++ ) {
    int start = parsed->positions[i].begin;
    int end = parsed->positions[i].end;
    int token_len = end - start;
    const char *token = parsed->text + start;
    // copy the raw token and null-terminate it
    char* write_loc = termBuffer.write( token_len + 1 );
    strncpy( write_loc, token, token_len );
    write_loc[token_len] = '\0';
    parsed->terms.push_back( write_loc );
  }
}
void indri::infnet::FieldWildcardNode::indexChanged( indri::index::Index& index ) {
  if ( _docIter != 0 ) {
    delete _docIter;
    _docIter = 0;
  }

  _index = &index;
  _nextDocument = 1;
  _docIterID = 1;

  _docIter = index.termListFileIterator();
  _docIter->startIteration();
}
void indri::infnet::ShrinkageBeliefNode::indexChanged( indri::index::Index& index ) {
  _ruleMap.clear();

  std::set<smoothing_rule, lt_rule>::iterator ruleIter = _ruleSet.begin();
  while( ruleIter != _ruleSet.end() ) {
    int field = index.field( ruleIter->fieldName );
    _ruleMap[ field ] = *ruleIter;
    ruleIter++;
  }
}
void indri::infnet::DocumentStructureHolderNode::indexChanged( indri::index::Index& index ) {
  _index = &index;
  _nextDocument = 1;
  _docIterID = 1;

  delete _docIter;
  _docIter = index.termListFileIterator();
  _docIter->startIteration();

  delete _documentStructure;
  _documentStructure = 0;
}
void indri::infnet::ContextSimpleCountAccumulator::_computeCounts( indri::index::Index& index ) {
  assert( _terms.size() );
  assert( _context.size() == 0 || _field.size() == 0 );

  if( _context.size() ) {
    _size += index.fieldTermCount( _context );
  } else {
    _size += index.termCount();
  }

  for( unsigned int i=0; i<_terms.size(); i++ ) {
    if( _terms[i].length() != 0 ) {
      if( _field.size() ) {
        _occurrences += index.fieldTermCount( _field, _terms[i] );
      } else if( _context.size() ) {
        _occurrences += index.fieldTermCount( _context, _terms[i] );
      } else {
        _occurrences += index.termCount( _terms[i] );
      }
    }
  }
}
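A minimal standalone sketch of how counts like these are commonly consumed (an assumption about the caller, not code from the Indri sources; the variable names and numbers below are hypothetical): a maximum-likelihood background estimate divides the accumulated term occurrences by the size of the context they were counted in.

#include <cstdio>

// Hypothetical illustration only: a background probability P(term | context),
// assuming occurrences and contextSize were gathered as in _computeCounts above.
int main() {
  const double occurrences = 1200.0;    // assumed count of the term within the context
  const double contextSize = 5000000.0; // assumed total number of terms in the context
  std::printf( "background P(t|C) = %g\n", occurrences / contextSize );
  return 0;
}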
void indri::infnet::ContextCountAccumulator::indexChanged( indri::index::Index& index ) {
  if( ! _context ) {
    _contextSize += index.termCount();
  }
  _documentCount += index.documentCount();
}
void indri::infnet::ContextSimpleCountAccumulator::indexChanged( indri::index::Index& index ) {
  _computeCounts( index );
  _maximumDocument = index.documentCount() + index.documentBase();
}
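A small self-contained illustration of the document-ID bookkeeping used here and in _evaluateIndex (the numbers below are assumptions for the example, not values from Indri): an index whose first document ID is documentBase() and which holds documentCount() documents has its first out-of-range ID at documentBase() + documentCount().

#include <cassert>

int main() {
  // hypothetical second index in a repository, covering docids 1001..1500
  const int documentBase    = 1001;                          // first docid in this index (assumed)
  const int documentCount   = 500;                           // documents held by this index (assumed)
  const int maximumDocument = documentBase + documentCount;  // first docid past the end

  assert( maximumDocument == 1501 );
  // an empty in-memory index would have documentCount == 0, which is what the
  // maximumDocument == index.documentBase() check in _evaluateIndex detects
  return 0;
}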
void processFields( indri::api::Parameters &param ) {
  g_timer.start();

  std::string index = param.get("index");
  std::cout << "Opening: " << index << std::endl;
  // make sure this path doesn't exist.
  std::string idx2 = index + ".new"; // temp target index.

  // presumes a single input oa file for the entire collection.
  std::string offsetAnnotationsPath = param.get("annotations");

  // fields to add
  // these need to be combined with existing ones, and
  // need to supply numeric/parental/ordinal/etc...
  if (param.exists("addField"))
    addFields = param["addField"];

  // fields to remove
  // these only need to be a list of names.
  if (param.exists("removeField")) {
    indri::api::Parameters slice = param["removeField"];
    for (size_t i = 0; i < slice.size(); i++) {
      if( slice[i].exists("name") ) {
        removeNames.push_back( slice[i]["name"] );
      }
    }
  }

  // need to know the file class environment to get the
  // conflations right.
  std::string className = param.get("fileclass", "");

  indri::collection::Repository sourceRepo;
  indri::collection::Repository targetRepo;
  indri::parse::OffsetAnnotationAnnotator oa_annotator;
  indri::parse::FileClassEnvironmentFactory _fileClassFactory;

  // Open source repo
  sourceRepo.openRead(index);

  // Copy its parameters, create target repo, adding or removing fields.
  repo.loadFile( indri::file::Path::combine( index, "manifest" ) );
  int mem = param.get("memory", INT64(100*1024*1024));
  repo.set("memory", mem);
  adding = addFields.exists("field");
  _mergeFields();

  // Create the offset annotator.
  fce = _fileClassFactory.get( className );
  indri::parse::Conflater* conflater = 0;
  if( fce ) {
    conflater = fce->conflater;
  }
  if (adding) {
    oa_annotator.setConflater( conflater );
    oa_annotator.open( offsetAnnotationsPath );
  }

  targetRepo.create(idx2, &repo);

  // for each document in the source repo, fetch the ParsedDocument,
  // construct the full representation, apply the annotator, and insert
  // it into the target repo.
  _index = sourceRepo.indexes()->front(); // presume 1
  _docIter = _index->termListFileIterator();
  _docIter->startIteration();

  // ought to deal with deleted documents here...
  // if there are deleted documents, do a regular add to the collection;
  // if not, only rewrite the indexes, then rename the collection.
  indri::index::DeletedDocumentList& deleted = sourceRepo.deletedList();
  UINT64 delCount = deleted.deletedCount();
  if (delCount > 0) {
    // either warn, compact and then process, or
    // do it the old way... FIXME!
    std::cerr << "Deleted documents detected... compact with dumpindex first." << std::endl;
    return;
  }

  for (UINT64 docid = 1; docid <= _index->documentCount(); docid++) {
    if ((docid % 500) == 0) {
      g_timer.printElapsedSeconds(std::cout);
      std::cout << ": " << docid << "\r";
      std::cout.flush();
    }
    parsed = sourceRepo.collection()->retrieve(docid);
    // combine field and term data with the parsed document
    _mergeData();
    // apply annotator
    if (adding)
      parsed = oa_annotator.transform(parsed);
    targetRepo.addDocument(parsed, false);
    // TagList allocates memory for the tags...
    for (size_t i = 0; i < parsed->tags.size(); i++)
      delete(parsed->tags[i]);
    delete(parsed);
    _docIter->nextEntry();
  }
  std::cout << std::endl;
  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": " << _index->documentCount() << std::endl;

  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": closing" << std::endl;
  targetRepo.close();
  sourceRepo.close();

  std::string oldcollectionPath = indri::file::Path::combine( index, "collection" );
  std::string newcollectionPath = indri::file::Path::combine( idx2, "collection" );

  // "clone" the collection by moving it from the old repository into the new one
  indri::file::Path::remove(newcollectionPath);
  indri::file::Path::rename(oldcollectionPath, newcollectionPath);

  // rename target repo to source repo.
  indri::file::Path::remove(index);
  indri::file::Path::rename(idx2, index);

  g_timer.printElapsedSeconds(std::cout);
  std::cout << ": done" << std::endl;
}