void TestHighFreqTerms(const char_t* index, int_t numTerms) { long_t start = lucene::util::Misc::currentTimeMillis(); IndexReader& reader = IndexReader::open(index); TermInfoQueue* tiq = new TermInfoQueue(numTerms); TermEnum* terms = &reader.getTerms(); int_t minFreq = 0; while (terms->next()) { if (terms->DocFreq() > minFreq) { tiq->put(new TermInfo(terms->getTerm(), terms->DocFreq())); if (tiq->Size() > numTerms) { // if tiq overfull tiq->pop(); // remove lowest in tiq minFreq = ((TermInfo*)tiq->top())->docFreq; // reset minFreq } } } while (tiq->Size() != 0) { TermInfo* termInfo = (TermInfo*)tiq->pop(); //_cout << termInfo->term->toString() << _T(" ") << termInfo->docFreq << endl; } reader.close(); delete &reader; _cout << (lucene::util::Misc::currentTimeMillis()-start) << _T(" milliseconds/") << numTerms << _T("Terms") << endl; }
/**
 * Expands this range query into a BooleanQuery holding one TermQuery per
 * index term that falls inside the range.
 *
 * Enumeration starts at <code>reader->terms(lowerTerm)</code> and stops at
 * the first term that is NULL, whose field differs from getField(), or that
 * lies beyond the upper bound. Each generated TermQuery receives this
 * query's boost.
 *
 * @param reader the <code>IndexReader</code> whose terms are enumerated
 * @return the newly allocated <code>BooleanQuery</code>
 * @exception any exception raised during enumeration is rethrown after the
 *            partially built query and the enumerator have been released
 */
Query* RangeQuery::rewrite(IndexReader* reader){
    BooleanQuery* expanded = _CLNEW BooleanQuery;
    TermEnum* termIter = reader->terms(lowerTerm);
    Term* current = NULL;

    try {
        // For an exclusive range, terms not strictly greater than the lower
        // term must be skipped until the first one that is.
        bool skipLowerBound = !inclusive;
        const TCHAR* rangeField = getField();

        do {
            current = termIter->term();

            // No term available, or we walked out of the queried field.
            if (current == NULL || current->field() != rangeField)
                break;

            const bool pastLower =
                !skipLowerBound || _tcscmp(current->text(), lowerTerm->text()) > 0;
            if (pastLower) {
                skipLowerBound = false;

                if (upperTerm != NULL) {
                    const int cmp = _tcscmp(upperTerm->text(), current->text());
                    const bool beyondUpper = (cmp < 0);
                    const bool onExcludedUpper = (!inclusive && cmp == 0);
                    // Past the upper term, or exactly on it while the range
                    // is exclusive: nothing further can match.
                    if (beyondUpper || onExcludedUpper)
                        break;
                }

                TermQuery* clause = _CLNEW TermQuery(current);  // matching term
                clause->setBoost(getBoost());                   // inherit our boost
                expanded->add(clause, true, false, false);
            }

            _CLDECDELETE(current);
        } while (termIter->next());
    } catch (...) {
        // Release everything acquired so far, then let the error propagate.
        _CLDECDELETE(current);
        _CLDELETE(expanded);
        termIter->close();
        _CLDELETE(termIter);
        throw;
    }

    // Also covers the term still held when the loop was left via `break`.
    _CLDECDELETE(current);
    termIter->close();
    _CLDELETE(termIter);

    return expanded;
}
void indexdump(const char* dir) { IndexReader* indexreader = IndexReader::open(dir); int32_t max = indexreader->maxDoc(); for (int i=0; i<max; ++i) { Document* doc = indexreader->document(i); if (doc) { docdump(doc); } } TermEnum* terms = indexreader->terms(); Term* t = 0; while (terms->next()) { t = terms->term(); printf("%s: %s\n", t2a(t->field()).c_str(), t2a(t->text()).c_str()); _CLDECDELETE(t); } }
Query* PrefixQuery::rewrite(IndexReader* reader){ BooleanQuery* query = _CLNEW BooleanQuery(); TermEnum* enumerator = reader->terms(prefix); Term* lastTerm = NULL; try { const TCHAR* prefixText = prefix->text(); const TCHAR* prefixField = prefix->field(); const TCHAR* tmp; size_t i; int32_t prefixLen = prefix->textLength(); do { lastTerm = enumerator->term(); if (lastTerm != NULL && lastTerm->field() == prefixField ){ //now see if term->text() starts with prefixText int32_t termLen = lastTerm->textLength(); if ( prefixLen>termLen ) break; //the prefix is longer than the term, can't be matched tmp = lastTerm->text(); //check for prefix match in reverse, since most change will be at the end for ( i=prefixLen-1;i!=-1;--i ){ if ( tmp[i] != prefixText[i] ){ tmp=NULL;//signals inequality break; } } if ( tmp == NULL ) break; TermQuery* tq = _CLNEW TermQuery(lastTerm); // found a match tq->setBoost(getBoost()); // set the boost query->add(tq,true,false, false); // add to query } else break; _CLDECDELETE(lastTerm); } while (enumerator->next()); }_CLFINALLY( enumerator->close(); _CLDELETE(enumerator); _CLDECDELETE(lastTerm); );
/**
 * Collects the texts of indexed terms that start with `keywordmatch`.
 *
 * For each field, the term enumeration is positioned at (field, prefix) and
 * read until a term no longer starts with the prefix or `max` distinct texts
 * have been gathered.
 *
 * @param keywordmatch UTF-8 prefix; converted with utf8toucs2() before matching.
 * @param fieldnames   fields to scan; when empty, the result of fieldNames()
 *                     is used instead.
 * @param max          upper limit on the number of distinct terms collected;
 *                     0 yields an empty result.
 * @param offset       not used by this implementation.
 * @return the collected term texts converted with wchartoutf8(), in the
 *         iteration order of the intermediate std::set<wstring>.
 */
vector<string> CLuceneIndexReader::keywords(const string& keywordmatch,
        const vector<string>& fieldnames, uint32_t max, uint32_t offset) {
    vector<string> fn;
    if (fieldnames.size()) {
        fn = fieldnames;
    } else {
        fn = fieldNames();
    }

    set<wstring> s;
    wstring prefix = utf8toucs2(keywordmatch);
    const wchar_t* prefixtext = prefix.c_str();
    string::size_type prefixLen = prefix.length();

    vector<string>::const_iterator i;
    // The bound used to read `s.size() << max` (a shift). With `s` empty on
    // entry that evaluates to 0, so no field was ever scanned and the result
    // was always empty. The intended comparison is `<`, as in the inner loop.
    for (i = fn.begin(); i != fn.end() && s.size() < max; ++i) {
        wstring fieldname(utf8toucs2(*i));
        Term term(fieldname.c_str(), prefix.c_str());
        TermEnum* enumerator = reader->terms(&term);
        do {
            // NOTE(review): term(false) presumably hands back the current
            // term without adding a reference, which is why it is not
            // released here — confirm against the TermEnum API.
            Term* lastTerm = enumerator->term(false);
            if (lastTerm) {
                // NOTE(review): only the text is compared; a term of a
                // following field that happens to start with the prefix would
                // also be accepted — verify whether the field must be checked.
                if (prefixLen > lastTerm->textLength()
                        || wcsncmp(lastTerm->text(), prefixtext, prefixLen)
                            != 0) {
                    break;
                }
                s.insert(lastTerm->text());
            }
        } while (enumerator->next() && s.size() < max);

        // The enumerator is created per field and was previously leaked.
        enumerator->close();
        delete enumerator;
    }

    vector<string> k;
    k.reserve(s.size());
    set<wstring>::const_iterator j;
    for (j = s.begin(); j != s.end(); ++j) {
        k.push_back(wchartoutf8(*j));
    }
    return k;
}