bool Matcher::add_occurrence(off_t pos, off_t tpos, size_t len) {
    QueryTerm* mexp = _match_iter.current();
    LOG(spam, "Match: %s(%ld)", mexp->term(), tpos);

    // Add the new occurrence to the sequence of all occurrences
    key_occ_ptr k = new key_occ(mexp->term(), pos, tpos, len);
    if (!k) return false;
    _occ.push_back(k);

    if (!(_need_complete_cnt > 0)) {
        size_t nodeno;
        // From the head of the sequences, remove any candidates that are
        // "too old", i.e. that did not complete within the winsize window,
        // and trigger further processing of complete matches:
        for (nodeno = 0; nodeno < _nontermcnt; nodeno++) {
            match_sequence& ws = _wrk_set[nodeno];
            for (match_sequence::iterator it = ws.begin(); it != ws.end();) {
                MatchCandidate* m = (*it);
                if ((k->startpos() - m->startpos()) < static_cast<int>(_winsize)) break;
                it = ws.erase(it); // This moves the iterator forward
                if (m->partial_ok())
                    update_match(m);
                else
                    DerefCandidate(m);
            }
        }
    }

    // Then add a new candidate starting at the currently found keyword
    // for each subexpression that matches this keyword
    for (; mexp != NULL; mexp = _match_iter.next()) {
        QueryNode* pexp = mexp->_parent;
        assert(pexp);
        MatchCandidate* nm = NewCandidate(pexp);
        if (!nm || nm->elems() < 0) {
            LOG(error, "Matcher could not allocate memory for candidate - bailing out");
            if (nm) DerefCandidate(nm);
            return false;
        }
        match_sequence& cs = _wrk_set[pexp->_node_idx];
        if (cs.size() >= _max_match_candidates) {
            DerefCandidate(nm);
            LOG(debug, "The max number of match candidates (%zu) in the work set for query node idx '%u' "
                "has been reached. No more candidates are added",
                _max_match_candidates, pexp->_node_idx);
        } else {
            cs.push_back(nm);
        }
        update_wrk_set(cs, k, mexp);
    }
    return true;
}
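// A minimal, self-contained sketch of the window-pruning step above, assuming
// nothing about Juniper's MatchCandidate internals. Candidates sit in the
// sequence in arrival order, so once the head is inside the window every later
// candidate is too, and the scan can stop early. The names Candidate and
// prune_window are hypothetical.
#include <cstddef>
#include <list>

struct Candidate { long startpos; };

// Evict candidates whose window [startpos, startpos + winsize) no longer
// covers the newest occurrence position; return how many were evicted.
size_t prune_window(std::list<Candidate>& seq, long newest_pos, size_t winsize) {
    size_t evicted = 0;
    for (auto it = seq.begin(); it != seq.end();) {
        if (newest_pos - it->startpos < static_cast<long>(winsize))
            break;              // the rest of the sequence is still inside the window
        it = seq.erase(it);     // erase() advances the iterator
        ++evicted;
    }
    return evicted;
}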
void Matcher::dump_statistics() {
    int nterms = QueryTerms();
    fprintf(stderr, "%20s %12s %12s\n", "Term", "Matches", "Exact");
    for (int i = 0; i < nterms; i++) {
        QueryTerm* q = _mo->Term(i);
        fprintf(stderr, "%20s %12d %12d\n", q->term(), q->total_match_cnt, q->exact_match_cnt);
    }
}
void lemur::retrieval::QueryModel::interpolateWith(const lemur::langmod::UnigramLM &qModel,
                                                   double origModCoeff, int howManyWord,
                                                   double prSumThresh, double prThresh) {
    if (!qm) {
        qm = new lemur::api::IndexedRealVector();
    } else {
        qm->clear();
    }

    qModel.startIteration();
    while (qModel.hasMore()) {
        IndexedReal entry;
        qModel.nextWordProb((TERMID_T &)entry.ind, entry.val);
        qm->push_back(entry);
    }
    qm->Sort();

    double countSum = totalCount();

    // discounting the original model so it carries origModCoeff of the mass
    startIteration();
    while (hasMore()) {
        QueryTerm *qt = nextTerm();
        setCount(qt->id(), qt->weight() * origModCoeff / countSum);
        delete qt;
    }

    // now adding the new (feedback) model, highest-probability terms first,
    // until a cutoff (term count, probability mass, or per-term threshold) is hit
    double prSum = 0;
    int wdCount = 0;
    IndexedRealVector::iterator it = qm->begin();
    while (it != qm->end() && prSum < prSumThresh &&
           wdCount < howManyWord && (*it).val >= prThresh) {
        incCount((*it).ind, (*it).val * (1 - origModCoeff));
        prSum += (*it).val;
        it++;
        wdCount++;
    }

    // Sum_{w in Q} qtf * log(qtcf/termcount)
    colQLikelihood = 0;
    colQueryLikelihood();
    colKLComputed = false;
}
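// The arithmetic performed above, in isolation: the updated query model is the
// mixture P'(w|Q) = a * P(w|Q) + (1 - a) * P_fb(w), where a is origModCoeff
// and P_fb is the sorted, truncated feedback model. A minimal sketch over
// plain maps, assuming both inputs already hold probabilities rather than raw
// counts; interpolate() is a hypothetical name.
#include <map>

std::map<int, double> interpolate(const std::map<int, double>& orig,
                                  const std::map<int, double>& feedback,
                                  double a) {
    std::map<int, double> mixed;
    for (const auto& [id, p] : orig)     mixed[id] += a * p;
    for (const auto& [id, p] : feedback) mixed[id] += (1.0 - a) * p;
    return mixed;
}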
void lemur::retrieval::QueryModel::clarity(ostream &os) {
    int count = 0;
    double sum = 0, ln_Pr = 0;
    startIteration();
    QueryTerm *qt;
    while (hasMore()) {
        qt = nextTerm();
        count++;
        // query-clarity = SUM_w { P(w|Q) * log(P(w|Q)/P(w)) }
        // P(w) = cf(w)/|C|
        double pw = ((double)ind.termCount(qt->id()) / (double)ind.termCount());
        // P(w|Q) is a prob computed by any model, e.g. relevance models
        double pwq = qt->weight();
        sum += pwq;
        ln_Pr += pwq * log(pwq / pw);
        delete qt;
    }
    // clarity should be computed with log_2, so divide by log(2).
    os << "=" << count << " " << (ln_Pr / (sum ? sum : 1.0) / log(2.0)) << endl;
    startIteration();
    while (hasMore()) {
        qt = nextTerm();
        // print clarity for each query term
        // clarity should be computed with log_2, so divide by log(2).
        os << ind.term(qt->id()) << " "
           << (qt->weight() * log(qt->weight() /
               ((double)ind.termCount(qt->id()) / (double)ind.termCount()))) / log(2.0)
           << endl;
        delete qt;
    }
}
void lemur::retrieval::QueryModel::save(ostream &os) {
    // first pass: count the terms
    int count = 0;
    startIteration();
    QueryTerm *qt;
    while (hasMore()) {
        qt = nextTerm();
        count++;
        delete qt;
    }
    os << " " << count << endl;
    // second pass: write one "term weight" pair per line
    startIteration();
    while (hasMore()) {
        qt = nextTerm();
        os << ind.term(qt->id()) << " " << qt->weight() << endl;
        delete qt;
    }
}
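// For reference, save() above emits a term count followed by one "term weight"
// pair per line, which is the same format the load() routine further below
// parses back in. A made-up example (terms and weights are illustrative only):
//
//     3
//    retrieval 0.52
//    model 0.31
//    query 0.17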
void QueryGroupView::re_load_terms(QueryGroup* group) {
    QueryTermView* view;

    // destroy the widgets of each term view
    for (view = f_term_view_list; view; view = view->next_term_view())
        view->destroy_widgets();

    // delete the term views; each QueryTermView destructor is assumed to
    // unlink itself from f_term_view_list, otherwise this loop would not terminate
    while (f_term_view_list != NULL)
        delete f_term_view_list;

    delete f_query_group;
    f_query_group = group;

    // load the new terms
    QueryTerm *term;
    for (view = 0, term = group->term_list(); term != NULL; term = term->next())
        view = new QueryTermView(term, this, view, NULL);
}
QueryTerm* match_iterator::first() {
    for (; _el != NULL; _el = _el->GetNext()) {
        QueryTerm* q = _el->GetItem();

        // If this subexpression requires an exact match,
        // reject tokens that are longer than the query term
        if (q->Exact() && _len > q->len) continue;

        if (q->is_wildcard()) {
            if (!fast::util::wildcard_match(_term, q->ucs4_term())) continue;
            return q;
        }

        if (_len < q->ucs4_len) continue;

        // allow a prefix match iff this is a prefix query term, or the extra
        // length is within the stemming extension and the term is longer than _stem_min
        if (!q->is_prefix()) {
            size_t stem_extend = (q->ucs4_len <= _stem_min ? 0 : _stemext);
            if (_len > q->ucs4_len + stem_extend) continue;
        }
        if (juniper::strncmp(_term, q->ucs4_term(), q->ucs4_len) != 0) continue;
        return q;
    }
    return NULL;
}
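// The length-based acceptance rules above, restated as a standalone predicate
// (a sketch; length_acceptable and its parameter names are hypothetical, and
// the actual character comparison and wildcard handling are elided):
#include <cstddef>

bool length_acceptable(size_t token_len, size_t term_len,
                       bool is_prefix, size_t stem_min, size_t stem_ext) {
    if (token_len < term_len) return false;  // token must cover the whole term
    if (is_prefix) return true;              // prefix terms accept any tail
    // stemming: short terms must match exactly, longer ones may extend a little
    size_t allowed = (term_len <= stem_min) ? 0 : stem_ext;
    return token_len <= term_len + allowed;
}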
void lemur::retrieval::QueryModel::load(istream &is) {
    // clear existing counts
    startIteration();
    QueryTerm *qt;
    while (hasMore()) {
        qt = nextTerm();
        setCount(qt->id(), 0);
        delete qt;
    }
    colQLikelihood = 0;

    int count;
    is >> count;
    char wd[500];
    double pr;
    while (count-- > 0) {
        is >> wd >> pr;
        TERMID_T id = ind.term(wd);
        if (id != 0) setCount(id, pr); // don't load OOV terms
    }
    colQueryLikelihood();
    colKLComputed = false;
}
double lemur::retrieval::QueryModel::clarity() const {
    int count = 0;
    double sum = 0, ln_Pr = 0;
    startIteration();
    QueryTerm *qt;
    while (hasMore()) {
        qt = nextTerm();
        count++;
        // query-clarity = SUM_w { P(w|Q) * log(P(w|Q)/P(w)) }
        // P(w) = cf(w)/|C|
        double pw = ((double)ind.termCount(qt->id()) / (double)ind.termCount());
        // P(w|Q) is a prob computed by any model, e.g. relevance models
        double pwq = qt->weight();
        sum += pwq;
        ln_Pr += pwq * log(pwq / pw);
        delete qt;
    }
    // normalize by sum of probabilities in the input model
    ln_Pr = ln_Pr / (sum ? sum : 1.0);
    // clarity should be computed with log_2, so divide by log(2).
    return (ln_Pr / log(2.0));
}
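// The quantity computed above is the KL divergence between the query model and
// the collection model, in bits:
//   clarity = SUM_w P(w|Q) * log2( P(w|Q) / P(w) )
// A self-contained sketch over plain maps; clarity_bits is a hypothetical name,
// and the sketch assumes both maps hold proper probabilities and that every
// query term appears in the collection model.
#include <cmath>
#include <map>

double clarity_bits(const std::map<int, double>& query_model,
                    const std::map<int, double>& collection_model) {
    double kl = 0.0;
    for (const auto& [id, pwq] : query_model) {
        double pw = collection_model.at(id);  // P(w) = cf(w)/|C|
        kl += pwq * std::log2(pwq / pw);
    }
    return kl;
}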
QueryGroupView::QueryGroupView(QueryGroup *group, Widget parent)
    : f_query_group(group), f_term_view_list(NULL)
{
    f_restraint = WRestraint(WComposite(parent), "group_view", WAutoManage);
    f_form = WXmForm(f_restraint, "group_form");

    QueryTermView *view = NULL;
    QueryTerm *term;
    for (term = group->term_list(); term != NULL; term = term->next())
        view = new QueryTermView(term, this, view, NULL);

    f_form.Realize(); // This statement is extremely critical. 02/03/93 DJB
    f_form.Manage();

    // Need special event handler for top level group to catch
    // resizes of window and resize the query view.
    // NOTE: It's just about time to move this to another method.
    if (XtClass(XtParent(f_restraint)) == xmDrawingAreaWidgetClass) {
        // Let's see what the parent's size is now:
        ON_DEBUG(printf("Drawing area is width: %d, height: %d\n",
                        WCore(f_restraint.Parent()).Width(),
                        WCore(f_restraint.Parent()).Height()));

        // We need to grow the drawing area if it is smaller than the
        // restraint widget, since we don't deal with horizontal scrolling.
        Dimension restraint_width = f_restraint.Width();
        Dimension da_width = WCore(f_restraint.Parent()).Width();
        if (da_width < restraint_width) {
            // Can't resize the drawing area because the ^$&(*% Motif
            // pane widget will not allow the horizontal resize. So,
            // instead calculate the size increase and do it on the shell.
            Dimension increase = restraint_width - da_width;

            // Find the shell widget.
            Widget w = f_restraint.Parent();
            while (!XtIsShell(w))
                w = XtParent(w);
            WTopLevelShell shell(w);
            ON_DEBUG(printf("** Resizing shell by = %d\n", increase));

            // Change state if needed.
            Boolean allow_resize = shell.AllowShellResize();
            if (!allow_resize) shell.AllowShellResize(True);

            // Change the width.
            shell.MinWidth(shell.Width() + increase);
            shell.Width(shell.Width() + increase);

            // Restore state if needed.
            if (!allow_resize) shell.AllowShellResize(False);
        } else if (da_width > restraint_width) {
            f_restraint.Width(WCore(f_restraint.Parent()).Width());
        }

        XtAddEventHandler(XtParent(f_restraint), StructureNotifyMask, False,
                          (XtEventHandler)&QueryGroupView::resize,
                          (Widget)f_restraint);

        // Store the min width in UserData...
        f_restraint.UserData((XtPointer)(size_t)f_restraint.Width());
    } else {
        // Make sure the restraint widget isn't too narrow.
        f_restraint.Width(WCore(f_restraint.Parent()).Width());
    }
}