/*
 * Copy `words` into w_snippet, wrapping runs of query-term hits in
 * <match>...</match>.
 *
 * The text is tokenized with r->prt.  For every token the original
 * (un-normalized) span reported by pp2_get_org() is appended XML-escaped via
 * wrbuf_xmlputs_n().  A token counts as a hit when its normalized form is
 * non-empty and equals the norm_str of some entry in r->entries.  Consecutive
 * hits share a single <match> element; the element is closed before the first
 * following non-hit token, or at the end of the input.
 *
 * Parameters:
 *   r         - relevance state supplying the tokenizer (prt) and term list
 *   words     - text to scan
 *   name      - not used by this function
 *   w_snippet - buffer that receives the marked-up text
 *
 * Returns the number of <match> elements opened (runs of hits, not individual
 * matching tokens).  When non-zero, the buffer content is logged at
 * YLOG_DEBUG.
 */
int relevance_snippet(struct relevance *r,
                      const char *words, const char *name,
                      WRBUF w_snippet)
{
    int no = 0;
    int highlight = 0;
    const char *norm_str;

    pp2_charset_token_first(r->prt, words, 0);
    while ((norm_str = pp2_charset_token_next(r->prt)))
    {
        size_t org_start, org_len;
        struct word_entry *hit = NULL;

        pp2_get_org(r->prt, &org_start, &org_len);

        /* An empty normalized token can never match, so skip the scan. */
        if (*norm_str)
        {
            for (hit = r->entries; hit; hit = hit->next)
                if (!strcmp(norm_str, hit->norm_str))
                    break;
        }

        if (hit && !highlight)
        {
            /* First hit of a new run: open the element and count the run. */
            highlight = 1;
            wrbuf_puts(w_snippet, "<match>");
            no++;
        }
        else if (!hit && highlight)
        {
            /* Run ended on the previous token. */
            highlight = 0;
            wrbuf_puts(w_snippet, "</match>");
        }
        wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
    }
    /* Input ended while still inside a run. */
    if (highlight)
        wrbuf_puts(w_snippet, "</match>");
    if (no)
        yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
    return no;
}
static void pull_terms(struct relevance *res, struct ccl_rpn_node *n) { char **words; int numwords; char *ccl_field; int i; switch (n->kind) { case CCL_RPN_AND: case CCL_RPN_OR: case CCL_RPN_NOT: case CCL_RPN_PROX: pull_terms(res, n->u.p[0]); pull_terms(res, n->u.p[1]); break; case CCL_RPN_TERM: nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords); for (i = 0; i < numwords; i++) { const char *norm_str; ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual); pp2_charset_token_first(res->prt, words[i], 0); while ((norm_str = pp2_charset_token_next(res->prt))) { struct word_entry **e = &res->entries; while (*e) e = &(*e)->next; *e = nmem_malloc(res->nmem, sizeof(**e)); (*e)->norm_str = nmem_strdup(res->nmem, norm_str); (*e)->ccl_field = ccl_field; (*e)->termno = res->vec_len++; (*e)->display_str = nmem_strdup(res->nmem, words[i]); (*e)->next = 0; } } break; default: break; } }
void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, const char *rank, const char *name) { int *w = r->term_frequency_vec_tmp; const char *norm_str; int i, length = 0; double lead_decay = r->lead_decay; struct word_entry *e; WRBUF wr = cluster->relevance_explain1; int printed_about_field = 0; pp2_charset_token_first(r->prt, words, 0); for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next) { w[i] = 0; r->term_pos[i] = 0; } assert(rank); while ((norm_str = pp2_charset_token_next(r->prt))) { int local_weight = 0; e = word_entry_match(r, norm_str, rank, &local_weight); if (e) { int res = e->termno; int j; if (!printed_about_field) { printed_about_field = 1; wrbuf_printf(wr, "field=%s content=", name); if (strlen(words) > 50) { wrbuf_xmlputs_n(wr, words, 49); wrbuf_puts(wr, " ..."); } else wrbuf_xmlputs(wr, words); wrbuf_puts(wr, ";\n"); } assert(res < r->vec_len); w[res] += local_weight / (1 + log2(1 + lead_decay * length)); wrbuf_printf(wr, "%s: w[%d] += w(%d) / " "(1+log2(1+lead_decay(%f) * length(%d)));\n", e->display_str, res, local_weight, lead_decay, length); j = res - 1; if (j > 0 && r->term_pos[j]) { int d = length + 1 - r->term_pos[j]; wrbuf_printf(wr, "%s: w[%d] += w[%d](%d) * follow(%f) / " "(1+log2(d(%d));\n", e->display_str, res, res, w[res], r->follow_factor, d); w[res] += w[res] * r->follow_factor / (1 + log2(d)); } for (j = 0; j < r->vec_len; j++) r->term_pos[j] = j < res ? 
0 : length + 1; } length++; } for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next) { if (length == 0 || w[i] == 0) continue; wrbuf_printf(wr, "%s: tf[%d] += w[%d](%d)", e->display_str, i, i, w[i]); switch (r->length_divide) { case 0: cluster->term_frequency_vecf[i] += (double) w[i]; break; case 1: wrbuf_printf(wr, " / log2(1+length(%d))", length); cluster->term_frequency_vecf[i] += (double) w[i] / log2(1 + length); break; case 2: wrbuf_printf(wr, " / length(%d)", length); cluster->term_frequency_vecf[i] += (double) w[i] / length; } cluster->term_frequency_vec[i] += w[i]; wrbuf_printf(wr, " (%f);\n", cluster->term_frequency_vecf[i]); } cluster->term_frequency_vec[0] += length; }