Ejemplo n.º 1
0
/*
 * Append a highlighted snippet of `words` to `w_snippet`.
 *
 * `words` is tokenized with r->prt.  A token counts as a hit when its
 * normalized form is non-empty and equals the norm_str of some entry in
 * r->entries.  A run of consecutive hits is enclosed in one
 * "<match>" ... "</match>" pair; the original (un-normalized) text of every
 * token, hit or not, is appended through wrbuf_xmlputs_n().
 *
 * r          relevance state supplying the tokenizer and the query terms
 * words      text to scan
 * name       not used by this function; kept so the signature is unchanged
 * w_snippet  buffer the snippet is appended to
 *
 * Returns the number of "<match>" runs that were opened (0 when no token
 * matched).
 */
int relevance_snippet(struct relevance *r,
                      const char *words, const char *name,
                      WRBUF w_snippet)
{
    int no = 0;
    const char *norm_str;
    int highlight = 0;

    pp2_charset_token_first(r->prt, words, 0);
    while ((norm_str = pp2_charset_token_next(r->prt)))
    {
        size_t org_start, org_len;
        struct word_entry *entries = 0;

        pp2_get_org(r->prt, &org_start, &org_len);

        /* An empty normalized token can never match, so skip the scan. */
        if (*norm_str)
        {
            for (entries = r->entries; entries; entries = entries->next)
            {
                if (!strcmp(norm_str, entries->norm_str))
                    break;
            }
        }

        if (entries)
        {
            /* First hit of a run: open the tag and count the run. */
            if (!highlight)
            {
                highlight = 1;
                wrbuf_puts(w_snippet, "<match>");
                no++;
            }
        }
        else
        {
            /* A miss ends the current run, if any. */
            if (highlight)
            {
                highlight = 0;
                wrbuf_puts(w_snippet, "</match>");
            }
        }
        wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
    }
    /* Close a run that extends to the end of the text. */
    if (highlight)
        wrbuf_puts(w_snippet, "</match>");
    if (no)
    {
        yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
    }
    return no;
}
Ejemplo n.º 2
0
/*
 * Walk a CCL RPN tree and append one word_entry to res->entries for every
 * normalized token found in its term nodes.
 *
 * For AND / OR / NOT / PROX nodes both children are visited, left first.
 * For a TERM node the term string is split on spaces; each resulting word is
 * run through the res->prt tokenizer and every token it yields becomes a new
 * entry at the end of the list with:
 *   norm_str     copy of the normalized token
 *   display_str  copy of the space-separated word the token came from
 *   ccl_field    copy of the node's qualifier (via nmem_strdup_null), shared
 *                by all entries created from this node
 *   termno       the current res->vec_len, which is then incremented
 * Any other node kind is ignored.
 *
 * All memory is taken from res->nmem.
 */
static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
{
    char **words;
    int numwords;
    char *ccl_field;
    struct word_entry **tail;
    int i;

    switch (n->kind)
    {
    case CCL_RPN_AND:
    case CCL_RPN_OR:
    case CCL_RPN_NOT:
    case CCL_RPN_PROX:
        pull_terms(res, n->u.p[0]);
        pull_terms(res, n->u.p[1]);
        break;
    case CCL_RPN_TERM:
        nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);

        /* The qualifier is the same for every word of this term. */
        ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);

        /* Locate the end of the list once; nothing else appends to it
           while this node is being processed. */
        tail = &res->entries;
        while (*tail)
            tail = &(*tail)->next;

        for (i = 0; i < numwords; i++)
        {
            const char *norm_str;

            pp2_charset_token_first(res->prt, words[i], 0);
            while ((norm_str = pp2_charset_token_next(res->prt)))
            {
                struct word_entry *e = nmem_malloc(res->nmem, sizeof(*e));

                e->norm_str = nmem_strdup(res->nmem, norm_str);
                e->ccl_field = ccl_field;
                e->termno = res->vec_len++;
                e->display_str = nmem_strdup(res->nmem, words[i]);
                e->next = 0;

                *tail = e;
                tail = &e->next;
            }
        }
        break;
    default:
        break;
    }
}
Ejemplo n.º 3
0
/*
 * Score one field value (`words`) of a record against the query terms and
 * add the result to the cluster's term-frequency vectors.
 *
 * The text is tokenized with r->prt.  For each token that
 * word_entry_match() resolves to a query term, the weight it reports is
 * divided by (1 + log2(1 + lead_decay * position)) and added to a scratch
 * slot for that term.  If r->term_pos shows that the preceding term number
 * was matched earlier in this text, the slot is further increased by
 * slot * follow_factor / (1 + log2(distance)).
 *
 * After the scan every non-zero scratch slot is added to
 * cluster->term_frequency_vec[] as-is and to cluster->term_frequency_vecf[]
 * scaled according to r->length_divide (0: unscaled, 1: / log2(1+length),
 * 2: / length; any other value adds nothing to the float vector).  The token
 * count is added to cluster->term_frequency_vec[0].
 *
 * A textual trace of each step is appended to cluster->relevance_explain1.
 *
 * r        relevance state (tokenizer, scratch vectors, tuning factors)
 * cluster  record cluster whose vectors and explain buffer are updated
 * words    field content to scan
 * rank     rank specification passed to word_entry_match(); must be non-NULL
 * name     field name, used only in the trace
 */
void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
                          const char *words, const char *rank,
                          const char *name)
{
    int *weights = r->term_frequency_vec_tmp;
    WRBUF explain = cluster->relevance_explain1;
    double lead_decay = r->lead_decay;
    struct word_entry *entry;
    const char *token;
    int header_done = 0;
    int n_tokens = 0;
    int t;

    pp2_charset_token_first(r->prt, words, 0);

    /* Clear the per-call scratch slots (slot 0 is not a term).  The entry
       list is stepped alongside the index, as in the summary loop below. */
    entry = r->entries;
    for (t = 1; t < r->vec_len; t++)
    {
        weights[t] = 0;
        r->term_pos[t] = 0;
        entry = entry->next;
    }

    assert(rank);
    while ((token = pp2_charset_token_next(r->prt)))
    {
        int field_weight = 0;
        int termno;
        int prev;
        int k;

        entry = word_entry_match(r, token, rank, &field_weight);
        if (!entry)
        {
            /* Not a query term: it only contributes to the length. */
            n_tokens++;
            continue;
        }
        termno = entry->termno;

        /* Emit the field header once, on the first hit. */
        if (!header_done)
        {
            header_done = 1;
            wrbuf_printf(explain, "field=%s content=", name);
            if (strlen(words) <= 50)
                wrbuf_xmlputs(explain, words);
            else
            {
                wrbuf_xmlputs_n(explain, words, 49);
                wrbuf_puts(explain, " ...");
            }
            wrbuf_puts(explain, ";\n");
        }

        assert(termno < r->vec_len);
        weights[termno] +=
            field_weight / (1 + log2(1 + lead_decay * n_tokens));
        wrbuf_printf(explain, "%s: w[%d] += w(%d) / "
                     "(1+log2(1+lead_decay(%f) * length(%d)));\n",
                     entry->display_str, termno, field_weight, lead_decay,
                     n_tokens);

        /* Bonus when the preceding term number has a recorded position.
           The trace line is written before the slot is modified so that it
           shows the pre-bonus value. */
        prev = termno - 1;
        if (prev > 0 && r->term_pos[prev])
        {
            int dist = n_tokens + 1 - r->term_pos[prev];

            wrbuf_printf(explain, "%s: w[%d] += w[%d](%d) * follow(%f) / "
                         "(1+log2(d(%d));\n",
                         entry->display_str, termno, termno, weights[termno],
                         r->follow_factor, dist);
            weights[termno] +=
                weights[termno] * r->follow_factor / (1 + log2(dist));
        }

        /* Record this hit: slots below the matched term are cleared, the
           matched term and all later ones get the 1-based position. */
        for (k = 0; k < r->vec_len; k++)
        {
            if (k < termno)
                r->term_pos[k] = 0;
            else
                r->term_pos[k] = n_tokens + 1;
        }

        n_tokens++;
    }

    /* Fold the scratch weights into the cluster totals. */
    for (entry = r->entries, t = 1; t < r->vec_len; t++, entry = entry->next)
    {
        if (n_tokens != 0 && weights[t] != 0)
        {
            wrbuf_printf(explain, "%s: tf[%d] += w[%d](%d)",
                         entry->display_str, t, t, weights[t]);

            if (r->length_divide == 0)
            {
                cluster->term_frequency_vecf[t] += (double) weights[t];
            }
            else if (r->length_divide == 1)
            {
                wrbuf_printf(explain, " / log2(1+length(%d))", n_tokens);
                cluster->term_frequency_vecf[t] +=
                    (double) weights[t] / log2(1 + n_tokens);
            }
            else if (r->length_divide == 2)
            {
                wrbuf_printf(explain, " / length(%d)", n_tokens);
                cluster->term_frequency_vecf[t] +=
                    (double) weights[t] / n_tokens;
            }

            cluster->term_frequency_vec[t] += weights[t];
            wrbuf_printf(explain, " (%f);\n",
                         cluster->term_frequency_vecf[t]);
        }
    }

    cluster->term_frequency_vec[0] += n_tokens;
}