Beispiel #1
0
/*
 * Release a pp2_charset_t handle.
 *
 * When built with ICU support (YAZ_HAVE_ICU) the ICU chain held in
 * pct->icu_chn is destroyed first, then the handle itself is freed.
 *
 * pct may be NULL, in which case the call is a no-op. Without this guard
 * the ICU build would dereference a NULL handle to reach icu_chn.
 */
void pp2_charset_destroy(pp2_charset_t pct)
{
    if (!pct)
        return;
#if YAZ_HAVE_ICU
    icu_chain_destroy(pct->icu_chn);
#endif
    xfree(pct);
}
Beispiel #2
0
/*
 * Walk through all tokens currently pending in the chain and log the token
 * number, normalized form and display form of each one.
 */
static void check_icu_chain_log_tokens(struct icu_chain *chain,
                                       UErrorCode *status)
{
    while (icu_chain_next_token(chain, status))
        yaz_log(YLOG_LOG, "%d '%s' '%s'",
                icu_chain_token_number(chain),
                icu_chain_token_norm(chain),
                icu_chain_token_display(chain));
}

/*
 * Build an ICU chain from an inline XML configuration (control-character
 * removal, tokenize rule "l", whitespace/punctuation removal, display and
 * lower-casing), then feed it two strings and verify the resulting token
 * counts (7 and 3).
 */
static void check_icu_chain(void)
{
    const char *xml_str = "<icu locale=\"en\">"
        "<transform rule=\"[:Control:] Any-Remove\"/>"
        "<tokenize rule=\"l\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "</icu>";
    const char *en_str
        = "O Romeo, Romeo! wherefore art thou\t Romeo?";
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlDoc *doc;
    xmlNode *xml_node;

    doc = xmlParseMemory(xml_str, strlen(xml_str));
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    /* the chain does not reference the document afterwards, so the
       document is released right after configuration */
    chain = icu_chain_xml_config(xml_node, 0, &status);
    xmlFreeDoc(doc);

    YAZ_CHECK(chain);
    if (!chain)
        return;

    /* first input */
    YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
    check_icu_chain_log_tokens(chain, &status);
    YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);

    /* second input, re-using the same chain */
    YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
    check_icu_chain_log_tokens(chain, &status);
    YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);

    icu_chain_destroy(chain);
}
Beispiel #3
0
/*
 * Configure a chain that tokenizes (rules "l" and "w"), strips whitespace
 * and punctuation, lower-cases and joins the tokens with an empty
 * separator, then verify the joined result for an English and a Greek
 * input via test_iter().
 *
 * The parsed XML document is released on every exit path, including the
 * early return taken when the document has no root element.
 */
static void check_icu_iter4(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str = "<icu locale=\"en\">"
        "<transform rule=\"[:Control:] Any-Remove\"/>"
        "<tokenize rule=\"l\"/>"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "<join rule=\"\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* do not leak the document when it has no root element */
        xmlFreeDoc(doc);
        return;
    }

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νόταρηςγιάννηςσωτ]"));

    /* threaded variant currently disabled: check_iter_threads(chain); */

    icu_chain_destroy(chain);
}
Beispiel #4
0
/*
 * Verify that a chain configured without any steps passes its input
 * through unchanged: exactly one token is produced and its normalized form
 * equals the source string byte for byte.
 *
 * The test stops as soon as the chain cannot be created, so a failed
 * configuration is reported by YAZ_CHECK instead of crashing on a NULL
 * chain in the calls that follow.
 */
static void check_chain_empty_chain(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "</icu>";

    const char *src8 = "some 5487 weired !¤%&(/& sTuFf";
    const char *dest8 = 0;

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  src8,
                  &status));

    /* drain all tokens; only the final token number is of interest */
    while (icu_chain_next_token(chain, &status))
    {
        ;
    }

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 1);

    /* guard the comparison: strcmp on a NULL pointer is undefined */
    dest8 = icu_chain_token_norm(chain);
    YAZ_CHECK(dest8);
    if (dest8)
        YAZ_CHECK_EQ(strcmp(src8, dest8), 0);

    icu_chain_destroy(chain);
}
Beispiel #5
0
/*
 * Configure a sort chain for locale "el" that removes control characters,
 * whitespace, punctuation and non-spacing marks and lower-cases the text,
 * then verify the result for an English and a Greek input via test_iter().
 * The expected Greek output carries no accents because of the
 * "NFD; [:Nonspacing Mark:] Remove; NFC" transform.
 *
 * The parsed XML document is released on every exit path, including the
 * early return taken when the document has no root element.
 */
static void check_icu_iter3(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str =
        "<icu_chain id=\"sort\" locale=\"el\">\n"
        "<transform rule=\"[:Control:] Any-Remove\"/>\n"
        "<transform rule=\"[[:Control:][:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "<transform rule=\"NFD; [:Nonspacing Mark:] Remove; NFC\"/>\n"
        "<casemap rule=\"l\"/>\n"
        "<display/>\n"
        "</icu_chain>\n";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* do not leak the document when it has no root element */
        xmlFreeDoc(doc);
        return;
    }

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νοταρηςγιαννηςσωτ]"));

    icu_chain_destroy(chain);
}
Beispiel #6
0
/*
 * Create an iterator over a word-tokenizing chain, run it across a sample
 * sentence and log the normalized form of every token.
 *
 * Resource handling: the XML document is freed on every exit path, the
 * iterator is checked for NULL before it is first used, and the chain is
 * destroyed even when the iterator could not be created.
 */
static void check_icu_iter1(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;
    yaz_icu_iter_t iter;

    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* do not leak the document when it has no root element */
        xmlFreeDoc(doc);
        return;
    }

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    iter = icu_iter_create(chain);
    YAZ_CHECK(iter);
    if (!iter)
    {
        /* the chain is still owned by this function */
        icu_chain_destroy(chain);
        return;
    }

    /* only touch the iterator once it is known to be valid */
    icu_iter_first(iter, "a string with 15 tokens and 8 displays");
    while (icu_iter_next(iter))
    {
        yaz_log(YLOG_LOG, "[%s]", icu_iter_get_norm(iter));
    }
    icu_iter_destroy(iter);
    icu_chain_destroy(chain);
}
Beispiel #7
0
/*
 * Tokenize a sample sentence with a word-tokenizing chain that also strips
 * whitespace and punctuation, and verify that the final token number
 * reported by the chain is 15.
 *
 * The test stops as soon as the chain cannot be created, so a failed
 * configuration is reported by YAZ_CHECK instead of crashing on a NULL
 * chain in the calls that follow.
 */
static void check_chain_empty_token(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  "a string with 15 tokenss and 8 displays",
                  &status));

    /* drain all tokens; only the final token number is of interest */
    while (icu_chain_next_token(chain, &status))
    {
        ;
    }

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 15);

    icu_chain_destroy(chain);
}
Beispiel #8
0
/*
 * Read p_config->infile line by line, run every line through the ICU chain
 * described by the XML file p_config->conffile and write one record per
 * token to p_config->outfile.
 *
 * Output format:
 *  - p_config->xmloutput set: an XML document with one <token> element per
 *    token (attributes id, line, norm, display and, when
 *    p_config->sortoutput is set, sortkey);
 *  - otherwise: one text line per token with token number, line number,
 *    normalized and display form, optionally followed by the sort key
 *    (p_config->sortoutput) and the original-text position
 *    (p_config->org_output).
 *
 * The function terminates the process with exit(1) when the configuration
 * file cannot be parsed or the chain cannot be set up.
 * The chain is stored in p_config->chain and destroyed before returning.
 */
static void process_text_file(struct config_t *p_config)
{
    char linebuf[1024];

    xmlDoc *doc = xmlParseFile(p_config->conffile);
    xmlNode *xml_node = xmlDocGetRootElement(doc);

    long unsigned int token_count = 0;
    long unsigned int line_count = 0;

    UErrorCode status = U_ZERO_ERROR;

    /* scratch buffers, rewound before every use and therefore shared by
       all lines instead of being reallocated per line */
    WRBUF sw;
    WRBUF cdata;

    if (!xml_node)
    {
        printf("Could not parse XML config file '%s' \n",
                p_config->conffile);
        exit(1);
    }

    p_config->chain = icu_chain_xml_config(xml_node, 1, &status);

    if (!p_config->chain || !U_SUCCESS(status))
    {
        printf("Could not set up ICU chain from config file '%s' \n",
                p_config->conffile);
        exit(1);
    }

    if (p_config->xmloutput)
        fprintf(p_config->outfile,
                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                "<icu>\n"
                "<tokens>\n");

    sw = wrbuf_alloc();
    cdata = wrbuf_alloc();

    /* read input lines for processing */
    while (fgets(linebuf, sizeof(linebuf)-1, p_config->infile))
    {
        int success = icu_chain_assign_cstr(p_config->chain, linebuf,
                                            &status);
        line_count++;

        while (success && icu_chain_next_token(p_config->chain, &status))
        {
            if (U_FAILURE(status))
                success = 0;
            else
            {
                size_t start, len;
                const char *org_string = 0;
                const char *sortkey = icu_chain_token_sortkey(p_config->chain);

                icu_chain_get_org_info2(p_config->chain, &start, &len,
                                        &org_string);
                wrbuf_rewind(sw);
                wrbuf_puts_escaped(sw, sortkey);
                token_count++;
                if (p_config->xmloutput)
                {
                    fprintf(p_config->outfile,
                            "<token id=\"%lu\" line=\"%lu\"",
                            token_count, line_count);

                    wrbuf_rewind(cdata);
                    wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
                    fprintf(p_config->outfile, " norm=\"%s\"",
                            wrbuf_cstr(cdata));

                    wrbuf_rewind(cdata);
                    wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
                    fprintf(p_config->outfile, " display=\"%s\"",
                            wrbuf_cstr(cdata));

                    if (p_config->sortoutput)
                    {
                        wrbuf_rewind(cdata);
                        wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
                        fprintf(p_config->outfile, " sortkey=\"%s\"",
                                wrbuf_cstr(cdata));
                    }
                    fprintf(p_config->outfile, "/>\n");
                }
                else
                {
                    fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
                            token_count,
                            line_count,
                            icu_chain_token_norm(p_config->chain),
                            icu_chain_token_display(p_config->chain));
                    if (p_config->sortoutput)
                    {
                        fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
                    }
                    if (p_config->org_output)
                    {
                        /* print the original text with the token's span
                           (start, len) bracketed by '*' characters */
                        fprintf(p_config->outfile, " %ld+%ld",
                                (long) start, (long) len);
                        fputc(' ', p_config->outfile);
                        fwrite(org_string, 1, start, p_config->outfile);
                        fputc('*', p_config->outfile);
                        fwrite(org_string + start, 1, len, p_config->outfile);
                        fputc('*', p_config->outfile);
                        fputs(org_string + start + len, p_config->outfile);
                    }
                    fprintf(p_config->outfile, "\n");
                }
            }
        }
    }

    wrbuf_destroy(sw);
    wrbuf_destroy(cdata);

    if (p_config->xmloutput)
        fprintf(p_config->outfile,
                "</tokens>\n"
                "</icu>\n");

    icu_chain_destroy(p_config->chain);
    xmlFreeDoc(doc);
    /* linebuf lives on the stack: there is nothing to free for the line */
}
Beispiel #9
0
/*
 * Build an ICU chain from an XML configuration element.
 *
 * xml_node  element carrying a "locale" attribute; each child element
 *           (casemap, transform, transliterate, tokenize, display,
 *           stemming, join, or the deprecated normalize) adds one step.
 *           A step's rule is taken from its "rule" attribute or, failing
 *           that, from the text content of the element.
 * sort      forwarded unchanged to icu_chain_create().
 * status    reset to U_ZERO_ERROR on entry; receives the ICU status of
 *           chain creation and step insertion.
 *
 * Returns the new chain, or 0 when xml_node is missing or not an element,
 * has no "locale" attribute, the chain cannot be created, or any child
 * element was reported as an error (in which case the partially built
 * chain is destroyed).
 */
struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
                                       int sort,
                                       UErrorCode *status)
{
    struct icu_chain *chain;
    const char *xml_locale;
    xmlNode *child;
    NMEM nmem;
    int error_count = 0;

    *status = U_ZERO_ERROR;

    if (!xml_node || xml_node->type != XML_ELEMENT_NODE)
        return 0;
    xml_locale = yaz_xml_get_prop((xmlNode *) xml_node, "locale");
    if (!xml_locale)
        return 0;
    chain = icu_chain_create(xml_locale, sort, status);
    if (!chain)
        return 0;

    nmem = nmem_create();
    for (child = xml_node->children; child; child = child->next)
    {
        const char *name;
        const char *unsupported;
        char *rule = 0;
        struct icu_chain_step *step = 0;

        /* text obtained for the previous element is no longer needed */
        nmem_reset(nmem);
        if (child->type != XML_ELEMENT_NODE)
            continue;
        name = (const char *) child->name;

        unsupported = yaz_xml_get_prop(child, "rule%s", &rule);
        if (unsupported)
        {
            yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
                    "element '%s'", unsupported, name);
            error_count++;
        }

        /* fall back to the element's text content as the rule */
        if (!rule && child->children)
            rule = nmem_text_node_cdata(child->children, nmem);

        /* only <display> may come without a rule */
        if (!rule && strcmp(name, "display") != 0)
        {
            yaz_log(YLOG_WARN, "Missing attribute 'rule' for element %s",
                    name);
            error_count++;
            continue;
        }

        if (strcmp(name, "casemap") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_casemap,
                                         rule, status);
        }
        else if (strcmp(name, "transform") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_transform,
                                         rule, status);
        }
        else if (strcmp(name, "transliterate") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_transliterate,
                                         rule, status);
        }
        else if (strcmp(name, "tokenize") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_tokenize,
                                         rule, status);
        }
        else if (strcmp(name, "display") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_display,
                                         rule, status);
        }
        else if (strcmp(name, "stemming") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         YAZ_chain_step_type_stemming,
                                         rule, status);
        }
        else if (strcmp(name, "join") == 0)
        {
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_join,
                                         rule, status);
        }
        else if (strcmp(name, "normalize") == 0)
        {
            /* deprecated alias of <transform> */
            yaz_log(YLOG_WARN, "Element %s is deprecated. "
                    "Use transform instead", name);
            step = icu_chain_insert_step(chain,
                                         ICU_chain_step_type_transform,
                                         rule, status);
        }
        else if (strcmp(name, "index") == 0
                 || strcmp(name, "sortkey") == 0)
        {
            /* NOTE(review): no step is inserted here, so these elements
               also reach the "Step not created" branch below and are
               counted as an error - confirm that this is intended */
            yaz_log(YLOG_WARN, "Element %s is no longer needed. "
                    "Remove it from the configuration", name);
        }
        else
        {
            yaz_log(YLOG_WARN, "Unknown element %s", name);
            error_count++;
            continue;
        }

        if (!step)
        {
            yaz_log(YLOG_WARN, "Step not created for %s", name);
            error_count++;
        }
        else if (U_FAILURE(*status))
        {
            /* an ICU failure ends configuration processing */
            yaz_log(YLOG_WARN, "ICU Error %d %s for element %s, rule %s",
                    *status, u_errorName(*status), name, rule ?
                    rule : "");
            error_count++;
            break;
        }
    }
    nmem_destroy(nmem);

    if (!error_count)
        return chain;

    icu_chain_destroy(chain);
    return 0;
}
Beispiel #10
0
/*
 * Iterate over two short inputs that contain a character outside the Basic
 * Multilingual Plane and verify, for every token, that a normalized string
 * is available and that the reported original start offset and length are
 * within a sane range (<= 1000).
 *
 * The XML document is freed on every exit path, and the test stops early
 * (after reporting the failure through YAZ_CHECK) when the chain cannot be
 * configured rather than handing a NULL chain to icu_iter_create().
 */
static void check_norm(void)
{
    static const char *const inputs[] = { " y😄", "\n y😄" };
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;
    yaz_icu_iter_t it;
    size_t i;

    const char *xml_str =
        "  <icu_chain id=\"relevance\" locale=\"en\">"
        "    <transform rule=\"[:Control:] Any-Remove\"/>"
        "    <tokenize rule=\"l\"/>"
        "    <transform rule=\"[[:WhiteSpace:][:Punctuation:]`] Remove\"/>"
        "    <casemap rule=\"l\"/>"
        "  </icu_chain>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* do not leak the document when it has no root element */
        xmlFreeDoc(doc);
        return;
    }
    chain = icu_chain_xml_config(xml_node, 1, &status);
    YAZ_CHECK(chain);
    if (!chain)
    {
        xmlFreeDoc(doc);
        return;
    }

    it = icu_iter_create(chain);
    if (it)
    {
        for (i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
        {
            icu_iter_first(it, inputs[i]);
            while (icu_iter_next(it))
            {
                const char *norm_str = icu_iter_get_norm(it);
                size_t start, len;

                YAZ_CHECK(norm_str);
                if (norm_str)
                    yaz_log(YLOG_LOG, "norm_str len=%ld=%s",
                            (long) strlen(norm_str), norm_str);
                icu_iter_get_org_info(it, &start, &len);
                YAZ_CHECK(start <= 1000);
                YAZ_CHECK(len <= 1000);
            }
        }
    }
    icu_iter_destroy(it);
    icu_chain_destroy(chain);
    xmlFreeDoc(doc);
}
Beispiel #11
0
/*
 * Regression test for bug 1140: a chain whose first step is a tokenizer.
 * The configuration deliberately has no leading
 * <transform rule="[:Control:] Any-Remove"/> step; according to the
 * original author's note the chain works when such a rule comes first.
 *
 * Two strings are tokenized with the same chain and the final token
 * numbers (7 and 3) are verified.
 */
static void check_bug_1140(void)
{
    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"l\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "</icu>";
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlDoc *doc;
    xmlNode *xml_node;

    doc = xmlParseMemory(xml_str, strlen(xml_str));
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);
    xmlFreeDoc(doc);

    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  "O Romeo, Romeo! wherefore art thou\t Romeo?",
                  &status));

    /* consume all tokens; the individual tokens are not inspected */
    while (icu_chain_next_token(chain, &status))
        continue;

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);

    YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));

    while (icu_chain_next_token(chain, &status))
        continue;

    /* we expect 'what' 'is' 'this', i.e. 3 tokens */
    YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);

    icu_chain_destroy(chain);
}