Exemple #1
0
pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
{
#if YAZ_HAVE_ICU
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    while (xml_node && xml_node->type != XML_ELEMENT_NODE)
        xml_node = xml_node->next;
    chain = icu_chain_xml_config(xml_node, 1, &status);
    if (!chain || U_FAILURE(status)){
        //xmlDocPtr icu_doc = 0;
        //xmlChar *xmlstr = 0;
                //int size = 0;
                //xmlDocDumpMemory(icu_doc, size);
        
        yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n"
                "<%s>\n ... \n</%s>",
                xml_node->name, xml_node->name);
        return 0;
    }
    return pp2_charset_create(chain);
#else // YAZ_HAVE_ICU
    yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n"
            "<%s>\n ... \n</%s>",
            xml_node->name, xml_node->name);
    yaz_log(YLOG_FATAL, 
            "But no ICU support is compiled into the YAZ library.");
    return 0;
#endif // YAZ_HAVE_ICU
}
Exemple #2
0
static void check_icu_chain(void)
{
    const char *en_str
        = "O Romeo, Romeo! wherefore art thou\t Romeo?";

    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "<transform rule=\"[:Control:] Any-Remove\"/>"
        "<tokenize rule=\"l\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "</icu>";


    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));

    while (icu_chain_next_token(chain, &status))
    {
        yaz_log(YLOG_LOG, "%d '%s' '%s'",
                icu_chain_token_number(chain),
                icu_chain_token_norm(chain),
                icu_chain_token_display(chain));
    }

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);


    YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));

    while (icu_chain_next_token(chain, &status))
    {
        yaz_log(YLOG_LOG, "%d '%s' '%s'",
                icu_chain_token_number(chain),
                icu_chain_token_norm(chain),
                icu_chain_token_display(chain));
    }


    YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);

    icu_chain_destroy(chain);
}
Exemple #3
0
static void check_icu_iter4(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str = "<icu locale=\"en\">"
        "<transform rule=\"[:Control:] Any-Remove\"/>"
        "<tokenize rule=\"l\"/>"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "<join rule=\"\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
        return ;

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νόταρηςγιάννηςσωτ]"));

    // check_iter_threads(chain);

    icu_chain_destroy(chain);
}
Exemple #4
0
static void check_chain_empty_chain(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "</icu>";

    const char *src8 = "some 5487 weired !¤%&(/& sTuFf";
    char *dest8 = 0;

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  src8,
                  &status));

    while (icu_chain_next_token(chain, &status))
    {
        ;
        /* printf("%d '%s' '%s'\n",
           icu_chain_token_number(chain),
           icu_chain_token_norm(chain),
           icu_chain_token_display(chain)); */
    }

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 1);

    dest8 = (char *) icu_chain_token_norm(chain);
    YAZ_CHECK_EQ(strcmp(src8, dest8), 0);

    icu_chain_destroy(chain);
}
Exemple #5
0
static void check_icu_iter3(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str =
        "<icu_chain id=\"sort\" locale=\"el\">\n"
        "<transform rule=\"[:Control:] Any-Remove\"/>\n"
        "<transform rule=\"[[:Control:][:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "<transform rule=\"NFD; [:Nonspacing Mark:] Remove; NFC\"/>\n"
        "<casemap rule=\"l\"/>\n"
        "<display/>\n"
        "</icu_chain>\n";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
        return ;

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νοταρηςγιαννηςσωτ]"));

    icu_chain_destroy(chain);
}
Exemple #6
0
static void check_icu_iter1(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;
    yaz_icu_iter_t iter;

    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
        return ;

    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);

    iter = icu_iter_create(chain);
    icu_iter_first(iter, "a string with 15 tokens and 8 displays");
    YAZ_CHECK(iter);
    if (!iter)
        return;
    while (icu_iter_next(iter))
    {
        yaz_log(YLOG_LOG, "[%s]", icu_iter_get_norm(iter));
    }
    icu_iter_destroy(iter);
    icu_chain_destroy(chain);
}
Exemple #7
0
static void check_chain_empty_token(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  "a string with 15 tokenss and 8 displays",
                  &status));

    while (icu_chain_next_token(chain, &status))
    {
        ;
        /* printf("%d '%s' '%s'\n",
           icu_chain_token_number(chain),
           icu_chain_token_norm(chain),
           icu_chain_token_display(chain)); */
    }

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 15);

    icu_chain_destroy(chain);
}
Exemple #8
0
static void process_text_file(struct config_t *p_config)
{
    char *line = 0;
    char linebuf[1024];

    xmlDoc *doc = xmlParseFile(p_config->conffile);
    xmlNode *xml_node = xmlDocGetRootElement(doc);

    long unsigned int token_count = 0;
    long unsigned int line_count = 0;

    UErrorCode status = U_ZERO_ERROR;

    if (!xml_node)
    {
        printf("Could not parse XML config file '%s' \n",
                p_config->conffile);
        exit(1);
    }

    p_config->chain = icu_chain_xml_config(xml_node, 1, &status);

    if (!p_config->chain || !U_SUCCESS(status))
    {
        printf("Could not set up ICU chain from config file '%s' \n",
                p_config->conffile);
        exit(1);
    }

    if (p_config->xmloutput)
        fprintf(p_config->outfile,
                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                "<icu>\n"
                "<tokens>\n");

    /* read input lines for processing */
    while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
    {
        WRBUF sw = wrbuf_alloc();
        WRBUF cdata = wrbuf_alloc();
        int success = icu_chain_assign_cstr(p_config->chain, line, &status);
        line_count++;

        while (success && icu_chain_next_token(p_config->chain, &status))
        {
            if (U_FAILURE(status))
                success = 0;
            else
            {
                size_t start, len;
                const char *org_string = 0;
                const char *sortkey = icu_chain_token_sortkey(p_config->chain);

                icu_chain_get_org_info2(p_config->chain, &start, &len,
                                        &org_string);
                wrbuf_rewind(sw);
                wrbuf_puts_escaped(sw, sortkey);
                token_count++;
                if (p_config->xmloutput)
                {
                    fprintf(p_config->outfile,
                            "<token id=\"%lu\" line=\"%lu\"",
                            token_count, line_count);

                    wrbuf_rewind(cdata);
                    wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
                    fprintf(p_config->outfile, " norm=\"%s\"",
                            wrbuf_cstr(cdata));

                    wrbuf_rewind(cdata);
                    wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
                    fprintf(p_config->outfile, " display=\"%s\"",
                            wrbuf_cstr(cdata));

                    if (p_config->sortoutput)
                    {
                        wrbuf_rewind(cdata);
                        wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
                        fprintf(p_config->outfile, " sortkey=\"%s\"",
                                wrbuf_cstr(cdata));
                    }
                    fprintf(p_config->outfile, "/>\n");
                }
                else
                {
                    fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
                            token_count,
                            line_count,
                            icu_chain_token_norm(p_config->chain),
                            icu_chain_token_display(p_config->chain));
                    if (p_config->sortoutput)
                    {
                        fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
                    }
                    if (p_config->org_output)
                    {
                        fprintf(p_config->outfile, " %ld+%ld",
                                (long) start, (long) len);
                        fputc(' ', p_config->outfile);
                        fwrite(org_string, 1, start, p_config->outfile);
                        fputc('*', p_config->outfile);
                        fwrite(org_string + start, 1, len, p_config->outfile);
                        fputc('*', p_config->outfile);
                        fputs(org_string + start + len, p_config->outfile);
                    }
                    fprintf(p_config->outfile, "\n");
                }
            }
        }
        wrbuf_destroy(sw);
        wrbuf_destroy(cdata);
    }

    if (p_config->xmloutput)
        fprintf(p_config->outfile,
                "</tokens>\n"
                "</icu>\n");

    icu_chain_destroy(p_config->chain);
    xmlFreeDoc(doc);
    if (line)
        free(line);
}
Exemple #9
0
static void check_norm(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;
    yaz_icu_iter_t it;

    const char *xml_str =
        "  <icu_chain id=\"relevance\" locale=\"en\">"
        "    <transform rule=\"[:Control:] Any-Remove\"/>"
        "    <tokenize rule=\"l\"/>"
        "    <transform rule=\"[[:WhiteSpace:][:Punctuation:]`] Remove\"/>"
        "    <casemap rule=\"l\"/>"
        "  </icu_chain>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
        return ;
    chain = icu_chain_xml_config(xml_node, 1, &status);

    it = icu_iter_create(chain);
    if (it)
    {
        icu_iter_first(it, " y😄");
        while (icu_iter_next(it))
        {
            const char *norm_str = icu_iter_get_norm(it);
            size_t start, len;

            YAZ_CHECK(norm_str);
            if (norm_str)
                yaz_log(YLOG_LOG, "norm_str len=%ld=%s",
                        (long) strlen(norm_str), norm_str);
            icu_iter_get_org_info(it, &start, &len);
            YAZ_CHECK(start <= 1000);
            YAZ_CHECK(len <= 1000);
        }

        icu_iter_first(it, "\n y😄");
        while (icu_iter_next(it))
        {
            const char *norm_str = icu_iter_get_norm(it);
            size_t start, len;

            YAZ_CHECK(norm_str);
            if (norm_str)
                yaz_log(YLOG_LOG, "norm_str len=%ld=%s",
                        (long) strlen(norm_str), norm_str);
            icu_iter_get_org_info(it, &start, &len);
            YAZ_CHECK(start <= 1000);
            YAZ_CHECK(len <= 1000);
        }
    }
    icu_iter_destroy(it);
    icu_chain_destroy(chain);
    xmlFreeDoc(doc);
}
Exemple #10
0
static void check_bug_1140(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"

        /* if the first rule is normalize instead. Then it works */
#if 0
        "<transform rule=\"[:Control:] Any-Remove\"/>"
#endif
        "<tokenize rule=\"l\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "</icu>";


    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain,  "O Romeo, Romeo! wherefore art thou\t Romeo?",
                  &status));

    while (icu_chain_next_token(chain, &status))
    {
        ;
        /* printf("%d '%s' '%s'\n",
           icu_chain_token_number(chain),
           icu_chain_token_norm(chain),
           icu_chain_token_display(chain)); */
    }


    YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);

    YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));

    while (icu_chain_next_token(chain, &status))
    {
        ;
        /* printf("%d '%s' '%s'\n",
           icu_chain_token_number(chain),
           icu_chain_token_norm(chain),
           icu_chain_token_display(chain)); */
    }

    /* we expect 'what' 'is' 'this', i.e. 3 tokens */
    YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);

    icu_chain_destroy(chain);
}