/*
 * Create a pp2_charset_t from an XML configuration node.
 *
 * With YAZ_HAVE_ICU: skips forward over siblings of xml_node until an
 * element node is found, builds an ICU chain from it via
 * icu_chain_xml_config(node, 1, &status) and wraps the chain with
 * pp2_charset_create().
 *
 * Without YAZ_HAVE_ICU: logs that ICU support is missing.
 *
 * Returns 0 (after logging at YLOG_FATAL) when no chain could be built
 * or when ICU support is not compiled in.
 */
pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
{
    const char *name;
#if YAZ_HAVE_ICU
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    while (xml_node && xml_node->type != XML_ELEMENT_NODE)
        xml_node = xml_node->next;

    /* No element node at all: leave chain at 0 and take the error path. */
    if (xml_node)
        chain = icu_chain_xml_config(xml_node, 1, &status);

    if (!chain || U_FAILURE(status))
    {
        /* xml_node may be NULL here, so never dereference it blindly. */
        name = xml_node ? (const char *) xml_node->name : "(no element)";
        yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n"
                "<%s>\n ... \n</%s>",
                name, name);
        /* A chain handed back together with a failure status would leak. */
        if (chain)
            icu_chain_destroy(chain);
        return 0;
    }
    return pp2_charset_create(chain);
#else /* YAZ_HAVE_ICU */
    /* The caller may pass NULL; guard the name lookup. */
    name = xml_node ? (const char *) xml_node->name : "(no element)";
    yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n"
            "<%s>\n ... \n</%s>",
            name, name);
    yaz_log(YLOG_FATAL,
            "But no ICU support is compiled into the YAZ library.");
    return 0;
#endif /* YAZ_HAVE_ICU */
}
static void check_icu_chain(void) { const char *en_str = "O Romeo, Romeo! wherefore art thou\t Romeo?"; UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; const char *xml_str = "<icu locale=\"en\">" "<transform rule=\"[:Control:] Any-Remove\"/>" "<tokenize rule=\"l\"/>" "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>" "<display/>" "<casemap rule=\"l\"/>" "</icu>"; xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); xmlNode *xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); chain = icu_chain_xml_config(xml_node, 0, &status); xmlFreeDoc(doc); YAZ_CHECK(chain); if (!chain) return; YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); while (icu_chain_next_token(chain, &status)) { yaz_log(YLOG_LOG, "%d '%s' '%s'", icu_chain_token_number(chain), icu_chain_token_norm(chain), icu_chain_token_display(chain)); } YAZ_CHECK_EQ(icu_chain_token_number(chain), 7); YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); while (icu_chain_next_token(chain, &status)) { yaz_log(YLOG_LOG, "%d '%s' '%s'", icu_chain_token_number(chain), icu_chain_token_norm(chain), icu_chain_token_display(chain)); } YAZ_CHECK_EQ(icu_chain_token_number(chain), 3); icu_chain_destroy(chain); }
/*
 * Build a chain with two tokenize steps, display, casemap and an
 * empty-rule join from inline XML (second argument 1), then run two
 * inputs through test_iter() against their expected bracketed output.
 *
 * NOTE(review): test_iter() is defined elsewhere in the file; it
 * presumably compares the iterator output with the expected string --
 * confirm against its definition.
 */
static void check_icu_iter4(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str =
        "<icu locale=\"en\">"
        "<transform rule=\"[:Control:] Any-Remove\"/>"
        "<tokenize rule=\"l\"/>"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "<display/>"
        "<casemap rule=\"l\"/>"
        "<join rule=\"\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* Do not leak the parsed document on the failure path. */
        xmlFreeDoc(doc);
        return;
    }
    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νόταρηςγιάννηςσωτ]"));

    /* check_iter_threads(chain); -- threaded check currently disabled */

    icu_chain_destroy(chain);
}
/*
 * Configure a chain from an <icu> element with no steps (second
 * argument 0) and verify that a sample string yields token number 1
 * whose normalized form equals the input byte-for-byte.
 */
static void check_chain_empty_chain(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "</icu>";

    const char *src8 = "some 5487 weired !¤%&(/& sTuFf";
    const char *dest8 = 0;

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    /* Without a chain every call below would operate on a NULL pointer. */
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(chain, src8, &status));

    /* Drain all tokens; only the final token state is inspected. */
    while (icu_chain_next_token(chain, &status))
        ;

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 1);

    dest8 = icu_chain_token_norm(chain);
    YAZ_CHECK(dest8);
    /* strcmp() on a NULL pointer is undefined; compare only when present. */
    if (dest8)
        YAZ_CHECK_EQ(strcmp(src8, dest8), 0);

    icu_chain_destroy(chain);
}
/*
 * Build a chain from an <icu_chain id="sort" locale="el"> description
 * (second argument 1) that removes control characters, whitespace,
 * punctuation and nonspacing marks and lower-cases, then run two inputs
 * through test_iter() against their expected bracketed output.
 *
 * NOTE(review): test_iter() is defined elsewhere in the file; it
 * presumably compares the iterator output with the expected string --
 * confirm against its definition.
 */
static void check_icu_iter3(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;

    const char *xml_str =
        "<icu_chain id=\"sort\" locale=\"el\">\n"
        "<transform rule=\"[:Control:] Any-Remove\"/>\n"
        "<transform rule=\"[[:Control:][:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "<transform rule=\"NFD; [:Nonspacing Mark:] Remove; NFC\"/>\n"
        "<casemap rule=\"l\"/>\n"
        "<display/>\n"
        "</icu_chain>\n";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;
    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* Do not leak the parsed document on the failure path. */
        xmlFreeDoc(doc);
        return;
    }
    chain = icu_chain_xml_config(xml_node, 1, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    if (!chain)
        return;

    YAZ_CHECK(test_iter(chain, "Adobe Acrobat Reader, 1991-1999.",
                        "[adobeacrobatreader19911999]"));

    YAZ_CHECK(test_iter(chain, "Νόταρης, Γιάννης Σωτ",
                        "[νοταρηςγιαννηςσωτ]"));

    icu_chain_destroy(chain);
}
static void check_icu_iter1(void) { UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; xmlNode *xml_node; yaz_icu_iter_t iter; const char *xml_str = "<icu locale=\"en\">" "<tokenize rule=\"w\"/>" "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>" "</icu>"; xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); YAZ_CHECK(doc); if (!doc) return; xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); if (!xml_node) return ; chain = icu_chain_xml_config(xml_node, 1, &status); xmlFreeDoc(doc); YAZ_CHECK(chain); iter = icu_iter_create(chain); icu_iter_first(iter, "a string with 15 tokens and 8 displays"); YAZ_CHECK(iter); if (!iter) return; while (icu_iter_next(iter)) { yaz_log(YLOG_LOG, "[%s]", icu_iter_get_norm(iter)); } icu_iter_destroy(iter); icu_chain_destroy(chain); }
/*
 * Configure a word-tokenizing chain whose transform removes whitespace
 * and punctuation (second argument 0), run one sample string through it
 * and verify that the token number reached at the end is 15.
 */
static void check_chain_empty_token(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;

    const char *xml_str = "<icu locale=\"en\">"
        "<tokenize rule=\"w\"/>"
        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
        "</icu>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    xmlNode *xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);

    chain = icu_chain_xml_config(xml_node, 0, &status);

    xmlFreeDoc(doc);
    YAZ_CHECK(chain);
    /* Without a chain every call below would operate on a NULL pointer. */
    if (!chain)
        return;

    YAZ_CHECK(icu_chain_assign_cstr(
                  chain, "a string with 15 tokenss and 8 displays",
                  &status));

    /* Drain all tokens; only the final token number is inspected. */
    while (icu_chain_next_token(chain, &status))
        ;

    YAZ_CHECK_EQ(icu_chain_token_number(chain), 15);

    icu_chain_destroy(chain);
}
static void process_text_file(struct config_t *p_config) { char *line = 0; char linebuf[1024]; xmlDoc *doc = xmlParseFile(p_config->conffile); xmlNode *xml_node = xmlDocGetRootElement(doc); long unsigned int token_count = 0; long unsigned int line_count = 0; UErrorCode status = U_ZERO_ERROR; if (!xml_node) { printf("Could not parse XML config file '%s' \n", p_config->conffile); exit(1); } p_config->chain = icu_chain_xml_config(xml_node, 1, &status); if (!p_config->chain || !U_SUCCESS(status)) { printf("Could not set up ICU chain from config file '%s' \n", p_config->conffile); exit(1); } if (p_config->xmloutput) fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" "<icu>\n" "<tokens>\n"); /* read input lines for processing */ while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile))) { WRBUF sw = wrbuf_alloc(); WRBUF cdata = wrbuf_alloc(); int success = icu_chain_assign_cstr(p_config->chain, line, &status); line_count++; while (success && icu_chain_next_token(p_config->chain, &status)) { if (U_FAILURE(status)) success = 0; else { size_t start, len; const char *org_string = 0; const char *sortkey = icu_chain_token_sortkey(p_config->chain); icu_chain_get_org_info2(p_config->chain, &start, &len, &org_string); wrbuf_rewind(sw); wrbuf_puts_escaped(sw, sortkey); token_count++; if (p_config->xmloutput) { fprintf(p_config->outfile, "<token id=\"%lu\" line=\"%lu\"", token_count, line_count); wrbuf_rewind(cdata); wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain)); fprintf(p_config->outfile, " norm=\"%s\"", wrbuf_cstr(cdata)); wrbuf_rewind(cdata); wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain)); fprintf(p_config->outfile, " display=\"%s\"", wrbuf_cstr(cdata)); if (p_config->sortoutput) { wrbuf_rewind(cdata); wrbuf_xmlputs(cdata, wrbuf_cstr(sw)); fprintf(p_config->outfile, " sortkey=\"%s\"", wrbuf_cstr(cdata)); } fprintf(p_config->outfile, "/>\n"); } else { fprintf(p_config->outfile, "%lu %lu '%s' '%s'", token_count, 
line_count, icu_chain_token_norm(p_config->chain), icu_chain_token_display(p_config->chain)); if (p_config->sortoutput) { fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw)); } if (p_config->org_output) { fprintf(p_config->outfile, " %ld+%ld", (long) start, (long) len); fputc(' ', p_config->outfile); fwrite(org_string, 1, start, p_config->outfile); fputc('*', p_config->outfile); fwrite(org_string + start, 1, len, p_config->outfile); fputc('*', p_config->outfile); fputs(org_string + start + len, p_config->outfile); } fprintf(p_config->outfile, "\n"); } } } wrbuf_destroy(sw); wrbuf_destroy(cdata); } if (p_config->xmloutput) fprintf(p_config->outfile, "</tokens>\n" "</icu>\n"); icu_chain_destroy(p_config->chain); xmlFreeDoc(doc); if (line) free(line); }
static void check_norm(void) { UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; xmlNode *xml_node; yaz_icu_iter_t it; const char *xml_str = " <icu_chain id=\"relevance\" locale=\"en\">" " <transform rule=\"[:Control:] Any-Remove\"/>" " <tokenize rule=\"l\"/>" " <transform rule=\"[[:WhiteSpace:][:Punctuation:]`] Remove\"/>" " <casemap rule=\"l\"/>" " </icu_chain>"; xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); YAZ_CHECK(doc); if (!doc) return; xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); if (!xml_node) return ; chain = icu_chain_xml_config(xml_node, 1, &status); it = icu_iter_create(chain); if (it) { icu_iter_first(it, " y😄"); while (icu_iter_next(it)) { const char *norm_str = icu_iter_get_norm(it); size_t start, len; YAZ_CHECK(norm_str); if (norm_str) yaz_log(YLOG_LOG, "norm_str len=%ld=%s", (long) strlen(norm_str), norm_str); icu_iter_get_org_info(it, &start, &len); YAZ_CHECK(start <= 1000); YAZ_CHECK(len <= 1000); } icu_iter_first(it, "\n y😄"); while (icu_iter_next(it)) { const char *norm_str = icu_iter_get_norm(it); size_t start, len; YAZ_CHECK(norm_str); if (norm_str) yaz_log(YLOG_LOG, "norm_str len=%ld=%s", (long) strlen(norm_str), norm_str); icu_iter_get_org_info(it, &start, &len); YAZ_CHECK(start <= 1000); YAZ_CHECK(len <= 1000); } } icu_iter_destroy(it); icu_chain_destroy(chain); xmlFreeDoc(doc); }
static void check_bug_1140(void) { UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; const char *xml_str = "<icu locale=\"en\">" /* if the first rule is normalize instead. Then it works */ #if 0 "<transform rule=\"[:Control:] Any-Remove\"/>" #endif "<tokenize rule=\"l\"/>" "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>" "<display/>" "<casemap rule=\"l\"/>" "</icu>"; xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); xmlNode *xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); chain = icu_chain_xml_config(xml_node, 0, &status); xmlFreeDoc(doc); YAZ_CHECK(chain); if (!chain) return; YAZ_CHECK(icu_chain_assign_cstr( chain, "O Romeo, Romeo! wherefore art thou\t Romeo?", &status)); while (icu_chain_next_token(chain, &status)) { ; /* printf("%d '%s' '%s'\n", icu_chain_token_number(chain), icu_chain_token_norm(chain), icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 7); YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); while (icu_chain_next_token(chain, &status)) { ; /* printf("%d '%s' '%s'\n", icu_chain_token_number(chain), icu_chain_token_norm(chain), icu_chain_token_display(chain)); */ } /* we expect 'what' 'is' 'this', i.e. 3 tokens */ YAZ_CHECK_EQ(icu_chain_token_number(chain), 3); icu_chain_destroy(chain); }