/*
 * Point the chain's own iterator at a new input string.
 *
 * Any iterator already stored in chain->iter is destroyed, a new one is
 * created for the chain, and it is positioned on src8cstr with
 * icu_iter_first().
 *
 * chain     chain whose iter member is replaced
 * src8cstr  input C string handed to icu_iter_first()
 *           (presumably UTF-8, going by the name -- confirm against callers)
 * status    not read or written by this function
 *
 * Always returns 1.
 */
int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
                          UErrorCode *status)
{
    (void) status; /* kept only for interface compatibility */

    /* Drop the previous iterator, if there is one, before replacing it. */
    if (chain->iter)
    {
        icu_iter_destroy(chain->iter);
    }

    chain->iter = icu_iter_create(chain);
    icu_iter_first(chain->iter, src8cstr);

    return 1;
}
/*
 * Allocate and initialise a tokenizer handle bound to the charset pct.
 *
 * The handle gets two fresh WRBUFs (norm_str, sort_str), null cp/last_cp
 * cursors and a back-reference to pct.  When built with YAZ_HAVE_ICU, an
 * ICU iterator is created only if pct carries an ICU chain; otherwise the
 * iter member is left as 0.
 *
 * pct must be non-null (asserted).
 * Returns the newly allocated handle.
 */
pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct)
{
    pp2_charset_token_t tok = xmalloc(sizeof(*tok));

    assert(pct);

    tok->norm_str = wrbuf_alloc();
    tok->sort_str = wrbuf_alloc();
    tok->cp = 0;
    tok->last_cp = 0;
    tok->pct = pct;
#if YAZ_HAVE_ICU
    /* Only charsets configured with an ICU chain get an iterator. */
    tok->iter = pct->icu_chn ? icu_iter_create(pct->icu_chn) : 0;
#endif
    return tok;
}
static void check_icu_iter1(void) { UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; xmlNode *xml_node; yaz_icu_iter_t iter; const char *xml_str = "<icu locale=\"en\">" "<tokenize rule=\"w\"/>" "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>" "</icu>"; xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); YAZ_CHECK(doc); if (!doc) return; xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); if (!xml_node) return ; chain = icu_chain_xml_config(xml_node, 1, &status); xmlFreeDoc(doc); YAZ_CHECK(chain); iter = icu_iter_create(chain); icu_iter_first(iter, "a string with 15 tokens and 8 displays"); YAZ_CHECK(iter); if (!iter) return; while (icu_iter_next(iter)) { yaz_log(YLOG_LOG, "[%s]", icu_iter_get_norm(iter)); } icu_iter_destroy(iter); icu_chain_destroy(chain); }
/*
 * Run `input` through iterator `it` and, for every token produced:
 *  - check that a normalized string is available and log it with its length;
 *  - fetch the origin start/length via icu_iter_get_org_info() and check
 *    that both values are at most 1000.
 */
static void check_norm_tokens(yaz_icu_iter_t it, const char *input)
{
    icu_iter_first(it, input);
    while (icu_iter_next(it))
    {
        const char *norm_str = icu_iter_get_norm(it);
        size_t start, len;

        YAZ_CHECK(norm_str);
        if (norm_str)
            yaz_log(YLOG_LOG, "norm_str len=%ld=%s",
                    (long) strlen(norm_str), norm_str);
        icu_iter_get_org_info(it, &start, &len);
        YAZ_CHECK(start <= 1000);
        YAZ_CHECK(len <= 1000);
    }
}

/*
 * Build a chain from an inline XML configuration (control-character removal,
 * tokenize on rule "l", white-space/punctuation removal, lower-case casemap)
 * and run two short inputs through it, checking each token with
 * check_norm_tokens().  The second input differs from the first only by a
 * leading newline.
 *
 * The XML document is freed on every path after it was parsed successfully.
 */
static void check_norm(void)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_chain *chain = 0;
    xmlNode *xml_node;
    yaz_icu_iter_t it;

    const char *xml_str =
        " <icu_chain id=\"relevance\" locale=\"en\">"
        " <transform rule=\"[:Control:] Any-Remove\"/>"
        " <tokenize rule=\"l\"/>"
        " <transform rule=\"[[:WhiteSpace:][:Punctuation:]`] Remove\"/>"
        " <casemap rule=\"l\"/>"
        " </icu_chain>";

    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
    YAZ_CHECK(doc);
    if (!doc)
        return;

    xml_node = xmlDocGetRootElement(doc);
    YAZ_CHECK(xml_node);
    if (!xml_node)
    {
        /* Do not leak the parsed document on this early exit. */
        xmlFreeDoc(doc);
        return;
    }

    chain = icu_chain_xml_config(xml_node, 1, &status);

    it = icu_iter_create(chain);
    if (it)
    {
        check_norm_tokens(it, " y😄");
        check_norm_tokens(it, "\n y😄");
    }
    icu_iter_destroy(it);
    icu_chain_destroy(chain);
    xmlFreeDoc(doc);
}
static int test_iter(struct icu_chain *chain, const char *input, const char *expected) { yaz_icu_iter_t iter = icu_iter_create(chain); WRBUF result, second, sort_result; int success = 1; if (!iter) { yaz_log(YLOG_WARN, "test_iter: input=%s !iter", input); return 0; } if (icu_iter_next(iter)) { yaz_log(YLOG_WARN, "test_iter: expecting 0 before icu_iter_first"); return 0; } sort_result = wrbuf_alloc(); result = wrbuf_alloc(); icu_iter_first(iter, input); while (icu_iter_next(iter)) { const char *sort_str = icu_iter_get_sortkey(iter); if (sort_str) { wrbuf_puts(sort_result, "["); wrbuf_puts_escaped(sort_result, sort_str); wrbuf_puts(sort_result, "]"); } else { wrbuf_puts(sort_result, "[NULL]"); } wrbuf_puts(result, "["); wrbuf_puts(result, icu_iter_get_norm(iter)); wrbuf_puts(result, "]"); } yaz_log(YLOG_LOG, "sortkey=%s", wrbuf_cstr(sort_result)); second = wrbuf_alloc(); icu_iter_first(iter, input); while (icu_iter_next(iter)) { wrbuf_puts(second, "["); wrbuf_puts(second, icu_iter_get_norm(iter)); wrbuf_puts(second, "]"); } icu_iter_destroy(iter); if (strcmp(expected, wrbuf_cstr(result))) { yaz_log(YLOG_WARN, "test_iter: input=%s expected=%s got=%s", input, expected, wrbuf_cstr(result)); success = 0; } if (strcmp(expected, wrbuf_cstr(second))) { yaz_log(YLOG_WARN, "test_iter: input=%s expected=%s got=%s (2nd)", input, expected, wrbuf_cstr(second)); success = 0; } wrbuf_destroy(result); wrbuf_destroy(second); wrbuf_destroy(sort_result); return success; }