} if (!have_phrase) { add_normalized_strings_token(strings, str, token, options); } string_tree_finalize_token(tree); } } void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, normalize_options_t options) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); add_normalized_strings_tokenized(token_tree, str, tokens, options); string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); string_tree_iterator_t *iter; char_array *temp_string = char_array_new_size(len); char *token; char *lang; kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); for (; string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) {
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { char_array *key = NULL; log_debug("input=%s\n", str); token_array *tokens = tokenize_keep_whitespace(str); if (tokens == NULL) { return NULL; } size_t len = strlen(str); log_debug("tokenized, num tokens=%zu\n", tokens->n); phrase_language_array *phrases = NULL; phrase_array *lang_phrases = NULL; for (int i = 0; i < options.num_languages; i++) { char *lang = options.languages[i]; log_debug("lang=%s\n", lang); lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); if (lang_phrases == NULL) { log_debug("lang_phrases NULL\n"); continue; } log_debug("lang_phrases->n = %zu\n", lang_phrases->n); phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); for (int j = 0; j < lang_phrases->n; j++) { phrase_t p = lang_phrases->a[j]; phrase_language_array_push(phrases, (phrase_language_t){lang, p}); } phrase_array_destroy(lang_phrases); } lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); if (lang_phrases != NULL) { phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); for (int j = 0; j < lang_phrases->n; j++) { phrase_t p = lang_phrases->a[j]; phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p}); } phrase_array_destroy(lang_phrases); } string_tree_t *tree = string_tree_new_size(len); if (phrases != NULL) { log_debug("phrases not NULL, n=%zu\n", phrases->n); ks_introsort(phrase_language_array, phrases->n, phrases->a); phrase_language_t phrase_lang; int start = 0; int end = 0; key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN); for (int i = 0; i < phrases->n; i++) { phrase_lang = phrases->a[i]; char_array_clear(key); char_array_cat(key, phrase_lang.language); char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); size_t namespace_len = key->n; phrase_t phrase = phrase_lang.phrase; end = phrase.start; for (int j = start; j < end; j++) { token_t token = tokens->a[j]; if (token.type != WHITESPACE) { log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { log_debug("Adding space\n"); string_tree_add_string(tree, " "); } string_tree_finalize_token(tree); } expansion_value_t value; value.value = phrase.data; token_t token; if (value.components & options.address_components) { key->n = namespace_len; for (int j = phrase.start; j < phrase.start + phrase.len; j++) { token = tokens->a[j]; if (token.type != WHITESPACE) { char_array_cat_len(key, str + token.offset, token.len); } else { char_array_cat(key, " "); } } char *key_str = char_array_get_string(key); log_debug("key_str=%s\n", key_str); address_expansion_array *expansions = address_dictionary_get_expansions(key_str); if (expansions != NULL) { for (int j = 0; j < expansions->n; j++) { address_expansion_t expansion = expansions->a[j]; if (expansion.canonical_index != NULL_CANONICAL_INDEX) { char *canonical = address_dictionary_get_canonical(expansion.canonical_index); if (phrase.start + phrase.len < tokens->n - 1) { token_t next_token = tokens->a[phrase.start + phrase.len]; if (!is_numeric_token(next_token.type)) { string_tree_add_string(tree, canonical); } else { uint32_t start_index = cstring_array_start_token(tree->strings); cstring_array_append_string(tree->strings, canonical); cstring_array_append_string(tree->strings, " "); cstring_array_terminate(tree->strings); } } else { string_tree_add_string(tree, canonical); } } else { for (int k = phrase.start; k < phrase.start + phrase.len; k++) { token = tokens->a[k]; if (token.type != WHITESPACE) { string_tree_add_string_len(tree, str + token.offset, token.len); } else { string_tree_add_string(tree, " "); } } } } string_tree_finalize_token(tree); } } else { for (int j = phrase.start; j < phrase.start + phrase.len; j++) { token = tokens->a[j]; if (token.type != WHITESPACE) { log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { string_tree_add_string(tree, " "); } string_tree_finalize_token(tree); } } start = phrase.start + phrase.len; } char_array_destroy(key); end = (int)tokens->n; for (int j = start; j < end; j++) { token_t token = tokens->a[j]; if (token.type != WHITESPACE) { log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { log_debug("Adding space\n"); string_tree_add_string(tree, " "); } string_tree_finalize_token(tree); } } else { string_tree_add_string(tree, str); string_tree_finalize_token(tree); } if (phrases != NULL) { phrase_language_array_destroy(phrases); } token_array_destroy(tokens); return tree; }
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); size_t consumed = 0; khash_t(int_set) *scripts = kh_init(int_set); char *utf8_normalized = NULL; char *ptr = str; script_t script; while (consumed < len) { string_script_t script_span = get_string_script(ptr, len - consumed); script = script_span.script; size_t script_len = script_span.len; bool is_ascii = script_span.ascii; // Shortcut if the string is all ASCII if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) { utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); if (utf8_normalized != NULL) { string_tree_add_string(tree, utf8_normalized); string_tree_finalize_token(tree); free(utf8_normalized); utf8_normalized = NULL; } kh_destroy(int_set, scripts); return tree; } log_debug("script_len=%zu\n", script_len); if (script != SCRIPT_LATIN && script_len > 0) { int ret; khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret); if (ret < 0) { log_error("Error in kh_put\n"); string_tree_destroy(tree); kh_destroy(int_set, scripts); return NULL; } } consumed += script_len; ptr += script_len; } add_latin_alternatives(tree, str, len, options); size_t non_latin_scripts = kh_size(scripts); if (non_latin_scripts > 0) { string_tree_t *transliterators = string_tree_new_size(non_latin_scripts); khint_t key; char *trans_name = NULL; kh_foreach_key(scripts, key, { script = (script_t)key; for (size_t i = 0; i < num_languages; i++) { char *lang = languages[i]; foreach_transliterator(script, lang, trans_name, { string_tree_add_string(transliterators, trans_name); }) } foreach_transliterator(script, "", trans_name, { string_tree_add_string(transliterators, trans_name); })