/*
 * Add normalized strings for every token of `str` to `tree`.
 *
 * For each token, expand_affixes() is tried with each configured language in
 * order, stopping at the first language for which it returns true. If no
 * language matches, the token is passed to add_normalized_strings_token()
 * instead. In both cases the current token of the tree is finalized before
 * moving on to the next one.
 *
 *   tree    - string tree receiving the alternatives
 *   str     - the string that the token offsets refer to
 *   tokens  - tokens of `str`
 *   options - normalization options (languages, num_languages, ...)
 */
inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, normalize_options_t options) {
    cstring_array *out_strings = tree->strings;

    for (int tok_idx = 0; tok_idx < tokens->n; tok_idx++) {
        token_t current = tokens->a[tok_idx];

        /* Search for the first language whose affix expansion applies. */
        int lang_idx = 0;
        while (lang_idx < options.num_languages &&
               !expand_affixes(tree, str, options.languages[lang_idx], current, options)) {
            lang_idx++;
        }

        bool affix_expanded = (lang_idx < options.num_languages);
        if (!affix_expanded) {
            add_normalized_strings_token(out_strings, str, current, options);
        }

        string_tree_finalize_token(tree);
    }
}
string_tree_t *regex_string_tree(char *regex, size_t len) { uint8_t *char_ptr = (uint8_t *)regex; bool in_set = false; bool in_brackets = false; int32_t codepoint; int32_t last_codepoint = 0; ssize_t char_len; size_t bracket_start; size_t bracket_len; char temp_char[MAX_UTF8_CHAR_SIZE]; ssize_t temp_char_len; string_tree_t *tree = string_tree_new(); if (len == 0) { // Single token with zero-length string_tree_add_string_len(tree, regex, len); string_tree_finalize_token(tree); return tree; } uint32_array *char_set = uint32_array_new(); size_t idx = 0; int i, j; bool add_to_index = false; while (idx < len) { char_len = utf8proc_iterate(char_ptr, len, &codepoint); if (char_len <= 0) { uint32_array_destroy(char_set); string_tree_destroy(tree); return NULL; } if (!(utf8proc_codepoint_valid(codepoint))) { idx += char_len; char_ptr += char_len; continue; } add_to_index = true; if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("begin set\n"); in_set = true; codepoint = BEGIN_SET_CODEPOINT; uint32_array_clear(char_set); } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) { log_debug("end set"); for (j = 0; j < char_set->n; j++) { temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char); log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); } string_tree_finalize_token(tree); uint32_array_clear(char_set); // Add a special codepoint to the sequence to distinguish from an escaped square bracket codepoint = END_SET_CODEPOINT; in_set = false; } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { in_brackets = true; bracket_start = idx + char_len; bracket_len = 0; add_to_index = false; } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) { log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start); 
string_tree_add_string_len(tree, regex + bracket_start, bracket_len); in_brackets = false; } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("group\n"); add_to_index = false; } else if (in_set) { log_debug("in set\n"); // Queue node, we'll add them to the trie uint32_array_push(char_set, codepoint); add_to_index = false; } else if (in_brackets) { add_to_index = false; bracket_len += char_len; } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { add_to_index = false; } log_debug("codepoint = %d\n", codepoint); if (add_to_index) { temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char); log_debug("char = %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); string_tree_finalize_token(tree); } idx += char_len; char_ptr += char_len; } uint32_array_destroy(char_set); return tree; }
/**
 * Build a string tree of alternatives for `str` using the address dictionaries.
 *
 * The string is tokenized with whitespace kept. Dictionary phrases are
 * collected for every language in `options` and for ALL_LANGUAGES, then
 * sorted. The tree is built left to right:
 *   - tokens that precede a phrase (and tokens after the last phrase) are
 *     emitted one per tree token, with whitespace tokens emitted as " ";
 *   - for a phrase whose components overlap options.address_components, the
 *     expansions stored under "<language><NAMESPACE_SEPARATOR_CHAR><phrase>"
 *     are added as alternatives of a single tree token. An expansion with a
 *     canonical form adds that form (followed by a space when the next token
 *     is numeric and is not the last token); an expansion without one adds
 *     the phrase's original tokens;
 *   - any other phrase has its tokens emitted individually.
 * If no phrase was found, the whole string is added as a single token.
 *
 * @param str      NUL-terminated input string
 * @param options  normalization options (languages, address_components, ...)
 * @return a newly created string tree, or NULL if tokenization or an
 *         allocation fails.
 */
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    string_tree_t *tree = NULL;
    char_array *key = NULL;
    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;
    bool success = false;
    size_t len = 0;

    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    for (int i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);

        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        if (lang_phrases == NULL) {
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        if (phrases == NULL) {
            phrases = phrase_language_array_new_size(lang_phrases->n);
            if (phrases == NULL) {
                goto cleanup;
            }
        }

        for (size_t j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
        lang_phrases = NULL;
    }

    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        if (phrases == NULL) {
            phrases = phrase_language_array_new_size(lang_phrases->n);
            if (phrases == NULL) {
                goto cleanup;
            }
        }

        for (size_t j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }

        phrase_array_destroy(lang_phrases);
        lang_phrases = NULL;
    }

    tree = string_tree_new_size(len);
    if (tree == NULL) {
        goto cleanup;
    }

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        // [start, end) is the run of plain tokens between the previous phrase and the current one
        size_t start = 0;
        size_t end = 0;

        key = char_array_new_size(DEFAULT_KEY_LEN);
        if (key == NULL) {
            goto cleanup;
        }

        // NOTE(review): a phrase that overlaps the previous one (e.g. the same
        // span matched for two languages) is not skipped or merged here, so it
        // produces an additional tree token. Confirm this is intended.
        for (size_t i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];

            char_array_clear(key);
            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;
            size_t phrase_end = phrase.start + phrase.len;

            end = phrase.start;

            for (size_t j = start; j < end; j++) {
                token_t token = tokens->a[j];
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                // Rebuild the dictionary key: namespace prefix followed by the phrase text
                key->n = namespace_len;
                for (size_t j = phrase.start; j < phrase_end; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                // NOTE(review): when no expansions are found, nothing is
                // emitted for this phrase's tokens. Confirm this is intended.
                if (expansions != NULL) {
                    for (size_t j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            if (phrase_end < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase_end];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    // Append "<canonical> " as one string so the
                                    // canonical form stays separated from the number
                                    cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);
                            }
                        } else {
                            for (size_t k = phrase.start; k < phrase_end; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }
                        }
                    }

                    string_tree_finalize_token(tree);
                }
            } else {
                for (size_t j = phrase.start; j < phrase_end; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);
                }
            }

            start = phrase_end;
        }

        // Emit whatever follows the last phrase
        end = tokens->n;

        for (size_t j = start; j < end; j++) {
            token_t token = tokens->a[j];
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);
        }
    } else {
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    success = true;

cleanup:
    if (key != NULL) {
        char_array_destroy(key);
    }

    if (lang_phrases != NULL) {
        phrase_array_destroy(lang_phrases);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    if (!success && tree != NULL) {
        string_tree_destroy(tree);
        tree = NULL;
    }

    return tree;
}
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); size_t consumed = 0; khash_t(int_set) *scripts = kh_init(int_set); char *utf8_normalized = NULL; char *ptr = str; script_t script; while (consumed < len) { string_script_t script_span = get_string_script(ptr, len - consumed); script = script_span.script; size_t script_len = script_span.len; bool is_ascii = script_span.ascii; // Shortcut if the string is all ASCII if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) { utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); if (utf8_normalized != NULL) { string_tree_add_string(tree, utf8_normalized); string_tree_finalize_token(tree); free(utf8_normalized); utf8_normalized = NULL; } kh_destroy(int_set, scripts); return tree; } log_debug("script_len=%zu\n", script_len); if (script != SCRIPT_LATIN && script_len > 0) { int ret; khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret); if (ret < 0) { log_error("Error in kh_put\n"); string_tree_destroy(tree); kh_destroy(int_set, scripts); return NULL; } } consumed += script_len; ptr += script_len; } add_latin_alternatives(tree, str, len, options); size_t non_latin_scripts = kh_size(scripts); if (non_latin_scripts > 0) { string_tree_t *transliterators = string_tree_new_size(non_latin_scripts); khint_t key; char *trans_name = NULL; kh_foreach_key(scripts, key, { script = (script_t)key; for (size_t i = 0; i < num_languages; i++) { char *lang = languages[i]; foreach_transliterator(script, lang, trans_name, { string_tree_add_string(transliterators, trans_name); }) } foreach_transliterator(script, "", trans_name, { string_tree_add_string(transliterators, trans_name); })