/*
Compare the dictionary expansion sets of two matched phrases and report
whether any expansion from the first shares a canonical form with one from
the second (as decided by compare_canonical).

Returns true on the first matching pair; *response then records which
side(s) were already in canonical form (canonical_index == NULL_CANONICAL_INDEX):
BOTH_CANONICAL, FIRST_CANONICAL, SECOND_CANONICAL, or NEITHER_CANONICAL.
Returns false with *response = CANONICAL_NO_MATCH when no pair matches.
If either phrase has no dictionary expansions, returns false and leaves
*response untouched.
*/
bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) {
    address_expansion_value_t *value1 = address_dictionary_get_expansions(match1.data);
    address_expansion_value_t *value2 = address_dictionary_get_expansions(match2.data);
    if (value1 == NULL || value2 == NULL) {
        return false;
    }

    address_expansion_array *array1 = value1->expansions;
    address_expansion_array *array2 = value2->expansions;
    if (array1 == NULL || array2 == NULL) {
        return false;
    }

    *response = CANONICAL_NO_MATCH;

    for (size_t i = 0; i < array1->n; i++) {
        address_expansion_t left = array1->a[i];
        for (size_t j = 0; j < array2->n; j++) {
            address_expansion_t right = array2->a[j];
            if (!compare_canonical(left, tokens1, match1, right, tokens2, match2)) {
                continue;
            }

            /* A NULL canonical index means the expansion itself is the canonical form */
            bool left_is_canonical = (left.canonical_index == NULL_CANONICAL_INDEX);
            bool right_is_canonical = (right.canonical_index == NULL_CANONICAL_INDEX);

            if (left_is_canonical && right_is_canonical) {
                *response = BOTH_CANONICAL;
            } else if (left_is_canonical) {
                *response = FIRST_CANONICAL;
            } else if (right_is_canonical) {
                *response = SECOND_CANONICAL;
            } else {
                *response = NEITHER_CANONICAL;
            }
            return true;
        }
    }

    return false;
}
/*
Look up affix (prefix/suffix) expansions for the sub-token span of a phrase.

Builds a dictionary key of the form "<lang>|<affix marker><affix text>" in
the caller-supplied scratch buffer `key` (cleared first), where the affix
text is taken from `str` at token.offset + phrase.start for phrase.len bytes.
When `reverse` is true the suffix marker is used and the affix bytes are
appended reversed; otherwise the prefix marker and forward bytes.

Returns the expansion array from the address dictionary, or NULL when the
phrase's components don't intersect options.address_components, or when the
phrase is canonical and not separable, or when no dictionary entry exists.
*/
address_expansion_array *get_affix_expansions(char_array *key, char *str, char *lang, token_t token, phrase_t phrase, bool reverse, normalize_options_t options) {
    expansion_value_t value;
    value.value = phrase.data;

    /* Guard clause: only separable or non-canonical phrases matching the
       requested address components qualify for affix expansion. */
    if (!(value.components & options.address_components) || (!value.separable && value.canonical)) {
        return NULL;
    }

    char_array_clear(key);
    char_array_cat(key, lang);
    char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

    char *affix = str + token.offset + phrase.start;
    if (reverse) {
        char_array_cat(key, TRIE_SUFFIX_CHAR);
        char_array_cat_reversed_len(key, affix, phrase.len);
    } else {
        char_array_cat(key, TRIE_PREFIX_CHAR);
        char_array_cat_len(key, affix, phrase.len);
    }

    char *key_str = char_array_get_string(key);
    log_debug("key_str=%s\n", key_str);

    return address_dictionary_get_expansions(key_str);
}
/*
Build a string_tree_t of alternative surface forms for the input string.

Tokenizes `str` (keeping whitespace), gathers dictionary phrase matches for
each language in options.languages plus the language-independent set, sorts
them, and walks the token stream: tokens outside any phrase are emitted
verbatim (one finalized tree token each); tokens covered by a phrase whose
components match options.address_components are replaced by each of the
phrase's dictionary expansions (canonical string, or the original tokens when
the expansion is already canonical) as alternatives at one tree position.

Returns a newly allocated string_tree_t (caller owns it), or NULL if
tokenization fails. Scratch allocations (key buffer, phrase arrays, token
array) are released before returning.

Fix vs. previous revision: the return value of cstring_array_start_token()
was stored in an unused local (`start_index`), triggering an unused-variable
warning; only the call's side effect is needed, so the result is discarded.
*/
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);

    token_array *tokens = tokenize_keep_whitespace(str);
    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    /* Collect phrase matches for each requested language */
    for (int i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);

        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        if (lang_phrases == NULL) {
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        /* Lazily create the combined array on the first language with matches */
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }

    /* Language-independent phrase matches */
    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        int start = 0;
        int end = 0;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];

            /* Key prefix: "<lang><sep>"; remember its length so the phrase
               text can be re-appended for each phrase without rebuilding it */
            char_array_clear(key);
            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

            /* Emit tokens between the previous phrase and this one verbatim */
            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j];
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                /* Append the phrase's surface text to the key and look it up */
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);

                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                if (expansions != NULL) {
                    /* Each expansion becomes one alternative at this tree position */
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];

                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            /* NOTE(review): `< tokens->n - 1` skips the special case
                               when the phrase ends at the last-but-one token —
                               looks like it may be intended as `< tokens->n`;
                               preserved as-is pending confirmation. */
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    /* Next token is numeric (e.g. "Highway 6"): append
                                       the canonical with a trailing space manually.
                                       Only the side effect of starting a new token is
                                       needed; the returned index is unused. */
                                    (void) cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);
                            }
                        } else {
                            /* Expansion is itself canonical: alternative is the
                               phrase's original surface tokens */
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }
                        }
                    }
                    string_tree_finalize_token(tree);
                }
            } else {
                /* Phrase doesn't apply to the requested components: pass its
                   tokens through unchanged, one tree token each */
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);
                }
            }

            start = phrase.start + phrase.len;
        }

        char_array_destroy(key);

        /* Emit any tokens remaining after the last phrase */
        end = (int)tokens->n;
        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j];
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);
        }
    } else {
        /* No dictionary matches at all: the whole input is the only alternative */
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}