Example #1
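/* Given two dictionary phrase matches, check whether any pair of their expansions
   shares the same canonical form. On a match, *response records whether both sides,
   only the first, only the second, or neither is already in canonical form;
   otherwise it stays CANONICAL_NO_MATCH. Returns true if a matching pair was found. */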
bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) {
    address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data);
    address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data);

    if (val1 == NULL || val2 == NULL) return false;

    address_expansion_array *expansions_array1 = val1->expansions;
    address_expansion_array *expansions_array2 = val2->expansions;

    if (expansions_array1 == NULL || expansions_array2 == NULL) return false;

    address_expansion_t *expansions1 = expansions_array1->a;
    address_expansion_t *expansions2 = expansions_array2->a;

    *response = CANONICAL_NO_MATCH;

    bool same_canonical = false;
    for (size_t i = 0; i < expansions_array1->n; i++) {
        address_expansion_t e1 = expansions1[i];

        for (size_t j = 0; j < expansions_array2->n; j++) {
            address_expansion_t e2 = expansions2[j];

            same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2);
            if (same_canonical) {
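                // canonical_index == NULL_CANONICAL_INDEX means the expansion is itself a canonical form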
                bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX;
                bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX;

                if (e1_canonical && e2_canonical) {
                    *response = BOTH_CANONICAL;
                } else if (e1_canonical) {
                    *response = FIRST_CANONICAL;
                } else if (e2_canonical) {
                    *response = SECOND_CANONICAL;
                } else {
                    *response = NEITHER_CANONICAL;
                }
                break;
            }
        }
        if (same_canonical) break;
    }

    return same_canonical;
}
Example #2
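/* Look up dictionary expansions for an affix (prefix/suffix) phrase matched inside a
   single token. The lookup key is the language, a namespace separator, a prefix or
   suffix marker, and the matched affix text (reversed for suffix lookups). Returns
   NULL when the phrase's components don't intersect the requested address components,
   or when the phrase is a non-separable canonical form. */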
address_expansion_array *get_affix_expansions(char_array *key, char *str, char *lang, token_t token, phrase_t phrase, bool reverse, normalize_options_t options) {
    expansion_value_t value;
    value.value = phrase.data;
    address_expansion_array *expansions = NULL;

    if ((value.components & options.address_components) && (value.separable || !value.canonical)) {
        char_array_clear(key);
        char_array_cat(key, lang);
        char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
        if (reverse) {
            char_array_cat(key, TRIE_SUFFIX_CHAR);
            char_array_cat_reversed_len(key, str + token.offset + phrase.start, phrase.len);
        } else {
            char_array_cat(key, TRIE_PREFIX_CHAR);
            char_array_cat_len(key, str + token.offset + phrase.start, phrase.len);
        }
        char *key_str = char_array_get_string(key);
        log_debug("key_str=%s\n", key_str);
        expansions = address_dictionary_get_expansions(key_str);
    }
    return expansions;
}
Example #3
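/* Build a string_tree_t of expansion alternatives for str. The input is tokenized,
   dictionary phrase matches are collected for each requested language plus the
   language-independent dictionary, and the sorted matches are walked in order:
   tokens outside any phrase are added verbatim, while each phrase that matches the
   requested address components contributes its canonical expansions (or its original
   tokens when an expansion is already canonical). */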
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

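    // Collect dictionary phrase matches for each requested language.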
    for (int i = 0; i < options.num_languages; i++)  {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        
        if (lang_phrases == NULL) { 
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }


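    // Also search the language-independent (ALL_LANGUAGES) dictionary.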
    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        int start = 0;
        int end = 0;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

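        // Phrases are now sorted by position: expand each one in place, copying any
        // tokens between phrases verbatim.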
        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];
            char_array_clear(key);

            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

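            // Add the tokens between the previous phrase (or the start of the string) and this one.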
            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j]; 
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);       
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

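            // Only expand the phrase if its components intersect the requested address components.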
            if (value.components & options.address_components) {
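                // Truncate the key back to the language namespace, then append the phrase's surface form.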
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                if (expansions != NULL) {
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
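                                    // Next token is numeric (e.g. "Highway 101"): append the
                                    // canonical with a trailing space as a single alternative.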
                                    cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);

                            }
                        } else {
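                            // This expansion is itself canonical: keep the original phrase tokens.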
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }

                        }
                    }

                    string_tree_finalize_token(tree);

                }
            } else {
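                // Phrase doesn't match the requested components: pass its tokens through unchanged.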
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);

                }
            }

            start = phrase.start + phrase.len;

        }

        char_array_destroy(key);

        end = (int)tokens->n;

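        // Add any tokens remaining after the last phrase.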
        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j]; 
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);       
        }


    } else {
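        // No dictionary phrases matched: the only alternative is the input string itself.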
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}