Exemple #1
0
/*
 * Add the Latin-script forms of str[0..len) to `tree` as alternatives.
 *
 * Up to two candidate strings are produced; they differ only in the order in
 * which the two processing steps are applied:
 *
 *   1. transliterate(LATIN_ASCII, ...) followed by normalize_string_utf8(...).
 *      Only attempted when NORMALIZE_STRING_LATIN_ASCII is set in `options`.
 *   2. normalize_string_utf8(...) followed by transliterate(LATIN_ASCII, ...)
 *      when the flag is set, or normalize_string_utf8(...) alone otherwise.
 *
 * Candidate 1 is added whenever it could be produced. Candidate 2 is added
 * only if it could be produced and is not byte-identical to candidate 1, so
 * the same string is never added twice by this function.
 *
 * Parameters:
 *   tree     string tree receiving the alternatives (string_tree_add_string).
 *   str      input bytes; only the first `len` bytes are used.
 *   len      number of bytes of `str` to process.
 *   options  bit set passed through to normalize_string_utf8; the
 *            NORMALIZE_STRING_LATIN_ASCII bit additionally enables the
 *            transliteration steps.
 *
 * Every intermediate string is released with free() before returning, so the
 * tree presumably stores its own copy of what is added -- TODO confirm against
 * string_tree_add_string. If any step yields NULL (including a failed
 * strndup), the corresponding candidate is silently skipped.
 */
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
    const int latin_ascii = (options & NORMALIZE_STRING_LATIN_ASCII) != 0;

    // Candidate 1: transliterate first, then normalize.
    char *first = NULL;

    if (latin_ascii) {
        char *ascii = transliterate(LATIN_ASCII, str, len);
        if (ascii != NULL) {
            first = normalize_string_utf8(ascii, options);
            free(ascii);
        }

        if (first != NULL) {
            string_tree_add_string(tree, first);
        }
    }

    // Candidate 2: normalize first, then (optionally) transliterate.
    // normalize_string_utf8 takes no length argument, so it is handed a
    // NUL-terminated copy limited to `len` bytes.
    char *second = NULL;
    char *str_copy = strndup(str, len);

    // Guard the copy: on allocation failure there is nothing to normalize,
    // and NULL must not be forwarded to normalize_string_utf8.
    if (str_copy != NULL) {
        char *normalized = normalize_string_utf8(str_copy, options);
        free(str_copy);

        if (latin_ascii && normalized != NULL) {
            second = transliterate(LATIN_ASCII, normalized, strlen(normalized));
            free(normalized);
        } else {
            second = normalized;
        }
    }

    // Skip candidate 2 when it merely repeats candidate 1.
    if (second != NULL && (first == NULL || strcmp(first, second) != 0)) {
        string_tree_add_string(tree, second);
    }

    // free(NULL) is a no-op, so no guards are needed.
    free(second);
    free(first);
}
Exemple #2
0
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    for (int i = 0; i < options.num_languages; i++)  {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        
        if (lang_phrases == NULL) { 
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }


    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        int start = 0;
        int end = 0;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];
            char_array_clear(key);

            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j]; 
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);       
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                if (expansions != NULL) {
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    uint32_t start_index = cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);

                            }
                        } else {
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }

                        }
                    }

                    string_tree_finalize_token(tree);

                }
            } else {
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);

                }
            }

            start = phrase.start + phrase.len;

        }

        char_array_destroy(key);

        end = (int)tokens->n;

        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j]; 
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);       
        }


    } else {
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}
Exemple #3
0
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
    size_t len = strlen(str);
    string_tree_t *tree = string_tree_new_size(len);

    size_t consumed = 0;

    khash_t(int_set) *scripts = kh_init(int_set);
    char *utf8_normalized = NULL;

    char *ptr = str;

    script_t script;

    while (consumed < len)  {
        string_script_t script_span = get_string_script(ptr, len - consumed);
        script = script_span.script;
        size_t script_len = script_span.len;
        bool is_ascii = script_span.ascii;

        // Shortcut if the string is all ASCII
        if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) {
            utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
            if (utf8_normalized != NULL) {
                string_tree_add_string(tree, utf8_normalized);
                string_tree_finalize_token(tree);
                free(utf8_normalized);
                utf8_normalized = NULL;
            }

            kh_destroy(int_set, scripts);
            return tree;
        }

        log_debug("script_len=%zu\n", script_len);

        if (script != SCRIPT_LATIN && script_len > 0) {
            int ret;
            khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
            if (ret < 0) {
                log_error("Error in kh_put\n");
                string_tree_destroy(tree);
                kh_destroy(int_set, scripts);
                return NULL;
            }
        }

        consumed += script_len;
        ptr += script_len;
    }

    add_latin_alternatives(tree, str, len, options);

    size_t non_latin_scripts = kh_size(scripts);

    if (non_latin_scripts > 0) {
        string_tree_t *transliterators = string_tree_new_size(non_latin_scripts);

        khint_t key;
        char *trans_name = NULL;

        kh_foreach_key(scripts, key, {
            script = (script_t)key;
            for (size_t i = 0; i < num_languages; i++) {
                char *lang = languages[i];
                foreach_transliterator(script, lang, trans_name, {
                    string_tree_add_string(transliterators, trans_name);
                })
            }

            foreach_transliterator(script, "", trans_name, {
                string_tree_add_string(transliterators, trans_name);
            })