Example 1
void geodb_destroy(geodb_t *self) {
    if (self == NULL) return;

    if (self->names != NULL) {
        trie_destroy(self->names);
    }

    if (self->features != NULL) {
        trie_destroy(self->features);
    }

    if (self->postal_codes != NULL) {
        cstring_array_destroy(self->postal_codes);
    }

    if (self->hash_reader != NULL) {
        sparkey_hash_close(&self->hash_reader);
    }

    if (self->log_iter != NULL) {
        sparkey_logiter_close(&self->log_iter);
    }

    if (self->value_buf != NULL) {
        char_array_destroy(self->value_buf);
    }

    if (self->geoname != NULL) {
        geoname_destroy(self->geoname);
    }

    if (self->postal_code != NULL) {
        gn_postal_code_destroy(self->postal_code);
    }

    free(self);
}
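
A minimal usage sketch pairing this destructor with geodb_init from Example 3; the directory path is a hypothetical placeholder, not one defined by the library:

// Sketch only: load a geodb from a data directory and release it when finished.
static bool geodb_smoke_test(void) {
    char dir[] = "data/geodb";   // hypothetical path
    geodb_t *gdb = geodb_init(dir);
    if (gdb == NULL) {
        return false;
    }

    // ... use gdb->names, gdb->features, gdb->postal_codes, etc. ...

    geodb_destroy(gdb);
    return true;
}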
int main(int argc, char **argv) {
    char *filename;

    if (argc == 2) {
        filename = argv[1];
    } else {
        filename = DEFAULT_TRANSLITERATION_PATH;
    }

    FILE *f = fopen(filename, "wb");

    if (f == NULL) {
        log_error("File could not be opened, ensure directory exists: %s", filename);
        exit(1);
    }

    size_t num_source_transliterators = sizeof(transliterators_source) / sizeof(transliterator_source_t);

    char *key;
    size_t key_len;

    context_type_t pre_context_type;
    size_t pre_context_max_len;
    char *pre_context;
    size_t pre_context_len;

    context_type_t post_context_type;
    size_t post_context_max_len;
    char *post_context;
    size_t post_context_len;

    char *replacement;
    size_t replacement_len;

    char *revisit;
    size_t revisit_len;

    char *group_regex_str;
    size_t group_regex_len;

    transliteration_module_init();

    transliteration_table_t *trans_table = get_transliteration_table();

    trie_t *trie = trans_table->trie;
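    // Trie keys are built hierarchically below: the transliterator name, step name, and rule
    // string are concatenated with NAMESPACE_SEPARATOR_CHAR between levels, and any context
    // strings are appended after a PRE_CONTEXT_CHAR/POST_CONTEXT_CHAR marker.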

    for (int i = 0; i < num_source_transliterators; i++) {
        transliterator_source_t trans_source = transliterators_source[i];

        size_t trans_name_len = strlen(trans_source.name);

        log_info("Doing transliterator: %s\n", trans_source.name);

        char_array *trans_key = char_array_from_string(trans_source.name);
        char_array_cat(trans_key, NAMESPACE_SEPARATOR_CHAR);

        char *trans_name = strdup(trans_source.name);
        if (trans_name == NULL) {
            log_error("strdup returned NULL on trans_source.name\n");
            goto exit_teardown;
        }

        transliterator_t *trans = transliterator_new(trans_name, trans_source.internal, trans_table->steps->n, trans_source.steps_length);

        for (int j = 0; j < trans_source.steps_length; j++) {
            transliteration_step_source_t step_source = steps_source[trans_source.steps_start + j];

            size_t step_name_len = strlen(step_source.name);

            log_debug("Doing step: %s, type=%d\n", step_source.name, step_source.type);

            if (!transliteration_table_add_step(trans_table, step_source.type, step_source.name)) {
                log_error("Step couldn't be added\n");
                goto exit_teardown;
            }

            if (step_source.type != STEP_RULESET) {
                continue;
            }

            char_array *step_key = char_array_from_string(char_array_get_string(trans_key));
            char_array_cat(step_key, step_source.name);
            char_array_cat(step_key, NAMESPACE_SEPARATOR_CHAR);

            char *step_key_str = char_array_get_string(step_key);
            size_t step_key_len = strlen(step_key_str);

            for (int k = 0; k < step_source.rules_length; k++) {
                transliteration_rule_source_t rule_source = rules_source[step_source.rules_start + k];
                key = rule_source.key;
                key_len = rule_source.key_len;

                pre_context_type = rule_source.pre_context_type;
                pre_context_max_len = rule_source.pre_context_max_len;
                pre_context = rule_source.pre_context;
                pre_context_len = rule_source.pre_context_len;

                post_context_type = rule_source.post_context_type;
                post_context_max_len = rule_source.post_context_max_len;
                post_context = rule_source.post_context;
                post_context_len = rule_source.post_context_len;

                replacement = rule_source.replacement;
                replacement_len = rule_source.replacement_len;

                revisit = rule_source.revisit;
                revisit_len = rule_source.revisit_len;

                group_regex_str = rule_source.group_regex_str;
                group_regex_len = rule_source.group_regex_len;

                char_array *rule_key = char_array_from_string(step_key_str);

                uint32_t replacement_string_index = cstring_array_num_strings(trans_table->replacement_strings);
                cstring_array_add_string_len(trans_table->replacement_strings, replacement, replacement_len);

                uint32_t revisit_index = 0;
                if (revisit != NULL && revisit_len > 0) {
                    revisit_index = cstring_array_num_strings(trans_table->revisit_strings);
                    cstring_array_add_string_len(trans_table->revisit_strings, revisit, revisit_len);
                }

                group_capture_array *groups = parse_groups(group_regex_str, group_regex_len);

                transliteration_replacement_t *trans_repl = transliteration_replacement_new(replacement_string_index, revisit_index, groups);

                uint32_t replacement_index = trans_table->replacements->n;
                transliteration_replacement_array_push(trans_table->replacements, trans_repl);

                int c;

                char *token;

                log_debug("Doing rule: %s\n", key);

                string_tree_t *tree = regex_string_tree(key, key_len);

                string_tree_t *pre_context_tree = NULL;
                string_tree_iterator_t *pre_context_iter = NULL;

                cstring_array *pre_context_strings = NULL;

                if (pre_context_type != CONTEXT_TYPE_NONE) {
                    pre_context_strings = cstring_array_new();
                }

                if (pre_context_type == CONTEXT_TYPE_REGEX) {
                    log_debug("pre_context_type == CONTEXT_TYPE_REGEX\n");
                    pre_context_tree = regex_string_tree(pre_context, pre_context_len);

                    pre_context_iter = string_tree_iterator_new(pre_context_tree);

                    char_array *pre_context_perm = char_array_new_size(pre_context_len);

                    for (; !string_tree_iterator_done(pre_context_iter); string_tree_iterator_next(pre_context_iter)) {
                        char_array_clear(pre_context_perm);
                        for (c = 0; c < pre_context_iter->num_tokens; c++) {
                            token = string_tree_iterator_get_string(pre_context_iter, c);
                            if (token == NULL || strlen(token) == 0) {
                                log_warn("pre_token_context is NULL or 0 length: %s\n", token);
                            }
                            char_array_cat(pre_context_perm, token);
                        }
                        token = char_array_get_string(pre_context_perm);
                        if (token == NULL || strlen(token) == 0) {
                            log_warn("pre_perm is NULL or 0 length\n");
                        }
                        cstring_array_add_string(pre_context_strings, token);
                    }

                    char_array_destroy(pre_context_perm);
                    string_tree_iterator_destroy(pre_context_iter);
                    string_tree_destroy(pre_context_tree);
                } else if (pre_context_type == CONTEXT_TYPE_STRING) {
                    if (pre_context == NULL || strlen(pre_context) == 0) {
                        log_warn("pre_context STRING NULL or 0 length\n");
                    }
                    cstring_array_add_string(pre_context_strings, pre_context);
                } else if (pre_context_type == CONTEXT_TYPE_WORD_BOUNDARY) {
                    cstring_array_add_string(pre_context_strings, WORD_BOUNDARY_CHAR);
                }

                size_t num_pre_context_strings = 0;
                if (pre_context_type != CONTEXT_TYPE_NONE) {
                    num_pre_context_strings = cstring_array_num_strings(pre_context_strings);
                    log_debug("num_pre_context_strings = %zu\n", num_pre_context_strings);
                }

                string_tree_t *post_context_tree = NULL;
                string_tree_iterator_t *post_context_iter = NULL;

                cstring_array *post_context_strings = NULL;

                if (post_context_type != CONTEXT_TYPE_NONE) {
                    post_context_strings = cstring_array_new();
                }

                if (post_context_type == CONTEXT_TYPE_REGEX) {
                    log_debug("post_context_type == CONTEXT_TYPE_REGEX\n");
                    post_context_tree = regex_string_tree(post_context, post_context_len);

                    post_context_iter = string_tree_iterator_new(post_context_tree);

                    char_array *post_context_perm = char_array_new_size(post_context_len);

                    for (; !string_tree_iterator_done(post_context_iter); string_tree_iterator_next(post_context_iter)) {
                        char_array_clear(post_context_perm);
                        for (c = 0; c < post_context_iter->num_tokens; c++) {
                            token = string_tree_iterator_get_string(post_context_iter, c);
                            if (token == NULL) {
                                log_error ("post_token_context is NULL\n");
                            } else if (strlen(token) == 0) {
                                log_error("post_token_context is 0 length\n");
                            }
                            char_array_cat(post_context_perm, token);
                        }

                        cstring_array_add_string(post_context_strings, char_array_get_string(post_context_perm));
                    }

                    char_array_destroy(post_context_perm);
                    string_tree_iterator_destroy(post_context_iter);
                    string_tree_destroy(post_context_tree);
                } else if (post_context_type == CONTEXT_TYPE_STRING) {
                    if (post_context == NULL || strlen(post_context) == 0) {
                        log_error("post_context STRING NULL or 0 length\n");
                    }
                    cstring_array_add_string(post_context_strings, post_context);
                } else if (post_context_type == CONTEXT_TYPE_WORD_BOUNDARY) {
                    cstring_array_add_string(post_context_strings, WORD_BOUNDARY_CHAR);
                }

                size_t num_post_context_strings = 0;
                if (post_context_type != CONTEXT_TYPE_NONE) {
                    num_post_context_strings = cstring_array_num_strings(post_context_strings);
                    log_debug("num_post_context_strings = %zu\n", num_post_context_strings);
                }

                cstring_array *context_strings = NULL;
                size_t num_context_strings = 0;
                char *context_start_char = NULL;
                bool combined_context_strings = false;

                int ante, post;

                if (num_pre_context_strings > 0 && num_post_context_strings > 0) {
                    context_start_char = PRE_CONTEXT_CHAR;
                    combined_context_strings = true;
                    size_t max_string_size = 2 * MAX_UTF8_CHAR_SIZE + 
                                             ((pre_context_max_len * MAX_UTF8_CHAR_SIZE) * 
                                             (post_context_max_len * MAX_UTF8_CHAR_SIZE));
                    num_context_strings = num_pre_context_strings * num_post_context_strings;
                    char_array *context = char_array_new_size(max_string_size);
                    context_strings = cstring_array_new_size(num_context_strings * max_string_size + num_context_strings);
                    for (ante = 0; ante < num_pre_context_strings; ante++) {
                        char_array_clear(context);

                        token = cstring_array_get_string(pre_context_strings, ante);
                        if (token == NULL || strlen(token) == 0) {
                            log_error("pre_context token was NULL or 0 length\n");
                            goto exit_teardown;
                        }

                        char_array_cat(context, token);
                        size_t context_len = strlen(char_array_get_string(context));

                        for (post = 0; post < num_post_context_strings; post++) {
                            context->n = context_len;
                            char_array_cat(context, POST_CONTEXT_CHAR);
                            token = cstring_array_get_string(post_context_strings, post);
                            if (token == NULL || strlen(token) == 0) {
                                log_error("post_context token was NULL or 0 length\n");
                                goto exit_teardown;
                            }
                            char_array_cat(context, token);

                            token = char_array_get_string(context);
                            cstring_array_add_string(context_strings, token);

                        }

                    }

                    char_array_destroy(context);

                } else if (num_pre_context_strings > 0) {
                    context_start_char = PRE_CONTEXT_CHAR;
                    num_context_strings = num_pre_context_strings;
                    context_strings = pre_context_strings;
                } else if (num_post_context_strings > 0) {
                    context_start_char = POST_CONTEXT_CHAR;
                    num_context_strings = num_post_context_strings;
                    context_strings = post_context_strings;
                }

                if (num_context_strings > 0) {
                    log_debug("num_context_strings = %zu\n", num_context_strings);
                }


                if (tree == NULL) {
                    log_error("Tree was NULL, rule=%s\n", key);
                    goto exit_teardown;
                }

                string_tree_iterator_t *iter = string_tree_iterator_new(tree);

                log_debug("iter->remaining=%d\n", iter->remaining);
                
                char *key_str;

                for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
                    rule_key->n = step_key_len;

                    for (c = 0; c < iter->num_tokens; c++) {
                        token = string_tree_iterator_get_string(iter, c);
                        if (token == NULL) {
                            log_error("string_tree_iterator_get_string was NULL: %s\n", key);
                            goto exit_teardown;
                        }
                        char_array_cat(rule_key, token);
                        log_debug("string_tree token was %s\n", token);
                    }

                    log_debug("rule_key=%s\n", char_array_get_string(rule_key));

                    size_t context_key_len;

                    if (num_context_strings == 0) {

                        token = char_array_get_string(rule_key);
                        if (trie_get(trie, token) == NULL_NODE_ID) {
                            trie_add(trie, token, replacement_index);
                        } else {
                            log_warn("Key exists: %s, skipping\n", token);                            
                        }
                    } else {
                        char_array_cat(rule_key, context_start_char);
                        context_key_len = strlen(char_array_get_string(rule_key));

                        for (c = 0; c < num_context_strings; c++) {
                            rule_key->n = context_key_len;
                            token = cstring_array_get_string(context_strings, c);
                            if (token == NULL) {
                                log_error("token was NULL for c=%d\n", c);
                            }
                            char_array_cat(rule_key, token);
                            token = char_array_get_string(rule_key);
                            if (trie_get(trie, token) == NULL_NODE_ID) {
                                trie_add(trie, token, replacement_index);
                            } else {
                                log_warn("Key exists: %s, skipping\n", token);
                            }
                        }

                    }

                }

                string_tree_iterator_destroy(iter);
                string_tree_destroy(tree);

                char_array_destroy(rule_key);

                if (pre_context_strings != NULL) {
                    cstring_array_destroy(pre_context_strings);
                }

                if (post_context_strings != NULL) {
                    cstring_array_destroy(post_context_strings);
                }

                // Only needed if we created a combined context array
                if (combined_context_strings) {
                    cstring_array_destroy(context_strings);
                }
            }

            char_array_destroy(step_key);

        }

        char_array_destroy(trans_key);

        if (!transliteration_table_add_transliterator(trans)) {
            goto exit_teardown;
        }

    }

    size_t num_source_scripts = sizeof(script_transliteration_rules) / sizeof(script_transliteration_rule_t);

    for (int i = 0; i < num_source_scripts; i++) {
        script_transliteration_rule_t rule = script_transliteration_rules[i];

        if (!transliteration_table_add_script_language(rule.script_language, rule.index)) {
            goto exit_teardown;
        }

        transliterator_index_t index = rule.index;

        for (int j = index.transliterator_index; j < index.transliterator_index + index.num_transliterators; j++) {
            char *trans_name = script_transliterators[j];
            if (trans_name == NULL) {
                goto exit_teardown;
            }
            cstring_array_add_string(trans_table->transliterator_names, trans_name);
        }

    }

    transliteration_table_write(f);
    fclose(f);
    transliteration_module_teardown();
    log_info("Done!\n");
    exit(EXIT_SUCCESS);

exit_teardown:
    log_error("FAIL\n");
    transliteration_module_teardown();
    exit(EXIT_FAILURE);

}
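
The check-then-add pattern above is repeated for every generated key; a small helper in the same style could factor it out (a sketch, not part of the builder):

// Sketch only: add a key to the trie only if it is not already present,
// mirroring the trie_get/trie_add/log_warn pattern used throughout the builder.
static void trie_add_if_absent(trie_t *trie, char *key, uint32_t value) {
    if (trie_get(trie, key) == NULL_NODE_ID) {
        trie_add(trie, key, value);
    } else {
        log_warn("Key exists: %s, skipping\n", key);
    }
}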
Example 3
geodb_t *geodb_init(char *dir) {
    if (dir == NULL) return NULL;

    // Zero-initialize so that geodb_destroy() is safe to call from any early-exit path below.
    geodb_t *gdb = calloc(1, sizeof(geodb_t));

    if (gdb == NULL) return NULL;

    char_array *path = char_array_new_size(strlen(dir));
    char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_NAMES_TRIE_FILENAME);

    char *names_path = char_array_get_string(path);

    gdb->names = trie_load(names_path);
    if (gdb->names == NULL) {
        goto exit_geodb_created;
    }

    char_array_clear(path);

    char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_FEATURES_TRIE_FILENAME);

    char *features_path = char_array_get_string(path);

    gdb->features = trie_load(features_path);
    if(gdb->features == NULL) {
        goto exit_geodb_created;
    }

    char_array_clear(path);

    char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_POSTAL_CODES_FILENAME);
    char *postal_codes_path = char_array_get_string(path);
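    // Postal codes file layout (as read below): a uint64 string count, a uint64 total byte
    // length, then that many bytes of concatenated string data.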

    FILE *f = fopen(postal_codes_path, "rb");

    if (f == NULL) {
        goto exit_geodb_created;
    }

    uint64_t num_postal_strings = 0;
    if (!file_read_uint64(f, &num_postal_strings)) {
        goto exit_geodb_created;
    }

    uint64_t postal_codes_str_len;

    if (!file_read_uint64(f, &postal_codes_str_len)) {
        goto exit_geodb_created;
    }

    char_array *array = char_array_new_size(postal_codes_str_len);

    if (!file_read_chars(f, array->a, postal_codes_str_len)) {
        goto exit_geodb_created;
    }

    array->n = postal_codes_str_len;

    gdb->postal_codes = cstring_array_from_char_array(array);

    if (cstring_array_num_strings(gdb->postal_codes) != num_postal_strings) {
        goto exit_geodb_created;
    }

    fclose(f);
    char_array_clear(path);

    char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_HASH_FILENAME);

    char *hash_file_path = strdup(char_array_get_string(path));

    char_array_clear(path);

    char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_LOG_FILENAME);

    char *log_path = char_array_get_string(path);

    gdb->hash_reader = NULL;

    if ((sparkey_hash_open(&gdb->hash_reader, hash_file_path, log_path)) != SPARKEY_SUCCESS) {
        free(hash_file_path);
        char_array_destroy(path);
        goto exit_geodb_created;
    }

    free(hash_file_path);
    char_array_destroy(path);

    gdb->log_iter = NULL;

    if ((sparkey_logiter_create(&gdb->log_iter, sparkey_hash_getreader(gdb->hash_reader))) != SPARKEY_SUCCESS) {
        goto exit_geodb_created;
    }

    gdb->value_buf = char_array_new_size(sparkey_logreader_maxvaluelen(sparkey_hash_getreader(gdb->hash_reader)));
    if (gdb->value_buf == NULL) {
        goto exit_geodb_created;
    }

    gdb->geoname = geoname_new();
    if (gdb->geoname == NULL) {
        goto exit_geodb_created;
    }

    gdb->postal_code = gn_postal_code_new();
    if (gdb->postal_code == NULL) {
        goto exit_geodb_created;
    }

    return gdb;

exit_geodb_created:
    geodb_destroy(gdb);
    return NULL;
}
Example 4
void add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, normalize_options_t options) {
    cstring_array *strings = tree->strings;

    bool have_suffix = suffix.len > 0;
    bool have_prefix = prefix.len > 0;

    address_expansion_array *prefix_expansions = NULL;
    address_expansion_array *suffix_expansions = NULL;

    address_expansion_t prefix_expansion;
    address_expansion_t suffix_expansion;

    char_array *key = char_array_new_size(token.len);
    char *expansion;

    uint64_t num_strings = 0;
    char *root_word = NULL;
    size_t root_len;
    token_t root_token;
    cstring_array *root_strings = NULL;
    int add_space = 0;
    int spaces = 0;

    size_t prefix_start, prefix_end, root_end, suffix_start;

    if (have_prefix) {
        prefix_expansions = get_affix_expansions(key, str, lang, token, prefix, false, options);
        if (prefix_expansions == NULL) have_prefix = false;
    }

    if (have_suffix) {
        suffix_expansions = get_affix_expansions(key, str, lang, token, suffix, true, options);
        if (suffix_expansions == NULL) have_suffix = false;
    }
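
    // Three cases follow: both a prefix and a suffix matched, a suffix only, or a prefix only.
    // Each branch emits every combination of expansion and root strings into the tree's string
    // array, with and without a separating space when an expansion is separable.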
    
    if (have_prefix && have_suffix) {
        for (int i = 0; i < prefix_expansions->n; i++) {
            prefix_expansion = prefix_expansions->a[i];
            char_array_clear(key);

            cat_affix_expansion(key, str, prefix_expansion, token, prefix);
            prefix_start = key->n - 1;

            add_space = (int)prefix_expansion.separable;
            if (prefix.len + suffix.len < token.len && !prefix_expansion.separable) {
                add_space = suffix_expansion.separable;
            }

            for (spaces = 0; spaces <= add_space; spaces++) {
                key->n = prefix_start;
                if (spaces) {
                    char_array_cat(key, " ");
                }

                prefix_end = key->n;

                if (prefix.len + suffix.len < token.len) {
                    root_len = token.len - suffix.len - prefix.len;
                    root_token = (token_t){token.offset + prefix.len, root_len, token.type};
                    root_strings = cstring_array_new_size(root_len);
                    add_normalized_strings_token(root_strings, str, root_token, options);
                    num_strings = cstring_array_num_strings(root_strings);

                    for (int j = 0; j < num_strings; j++) {
                        key->n = prefix_end;
                        root_word = cstring_array_get_string(root_strings, j);
                        char_array_cat(key, root_word);
                        root_end = key->n - 1;

                        for (int k = 0; k < suffix_expansions->n; k++) {
                            key->n = root_end;
                            suffix_expansion = suffix_expansions->a[k];

                            int add_suffix_space = suffix_expansion.separable;

                            suffix_start = key->n;
                            for (int suffix_spaces = 0; suffix_spaces <= add_suffix_space; suffix_spaces++) {
                                key->n = suffix_start;
                                if (suffix_spaces) {
                                    char_array_cat(key, " ");
                                }

                                cat_affix_expansion(key, str, suffix_expansion, token, suffix);

                                expansion = char_array_get_string(key);
                                cstring_array_add_string(strings, expansion);

                            }


                        }
                    }

                } else {
                    for (int j = 0; j < suffix_expansions->n; j++) {
                        key->n = prefix_end;
                        suffix_expansion = suffix_expansions->a[j];

                        cat_affix_expansion(key, str, suffix_expansion, token, suffix);

                        expansion = char_array_get_string(key);
                        cstring_array_add_string(tree->strings, expansion);
                    }
                }
            }

        }
    } else if (have_suffix) {
        root_len = suffix.start;
        root_token = (token_t){token.offset, root_len, token.type};
        root_strings = cstring_array_new_size(root_len);
        add_normalized_strings_token(root_strings, str, root_token, options);
        num_strings = cstring_array_num_strings(root_strings);

        for (int j = 0; j < num_strings; j++) {
            char_array_clear(key);
            root_word = cstring_array_get_string(root_strings, j);
            char_array_cat(key, root_word);

            root_end = key->n - 1;

            for (int k = 0; k < suffix_expansions->n; k++) {
                key->n = root_end;
                suffix_expansion = suffix_expansions->a[k];

                add_space = suffix_expansion.separable;
                suffix_start = key->n;

                for (int spaces = 0; spaces <= add_space; spaces++) {
                    key->n = suffix_start;
                    if (spaces) {
                        char_array_cat(key, " ");
                    }

                    cat_affix_expansion(key, str, suffix_expansion, token, suffix);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }
            }
        }
    } else if (have_prefix) {
        root_len = token.len - prefix.len;
        root_token = (token_t){token.offset + prefix.len, root_len, token.type};
        root_strings = cstring_array_new_size(root_len);
        add_normalized_strings_token(root_strings, str, root_token, options);
        num_strings = cstring_array_num_strings(root_strings);

        for (int j = 0; j < prefix_expansions->n; j++) {
            char_array_clear(key);
            prefix_expansion = prefix_expansions->a[j];

            cat_affix_expansion(key, str, prefix_expansion, token, prefix);
            prefix_end = key->n - 1;

            add_space = prefix_expansion.separable;
            for (int spaces = 0; spaces <= add_space; spaces++) {
                key->n = prefix_end;
                if (spaces) {
                    char_array_cat(key, " ");
                }
                for (int k = 0; k < num_strings; k++) {
                    root_word = cstring_array_get_string(root_strings, k);
                    char_array_cat(key, root_word);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }

            }
        }
    }

    char_array_destroy(key);

    if (root_strings != NULL) {
        cstring_array_destroy(root_strings);
    }

}
Example 5
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    for (int i = 0; i < options.num_languages; i++)  {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        
        if (lang_phrases == NULL) { 
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }


    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        int start = 0;
        int end = 0;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];
            char_array_clear(key);

            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j]; 
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);       
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                if (expansions != NULL) {
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    uint32_t start_index = cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);

                            }
                        } else {
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }

                        }
                    }

                    string_tree_finalize_token(tree);

                }
            } else {
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);

                }
            }

            start = phrase.start + phrase.len;

        }

        char_array_destroy(key);

        end = (int)tokens->n;

        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j]; 
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);       
        }


    } else {
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}
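
A sketch (not part of the library) of one way to consume the returned tree, using the string_tree iterator and char_array helpers seen in the earlier examples:

// Sketch only: log every alternative produced by add_string_alternatives.
static void print_alternatives(char *str, normalize_options_t options) {
    string_tree_t *tree = add_string_alternatives(str, options);
    if (tree == NULL) return;

    string_tree_iterator_t *iter = string_tree_iterator_new(tree);
    char_array *permutation = char_array_new();

    for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
        char_array_clear(permutation);
        for (int c = 0; c < iter->num_tokens; c++) {
            char *token = string_tree_iterator_get_string(iter, c);
            if (token != NULL) {
                char_array_cat(permutation, token);
            }
        }
        log_info("alternative: %s\n", char_array_get_string(permutation));
    }

    char_array_destroy(permutation);
    string_tree_iterator_destroy(iter);
    string_tree_destroy(tree);
}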
Example 6
static PyObject *py_normalize_token(PyObject *self, PyObject *args) 
{
    PyObject *s;

    uint32_t offset;
    uint32_t len;
    uint16_t type;

    uint64_t options;
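    // Arguments: a string object, an (offset, len, type) token tuple, and a uint64 options bitmask.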
    if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) {
        PyErr_SetString(PyExc_TypeError,
                        "Error parsing arguments");
        return 0;
    }

    token_t token = (token_t){(size_t)offset, (size_t)len, type};

    PyObject *unistr = PyUnicode_FromObject(s);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

    #ifdef IS_PY3K
        // Python 3 encoding, supported by Python 3.3+

        char *input = PyUnicode_AsUTF8(unistr);

    #else
        // Python 2 encoding

        PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
        if (str == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "Parameter could not be utf-8 encoded");
            goto exit_decref_unistr;
        }

        char *input = PyBytes_AsString(str);

    #endif

    if (input == NULL) {
        goto exit_decref_str;
    }

    char_array *token_buffer = char_array_new_size(token.len);

    add_normalized_token(token_buffer, input, token, options);
    char *token_str = char_array_get_string(token_buffer);
    PyObject *result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict");

    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Error decoding token");
        char_array_destroy(token_buffer);
        goto exit_decref_str;
    }

    char_array_destroy(token_buffer);

    #ifndef IS_PY3K
    Py_XDECREF(str);
    #endif
    Py_XDECREF(unistr);

    return result;

exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}
Example 7
bool address_dictionary_read(FILE *f) {
    if (address_dict != NULL) return false;

    uint32_t signature;

    if (!file_read_uint32(f, &signature) || signature != ADDRESS_DICTIONARY_SIGNATURE) {
        return false;
    }

    // Zero-initialize so that address_dictionary_destroy() is safe on the early-exit paths below.
    address_dict = calloc(1, sizeof(address_dictionary_t));
    if (address_dict == NULL) return false;

    uint32_t canonical_str_len;

    if (!file_read_uint32(f, &canonical_str_len)) {
        goto exit_address_dict_created;
    }

    char_array *array = char_array_new_size(canonical_str_len);

    if (array == NULL) {
        goto exit_address_dict_created;
    }

    if (!file_read_chars(f, array->a, canonical_str_len)) {
        char_array_destroy(array);
        goto exit_address_dict_created;
    }

    array->n = canonical_str_len;

    address_dict->canonical = cstring_array_from_char_array(array);

    uint32_t num_values;

    if (!file_read_uint32(f, &num_values)) {
        goto exit_address_dict_created;
    }

    address_dict->values = address_expansion_value_array_new_size(num_values);

    for (uint32_t i = 0; i < num_values; i++) {
        address_expansion_value_t *value = address_expansion_value_read(f);
        if (value == NULL) {
            goto exit_address_dict_created;
        }
        address_expansion_value_array_push(address_dict->values, value);
    }

    address_dict->trie = trie_read(f);

    if (address_dict->trie == NULL) {
        goto exit_address_dict_created;
    }

    return true;

exit_address_dict_created:
    address_dictionary_destroy(address_dict);
    return false;
}
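
A minimal loader sketch; the wrapper function and its error message are hypothetical, only address_dictionary_read itself comes from the code above:

// Sketch only: read an address dictionary from a file on disk.
static bool load_address_dictionary(char *path) {
    FILE *f = fopen(path, "rb");
    if (f == NULL) {
        log_error("Could not open address dictionary file: %s\n", path);
        return false;
    }

    bool loaded = address_dictionary_read(f);
    fclose(f);
    return loaded;
}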
Example 8
int main(int argc, char **argv) {
    char *filename;

    if (argc == 2) {
        filename = argv[1];
    } else {
        filename = DEFAULT_NUMEX_PATH;
    }

    FILE *f = fopen(filename, "wb");

    if (f == NULL) {
        log_error("File could not be opened, ensure directory exists: %s\n", filename);
        numex_module_teardown();
        exit(1);
    }

    if (!numex_module_init()) {
        log_error("Numex table initialization unsuccessful\n");
        numex_module_teardown();
        exit(1);
    }

    numex_table_t *numex_table = get_numex_table();

    size_t num_languages = sizeof(numex_languages) / sizeof(numex_language_source_t);

    size_t num_source_keys = sizeof(numex_keys) / sizeof(char *);
    size_t num_source_rules = sizeof(numex_rules) / sizeof(numex_rule_t);

    if (num_source_keys != num_source_rules) {
        log_error("num_sourcE_keys != num_source_rules, aborting\n");
        numex_module_teardown();
        exit(1);
    }

    size_t num_ordinal_indicator_rules = sizeof(ordinal_indicator_rules) / sizeof(ordinal_indicator_t);

    char_array *key = char_array_new();

    for (int i = 0; i < num_languages; i++) {
        numex_language_source_t lang_source = numex_languages[i];

        char *lang = lang_source.name;

        int j;

        size_t rule_index = lang_source.rule_index;
        size_t num_rules = lang_source.num_rules;
        size_t ordinal_indicator_index = lang_source.ordinal_indicator_index;
        size_t num_ordinal_indicators = lang_source.num_ordinal_indicators;

        uint32_t value;

        log_info("Doing language=%s\n", lang);

        for (j = rule_index; j < rule_index + num_rules; j++) {
            char *numex_key = numex_keys[j];
            numex_rule_t rule = numex_rules[j];

            value = rule.rule_type != NUMEX_STOPWORD ? numex_table->rules->n : NUMEX_STOPWORD_INDEX;
            numex_rule_array_push(numex_table->rules, rule);

            char_array_clear(key);
            char_array_cat(key, lang);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
            char_array_cat(key, numex_key);

            char *str_key = char_array_get_string(key);

            trie_add(numex_table->trie, str_key, value);
        }

        for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) {
            value = numex_table->ordinal_indicators->n;
            ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];

            if (ordinal_source.key == NULL) {
                log_error("ordinal source key was NULL at index %d\n", j);
                exit(EXIT_FAILURE);
            }

            char *ordinal_indicator_key = strdup(ordinal_source.key);
            if (ordinal_indicator_key == NULL) {
                log_error("Error in strdup\n");
                exit(EXIT_FAILURE);
            }

            char *suffix = NULL;
            if (ordinal_source.suffix != NULL) {
                suffix = strdup(ordinal_source.suffix);
                if (suffix == NULL) {
                    log_error("Error in strdup\n");
                    exit(EXIT_FAILURE);
                }
            }
            ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix);
            ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);            
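            // The trie key is namespaced as: lang + ORDINAL_NAMESPACE_PREFIX + a gender prefix +
            // a category prefix + NAMESPACE_SEPARATOR_CHAR, followed by the UTF-8-reversed
            // indicator key (built below).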

            char_array_clear(key);
            char_array_cat(key, lang);
            char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);

            switch (ordinal_source.gender) {
                case GENDER_MASCULINE:
                    char_array_cat(key, GENDER_MASCULINE_PREFIX);
                    break;
                case GENDER_FEMININE:
                    char_array_cat(key, GENDER_FEMININE_PREFIX);
                    break;
                case GENDER_NEUTER:
                    char_array_cat(key, GENDER_NEUTER_PREFIX);
                    break;
                case GENDER_NONE:
                default:
                    char_array_cat(key, GENDER_NONE_PREFIX);
            }

            switch (ordinal_source.category) {
                case CATEGORY_PLURAL:
                    char_array_cat(key, CATEGORY_PLURAL_PREFIX);
                    break;
                case CATEGORY_DEFAULT:
                default:
                    char_array_cat(key, CATEGORY_DEFAULT_PREFIX);

            }

            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            char *reversed = utf8_reversed_string(ordinal_source.key);
            char_array_cat(key, reversed);
            free(reversed);

            char *str_key = char_array_get_string(key);

            if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
                trie_add(numex_table->trie, str_key, value);
            } else {
                log_warn("Key exists: %s, skipping\n", str_key);                            
            }
        }

        char *name = strdup(lang_source.name);
        if (name == NULL) {
            log_error("Error in strdup\n");
            exit(EXIT_FAILURE);
        }

        numex_language_t *language = numex_language_new(name, lang_source.whole_tokens_only, lang_source.rule_index, lang_source.num_rules, lang_source.ordinal_indicator_index, lang_source.num_ordinal_indicators);
        numex_table_add_language(language);

    }

    char_array_destroy(key);

    if (!numex_table_write(f)) {
        log_error("Error writing numex table\n");
        exit(1);
    }

    fclose(f);

    numex_module_teardown();

    log_info("Done\n");
}