/**
 * Join the tokens covered by `phrase` into one space-separated string.
 *
 * Clears `phrase_tokens`, appends tokens phrase.start .. phrase.start +
 * phrase.len - 1 of `str` with a single " " between consecutive tokens,
 * terminates the buffer and returns its string.
 *
 * The returned pointer comes from char_array_get_string(phrase_tokens), so it
 * refers to the caller-supplied buffer rather than to a fresh allocation.
 */
inline char *cstring_array_get_phrase(cstring_array *str, char_array *phrase_tokens, phrase_t phrase) {
    char_array_clear(phrase_tokens);

    size_t phrase_start = phrase.start;
    size_t phrase_end = phrase_start + phrase.len;

    // size_t index: the bound is a size_t, so avoid a signed/unsigned comparison.
    for (size_t k = phrase_start; k < phrase_end; k++) {
        char *w = cstring_array_get_string(str, k);
        char_array_append(phrase_tokens, w);

        // Separator goes between tokens only, never after the last one.
        if (k + 1 < phrase_end) {
            char_array_append(phrase_tokens, " ");
        }
    }

    char_array_terminate(phrase_tokens);
    return char_array_get_string(phrase_tokens);
}
/**
 * Look up the dictionary expansions for a prefix or suffix of a token.
 *
 * The lookup only happens when the phrase's components overlap
 * options.address_components and the phrase is either separable or not
 * canonical; otherwise NULL is returned and `key` is left untouched.
 *
 * The lookup key written into `key` is:
 *   lang + NAMESPACE_SEPARATOR_CHAR + TRIE_SUFFIX_CHAR + reversed affix   (reverse)
 *   lang + NAMESPACE_SEPARATOR_CHAR + TRIE_PREFIX_CHAR + affix            (!reverse)
 * where the affix is the phrase.len bytes at str + token.offset + phrase.start.
 *
 * Returns whatever address_dictionary_get_expansions() yields for that key.
 */
address_expansion_array *get_affix_expansions(char_array *key, char *str, char *lang, token_t token, phrase_t phrase, bool reverse, normalize_options_t options) {
    expansion_value_t value;
    value.value = phrase.data;

    // Phrase does not apply to any of the requested address components.
    if (!(value.components & options.address_components)) {
        return NULL;
    }

    // A canonical, non-separable affix is never expanded.
    if (!value.separable && value.canonical) {
        return NULL;
    }

    char *affix = str + token.offset + phrase.start;

    char_array_clear(key);
    char_array_cat(key, lang);
    char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

    if (!reverse) {
        char_array_cat(key, TRIE_PREFIX_CHAR);
        char_array_cat_len(key, affix, phrase.len);
    } else {
        char_array_cat(key, TRIE_SUFFIX_CHAR);
        char_array_cat_reversed_len(key, affix, phrase.len);
    }

    char *key_str = char_array_get_string(key);
    log_debug("key_str=%s\n", key_str);

    return address_dictionary_get_expansions(key_str);
}
geodb_t *geodb_init(char *dir) { if (dir == NULL) return NULL; geodb_t *gdb = malloc(sizeof(geodb_t)); if (gdb == NULL) return NULL; char_array *path = char_array_new_size(strlen(dir)); char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_NAMES_TRIE_FILENAME); char *names_path = char_array_get_string(path); gdb->names = trie_load(names_path); if (gdb->names == NULL) { goto exit_geodb_created; } char_array_clear(path); char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_FEATURES_TRIE_FILENAME); char *features_path = char_array_get_string(path); gdb->features = trie_load(features_path); if(gdb->features == NULL) { goto exit_geodb_created; } char_array_clear(path); char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_POSTAL_CODES_FILENAME); char *postal_codes_path = char_array_get_string(path); FILE *f = fopen(postal_codes_path, "rb"); uint64_t num_postal_strings = 0; if (!file_read_uint64(f, (uint64_t *)&num_postal_strings)) { goto exit_geodb_created; } size_t postal_codes_str_len; if (!file_read_uint64(f, (uint64_t *)&postal_codes_str_len)) { goto exit_geodb_created; } char_array *array = char_array_new_size(postal_codes_str_len); if (!file_read_chars(f, array->a, postal_codes_str_len)) { goto exit_geodb_created; } array->n = postal_codes_str_len; gdb->postal_codes = cstring_array_from_char_array(array); if (cstring_array_num_strings(gdb->postal_codes) != num_postal_strings) { goto exit_geodb_created; } fclose(f); char_array_clear(path); char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_HASH_FILENAME); char *hash_file_path = strdup(char_array_get_string(path)); char_array_clear(path); char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_LOG_FILENAME); char *log_path = char_array_get_string(path); gdb->hash_reader = NULL; if ((sparkey_hash_open(&gdb->hash_reader, hash_file_path, log_path)) != SPARKEY_SUCCESS) { free(hash_file_path); char_array_destroy(path); goto exit_geodb_created; } free(hash_file_path); 
char_array_destroy(path); gdb->log_iter = NULL; if ((sparkey_logiter_create(&gdb->log_iter, sparkey_hash_getreader(gdb->hash_reader))) != SPARKEY_SUCCESS) { goto exit_geodb_created; } gdb->value_buf = char_array_new_size(sparkey_logreader_maxvaluelen(sparkey_hash_getreader(gdb->hash_reader))); if (gdb->value_buf == NULL) { goto exit_geodb_created; } gdb->geoname = geoname_new(); if (gdb->geoname == NULL) { goto exit_geodb_created; } gdb->postal_code = gn_postal_code_new(); if (gdb->postal_code == NULL) { goto exit_geodb_created; } return gdb; exit_geodb_created: geodb_destroy(gdb); return NULL; }
int main(int argc, char **argv) { char *filename; if (argc == 2) { filename = argv[1]; } else { filename = DEFAULT_TRANSLITERATION_PATH; } FILE *f = fopen(filename, "wb"); if (f == NULL) { log_error("File could not be opened, ensure directory exists: %s", filename); exit(1); } size_t num_source_transliterators = sizeof(transliterators_source) / sizeof(transliterator_source_t); char *key; size_t key_len; context_type_t pre_context_type; size_t pre_context_max_len; char *pre_context; size_t pre_context_len; context_type_t post_context_type; size_t post_context_max_len; char *post_context; size_t post_context_len; char *replacement; size_t replacement_len; char *revisit; size_t revisit_len; char *group_regex_str; size_t group_regex_len; transliteration_module_init(); transliteration_table_t *trans_table = get_transliteration_table(); trie_t *trie = trans_table->trie; for (int i = 0; i < num_source_transliterators; i++) { transliterator_source_t trans_source = transliterators_source[i]; size_t trans_name_len = strlen(trans_source.name); log_info("Doing transliterator: %s\n", trans_source.name); char_array *trans_key = char_array_from_string(trans_source.name); char_array_cat(trans_key, NAMESPACE_SEPARATOR_CHAR); char *trans_name = strdup(trans_source.name); if (trans_name == NULL) { log_error("strdup returned NULL on trans_source.name\n"); goto exit_teardown; } transliterator_t *trans = transliterator_new(trans_name, trans_source.internal, trans_table->steps->n, trans_source.steps_length); for (int j = 0; j < trans_source.steps_length; j++) { transliteration_step_source_t step_source = steps_source[trans_source.steps_start + j]; size_t step_name_len = strlen(step_source.name); log_debug("Doing step: %s, type=%d\n", step_source.name, step_source.type); if (!transliteration_table_add_step(trans_table, step_source.type, step_source.name)) { log_error("Step couldn't be added\n"); goto exit_teardown; } if (step_source.type != STEP_RULESET) { continue; } char_array *step_key 
= char_array_from_string(char_array_get_string(trans_key)); char_array_cat(step_key, step_source.name); char_array_cat(step_key, NAMESPACE_SEPARATOR_CHAR); char *step_key_str = char_array_get_string(step_key); size_t step_key_len = strlen(step_key_str); for (int k = 0; k < step_source.rules_length; k++) { transliteration_rule_source_t rule_source = rules_source[step_source.rules_start + k]; key = rule_source.key; key_len = rule_source.key_len; pre_context_type = rule_source.pre_context_type; pre_context_max_len = rule_source.pre_context_max_len; pre_context = rule_source.pre_context; pre_context_len = rule_source.pre_context_len; post_context_type = rule_source.post_context_type; post_context_max_len = rule_source.post_context_max_len; post_context = rule_source.post_context; post_context_len = rule_source.post_context_len; replacement = rule_source.replacement; replacement_len = rule_source.replacement_len; revisit = rule_source.revisit; revisit_len = rule_source.revisit_len; group_regex_str = rule_source.group_regex_str; group_regex_len = rule_source.group_regex_len; uint32_t data = trans_table->replacements->n; char_array *rule_key = char_array_from_string(step_key_str); uint32_t replacement_string_index = cstring_array_num_strings(trans_table->replacement_strings); cstring_array_add_string_len(trans_table->replacement_strings, replacement, replacement_len); uint32_t revisit_index = 0; if (revisit != NULL && revisit_len > 0) { revisit_index = cstring_array_num_strings(trans_table->revisit_strings); cstring_array_add_string_len(trans_table->revisit_strings, revisit, revisit_len); } group_capture_array *groups = parse_groups(group_regex_str, group_regex_len); transliteration_replacement_t *trans_repl = transliteration_replacement_new(replacement_string_index, revisit_index, groups); uint32_t replacement_index = trans_table->replacements->n; transliteration_replacement_array_push(trans_table->replacements, trans_repl); int c; char *token; log_debug("Doing rule: 
%s\n", key); string_tree_t *tree = regex_string_tree(key, key_len); string_tree_t *pre_context_tree = NULL; string_tree_iterator_t *pre_context_iter = NULL; cstring_array *pre_context_strings = NULL; if (pre_context_type != CONTEXT_TYPE_NONE) { pre_context_strings = cstring_array_new(); } if (pre_context_type == CONTEXT_TYPE_REGEX) { log_debug("pre_context_type == CONTEXT_TYPE_REGEX\n"); pre_context_tree = regex_string_tree(pre_context, pre_context_len); pre_context_iter = string_tree_iterator_new(pre_context_tree); char_array *pre_context_perm = char_array_new_size(pre_context_len); for (; !string_tree_iterator_done(pre_context_iter); string_tree_iterator_next(pre_context_iter)) { char_array_clear(pre_context_perm); for (c = 0; c < pre_context_iter->num_tokens; c++) { token = string_tree_iterator_get_string(pre_context_iter, c); if (token == NULL || strlen(token) == 0) { log_warn("pre_token_context is NULL or 0 length: %s\n", token); } char_array_cat(pre_context_perm, token); } token = char_array_get_string(pre_context_perm); if (token == NULL || strlen(token) == 0) { log_warn("pre_perm is NULL or 0 length\n"); } cstring_array_add_string(pre_context_strings, token); } char_array_destroy(pre_context_perm); string_tree_iterator_destroy(pre_context_iter); string_tree_destroy(pre_context_tree); } else if (pre_context_type == CONTEXT_TYPE_STRING) { if (pre_context == NULL || strlen(pre_context) == 0) { log_warn("pre_context STRING NULL or 0 length\n"); } cstring_array_add_string(pre_context_strings, pre_context); } else if (pre_context_type == CONTEXT_TYPE_WORD_BOUNDARY) { cstring_array_add_string(pre_context_strings, WORD_BOUNDARY_CHAR); } size_t num_pre_context_strings = 0; if (pre_context_type != CONTEXT_TYPE_NONE) { num_pre_context_strings = cstring_array_num_strings(pre_context_strings); log_debug("num_pre_context_strings = %zu\n", num_pre_context_strings); } string_tree_t *post_context_tree = NULL; string_tree_iterator_t *post_context_iter = NULL; cstring_array 
*post_context_strings = NULL; if (post_context_type != CONTEXT_TYPE_NONE) { post_context_strings = cstring_array_new(); } if (post_context_type == CONTEXT_TYPE_REGEX) { log_debug("post_context_type == CONTEXT_TYPE_REGEX\n"); post_context_tree = regex_string_tree(post_context, post_context_len); post_context_iter = string_tree_iterator_new(post_context_tree); char_array *post_context_perm = char_array_new_size(post_context_len); for (; !string_tree_iterator_done(post_context_iter); string_tree_iterator_next(post_context_iter)) { char_array_clear(post_context_perm); for (c = 0; c < post_context_iter->num_tokens; c++) { token = string_tree_iterator_get_string(post_context_iter, c); if (token == NULL) { log_error ("post_token_context is NULL\n"); } else if (strlen(token) == 0) { log_error("post_token_context is 0 length\n"); } char_array_cat(post_context_perm, token); } cstring_array_add_string(post_context_strings, char_array_get_string(post_context_perm)); } char_array_destroy(post_context_perm); string_tree_iterator_destroy(post_context_iter); string_tree_destroy(post_context_tree); } else if (post_context_type == CONTEXT_TYPE_STRING) { if (post_context == NULL || strlen(post_context) == 0) { log_error("post_context STRING NULL or 0 length\n"); } cstring_array_add_string(post_context_strings, post_context); } else if (post_context_type == CONTEXT_TYPE_WORD_BOUNDARY) { cstring_array_add_string(post_context_strings, WORD_BOUNDARY_CHAR); } size_t num_post_context_strings = 0; if (post_context_type != CONTEXT_TYPE_NONE) { num_post_context_strings = cstring_array_num_strings(post_context_strings); log_debug("num_post_context_strings = %zu\n", num_post_context_strings); } cstring_array *context_strings = NULL; size_t num_context_strings = 0; char *context_start_char = NULL; bool combined_context_strings = false; int ante, post; if (num_pre_context_strings > 0 && num_post_context_strings > 0) { context_start_char = PRE_CONTEXT_CHAR; combined_context_strings = true; size_t 
max_string_size = 2 * MAX_UTF8_CHAR_SIZE + ((pre_context_max_len * MAX_UTF8_CHAR_SIZE) * (post_context_max_len * MAX_UTF8_CHAR_SIZE)); num_context_strings = num_pre_context_strings * num_post_context_strings; char_array *context = char_array_new_size(max_string_size); context_strings = cstring_array_new_size(num_context_strings * max_string_size + num_context_strings); for (ante = 0; ante < num_pre_context_strings; ante++) { char_array_clear(context); token = cstring_array_get_string(pre_context_strings, ante); if (token == NULL || strlen(token) == 0) { log_error("pre_context token was NULL or 0 length\n"); goto exit_teardown; } char_array_cat(context, token); size_t context_len = strlen(char_array_get_string(context)); for (post = 0; post < num_post_context_strings; post++) { context->n = context_len; char_array_cat(context, POST_CONTEXT_CHAR); token = cstring_array_get_string(post_context_strings, post); char_array_cat(context, token); if (token == NULL || strlen(token) == 0) { log_error("post_context token was NULL or 0 length\n"); goto exit_teardown; } token = char_array_get_string(context); cstring_array_add_string(context_strings, token); } } char_array_destroy(context); } else if (num_pre_context_strings > 0) { context_start_char = PRE_CONTEXT_CHAR; num_context_strings = num_pre_context_strings; context_strings = pre_context_strings; } else if (num_post_context_strings > 0) { context_start_char = POST_CONTEXT_CHAR; num_context_strings = num_post_context_strings; context_strings = post_context_strings; } if (num_context_strings > 0) { log_debug("num_context_strings = %zu\n", num_context_strings); } if (tree == NULL) { log_error("Tree was NULL, rule=%s\n", key); goto exit_teardown; } string_tree_iterator_t *iter = string_tree_iterator_new(tree); log_debug("iter->remaining=%d\n", iter->remaining); char *key_str; for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { rule_key->n = step_key_len; for (c = 0; c < iter->num_tokens; c++) { token 
= string_tree_iterator_get_string(iter, c); if (token == NULL) { log_error("string_tree_iterator_get_string was NULL: %s\n", key); goto exit_teardown; } char_array_cat(rule_key, token); log_debug("string_tree token was %s\n", token); } log_debug("rule_key=%s\n", char_array_get_string(rule_key)); size_t context_key_len; if (num_context_strings == 0) { token = char_array_get_string(rule_key); if (trie_get(trie, token) == NULL_NODE_ID) { trie_add(trie, token, replacement_index); } else { log_warn("Key exists: %s, skipping\n", token); } } else { char_array_cat(rule_key, context_start_char); context_key_len = strlen(char_array_get_string(rule_key)); for (c = 0; c < num_context_strings; c++) { rule_key->n = context_key_len; token = cstring_array_get_string(context_strings, c); if (token == NULL) { log_error("token was NULL for c=%d\n", c); } char_array_cat(rule_key, token); token = char_array_get_string(rule_key); if (trie_get(trie, token) == NULL_NODE_ID) { trie_add(trie, token, replacement_index); } else { log_warn("Key exists: %s, skipping\n", token); } } } } string_tree_iterator_destroy(iter); string_tree_destroy(tree); char_array_destroy(rule_key); if (pre_context_strings != NULL) { cstring_array_destroy(pre_context_strings); } if (post_context_strings != NULL) { cstring_array_destroy(post_context_strings); } // Only needed if we created a combined context array if (combined_context_strings) { cstring_array_destroy(context_strings); } } char_array_destroy(step_key); } char_array_destroy(trans_key); if (!transliteration_table_add_transliterator(trans)) { goto exit_teardown; } } size_t num_source_scripts = sizeof(script_transliteration_rules) / sizeof(script_transliteration_rule_t); for (int i = 0; i < num_source_scripts; i++) { script_transliteration_rule_t rule = script_transliteration_rules[i]; if (!transliteration_table_add_script_language(rule.script_language, rule.index)) { goto exit_teardown; } transliterator_index_t index = rule.index; for (int j = 
index.transliterator_index; j < index.transliterator_index + index.num_transliterators; j++) { char *trans_name = script_transliterators[j]; if (trans_name == NULL) { goto exit_teardown; } cstring_array_add_string(trans_table->transliterator_names, trans_name); } } transliteration_table_write(f); fclose(f); transliteration_module_teardown(); log_info("Done!\n"); exit(EXIT_SUCCESS); exit_teardown: log_error("FAIL\n"); transliteration_module_teardown(); exit(EXIT_FAILURE); }
void add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, normalize_options_t options) { cstring_array *strings = tree->strings; bool have_suffix = suffix.len > 0; bool have_prefix = prefix.len > 0; address_expansion_array *prefix_expansions = NULL; address_expansion_array *suffix_expansions = NULL; address_expansion_t prefix_expansion; address_expansion_t suffix_expansion; char_array *key = char_array_new_size(token.len); char *expansion; uint64_t num_strings = 0; char *root_word = NULL; size_t root_len; token_t root_token; cstring_array *root_strings = NULL; int add_space = 0; int spaces = 0; size_t prefix_start, prefix_end, root_end, suffix_start; if (have_prefix) { prefix_expansions = get_affix_expansions(key, str, lang, token, prefix, false, options); if (prefix_expansions == NULL) have_prefix = false; } if (have_suffix) { suffix_expansions = get_affix_expansions(key, str, lang, token, suffix, true, options); if (suffix_expansions == NULL) have_suffix = false; } if (have_prefix && have_suffix) { for (int i = 0; i < prefix_expansions->n; i++) { prefix_expansion = prefix_expansions->a[i]; char_array_clear(key); cat_affix_expansion(key, str, prefix_expansion, token, prefix); prefix_start = key->n - 1; add_space = (int)prefix_expansion.separable; if (prefix.len + suffix.len < token.len && !prefix_expansion.separable) { add_space = suffix_expansion.separable; } for (spaces = 0; spaces <= add_space; spaces++) { key->n = prefix_start; if (spaces) { char_array_cat(key, " "); } prefix_end = key->n; if (prefix.len + suffix.len < token.len) { root_len = token.len - suffix.len - prefix.len; root_token = (token_t){token.offset + prefix.len, root_len, token.type}; root_strings = cstring_array_new_size(root_len); add_normalized_strings_token(root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); for (int j = 0; j < num_strings; j++) { key->n = prefix_end; root_word = 
cstring_array_get_string(root_strings, j); char_array_cat(key, root_word); root_end = key->n - 1; for (int k = 0; k < suffix_expansions->n; k++) { key->n = root_end; suffix_expansion = suffix_expansions->a[k]; int add_suffix_space = suffix_expansion.separable; suffix_start = key->n; for (int suffix_spaces = 0; suffix_spaces <= add_suffix_space; suffix_spaces++) { key->n = suffix_start; if (suffix_spaces) { char_array_cat(key, " "); } cat_affix_expansion(key, str, suffix_expansion, token, suffix); expansion = char_array_get_string(key); cstring_array_add_string(strings, expansion); } } } } else { for (int j = 0; j < suffix_expansions->n; j++) { key->n = prefix_end; suffix_expansion = suffix_expansions->a[j]; cat_affix_expansion(key, str, suffix_expansion, token, suffix); expansion = char_array_get_string(key); cstring_array_add_string(tree->strings, expansion); } } } } } else if (have_suffix) { root_len = suffix.start; root_token = (token_t){token.offset, root_len, token.type}; root_strings = cstring_array_new_size(root_len); add_normalized_strings_token(root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); for (int j = 0; j < num_strings; j++) { char_array_clear(key); root_word = cstring_array_get_string(root_strings, j); char_array_cat(key, root_word); root_end = key->n - 1; for (int k = 0; k < suffix_expansions->n; k++) { key->n = root_end; suffix_expansion = suffix_expansions->a[k]; add_space = suffix_expansion.separable; suffix_start = key->n; for (int spaces = 0; spaces <= add_space; spaces++) { key->n = suffix_start; if (spaces) { char_array_cat(key, " "); } cat_affix_expansion(key, str, suffix_expansion, token, suffix); expansion = char_array_get_string(key); cstring_array_add_string(tree->strings, expansion); } } } } else if (have_prefix) { root_len = token.len - prefix.len; root_token = (token_t){token.offset + prefix.len, root_len, token.type}; root_strings = cstring_array_new_size(root_len); 
add_normalized_strings_token(root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); for (int j = 0; j < prefix_expansions->n; j++) { char_array_clear(key); prefix_expansion = prefix_expansions->a[j]; cat_affix_expansion(key, str, prefix_expansion, token, prefix); prefix_end = key->n - 1; add_space = prefix_expansion.separable; for (int spaces = 0; spaces <= add_space; spaces++) { key->n = prefix_end; if (spaces) { char_array_cat(key, " "); } for (int k = 0; k < num_strings; k++) { root_word = cstring_array_get_string(root_strings, k); char_array_cat(key, root_word); expansion = char_array_get_string(key); cstring_array_add_string(tree->strings, expansion); } } } } char_array_destroy(key); if (root_strings != NULL) { cstring_array_destroy(root_strings); } }
/**
 * Build a string tree of alternatives for `str` from the address dictionaries.
 *
 * The input is tokenized keeping whitespace. Dictionary phrases are collected
 * for every language in options.languages plus ALL_LANGUAGES, sorted, and then
 * walked in order:
 *
 *   - tokens lying between phrases are copied through unchanged (whitespace
 *     tokens become a single " "), one tree token each;
 *   - a phrase whose components overlap options.address_components is looked
 *     up under "<language> NAMESPACE_SEPARATOR_CHAR <phrase text>" and each
 *     expansion contributes an alternative: its canonical string when it has
 *     one (followed by " " when the next token is numeric), otherwise the
 *     phrase's own tokens;
 *   - any other phrase is copied through token by token.
 *
 * When no phrase matches at all the whole input becomes a single tree token.
 *
 * Returns the new tree, or NULL if tokenization or tree allocation fails.
 * The caller is given the tree; nothing in this function frees it on success.
 */
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    // Language-specific dictionaries first.
    for (int i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);

        if (lang_phrases == NULL) {
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        // Allocate lazily, on the first language that yields phrases.
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }

    // Then the language-independent dictionary.
    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }

        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);
    if (tree == NULL) {
        // Nothing can be added to a missing tree; release what we hold.
        if (phrases != NULL) {
            phrase_language_array_destroy(phrases);
        }
        token_array_destroy(tokens);
        return NULL;
    }

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        // [start, end) is the run of plain tokens preceding the current phrase.
        int start = 0;
        int end = 0;

        char_array *key = char_array_new_size(DEFAULT_KEY_LEN);

        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];

            char_array_clear(key);

            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

            // Copy through the tokens between the previous phrase and this one.
            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j];
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                // Rebuild the dictionary key: namespace + phrase text.
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                // NOTE(review): when no expansions are found, nothing is added
                // to the tree for this phrase's tokens - confirm that is intended.
                if (expansions != NULL) {
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    // Followed by a number: emit "<canonical> " as
                                    // one string. The returned start index is not needed.
                                    (void)cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);
                            }
                        } else {
                            // No canonical form: fall back to the phrase's own tokens.
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }
                        }
                    }

                    string_tree_finalize_token(tree);
                }
            } else {
                // Phrase does not apply to the requested components: copy through.
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);
                }
            }

            start = phrase.start + phrase.len;
        }

        char_array_destroy(key);

        // Copy through whatever follows the last phrase.
        end = (int)tokens->n;

        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j];
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);
        }
    } else {
        // No dictionary phrases at all: the input is its only alternative.
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}
int main(int argc, char **argv) { char *filename; if (argc == 2) { filename = argv[1]; } else { filename = DEFAULT_NUMEX_PATH; } FILE *f = fopen(filename, "wb"); if (f == NULL) { log_error("File could not be opened, ensure directory exists: %s\n", filename); numex_module_teardown(); exit(1); } if (!numex_module_init()) { log_error("Numex table initialization unsuccessful\n"); numex_module_teardown(); exit(1); } numex_table_t *numex_table = get_numex_table(); size_t num_languages = sizeof(numex_languages) / sizeof(numex_language_source_t); size_t num_source_keys = sizeof(numex_keys) / sizeof(char *); size_t num_source_rules = sizeof(numex_rules) / sizeof(numex_rule_t); if (num_source_keys != num_source_rules) { log_error("num_sourcE_keys != num_source_rules, aborting\n"); numex_module_teardown(); exit(1); } size_t num_ordinal_indicator_rules = sizeof(ordinal_indicator_rules) / sizeof(ordinal_indicator_t); char_array *key = char_array_new(); for (int i = 0; i < num_languages; i++) { numex_language_source_t lang_source = numex_languages[i]; char *lang = lang_source.name; int j; size_t rule_index = lang_source.rule_index; size_t num_rules = lang_source.num_rules; size_t ordinal_indicator_index = lang_source.ordinal_indicator_index; size_t num_ordinal_indicators = lang_source.num_ordinal_indicators; numex_rule_t rule; uint32_t value; log_info("Doing language=%s\n", lang); for (j = rule_index; j < rule_index + num_rules; j++) { char *numex_key = numex_keys[j]; numex_rule_t rule = numex_rules[j]; value = rule.rule_type != NUMEX_STOPWORD ? 
numex_table->rules->n : NUMEX_STOPWORD_INDEX; numex_rule_array_push(numex_table->rules, rule); char_array_clear(key); char_array_cat(key, lang); char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); char_array_cat(key, numex_key); char *str_key = char_array_get_string(key); trie_add(numex_table->trie, str_key, value); } for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) { value = numex_table->ordinal_indicators->n; ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j]; if (ordinal_source.key == NULL) { log_error("ordinal source key was NULL at index %d\n", j); exit(EXIT_FAILURE); } char *ordinal_indicator_key = strdup(ordinal_source.key); if (ordinal_indicator_key == NULL) { log_error("Error in strdup\n"); exit(EXIT_FAILURE); } char *suffix = NULL; if (ordinal_source.suffix != NULL) { suffix = strdup(ordinal_source.suffix); if (suffix == NULL) { log_error("Error in strdup\n"); exit(EXIT_FAILURE); } } ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix); ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal); char_array_clear(key); char_array_cat(key, lang); char_array_cat(key, ORDINAL_NAMESPACE_PREFIX); switch (ordinal_source.gender) { case GENDER_MASCULINE: char_array_cat(key, GENDER_MASCULINE_PREFIX); break; case GENDER_FEMININE: char_array_cat(key, GENDER_FEMININE_PREFIX); break; case GENDER_NEUTER: char_array_cat(key, GENDER_NEUTER_PREFIX); break; case GENDER_NONE: default: char_array_cat(key, GENDER_NONE_PREFIX); } switch (ordinal_source.category) { case CATEGORY_PLURAL: char_array_cat(key, CATEGORY_PLURAL_PREFIX); break; case CATEGORY_DEFAULT: default: char_array_cat(key, CATEGORY_DEFAULT_PREFIX); } char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); char *reversed = utf8_reversed_string(ordinal_source.key); char_array_cat(key, reversed); free(reversed); char *str_key = char_array_get_string(key); if 
(trie_get(numex_table->trie, str_key) == NULL_NODE_ID) { trie_add(numex_table->trie, str_key, value); } else { log_warn("Key exists: %s, skipping\n", str_key); } } char *name = strdup(lang_source.name); if (name == NULL) { log_error("Error in strdup\n"); exit(EXIT_FAILURE); } numex_language_t *language = numex_language_new(name, lang_source.whole_tokens_only, lang_source.rule_index, lang_source.num_rules, lang_source.ordinal_indicator_index, lang_source.num_ordinal_indicators); numex_table_add_language(language); } char_array_destroy(key); if (!numex_table_write(f)) { log_error("Error writing numex table\n"); exit(1); } fclose(f); numex_module_teardown(); log_info("Done\n"); }