double test_accuracy(char *filename) { language_classifier_data_set_t *data_set = language_classifier_data_set_init(filename); if (data_set == NULL) { log_error("Error creating data set\n"); exit(EXIT_FAILURE); } language_classifier_minibatch_t *minibatch; uint32_t correct = 0; uint32_t total = 0; language_classifier_t *classifier = get_language_classifier(); trie_t *label_ids = trie_new_from_cstring_array(classifier->labels); while (language_classifier_data_set_next(data_set)) { char *address = char_array_get_string(data_set->address); char *language = char_array_get_string(data_set->language); uint32_t label_id; if (!trie_get_data(label_ids, language, &label_id)) { continue; } language_classifier_response_t *response = classify_languages(address); if (response == NULL || response->num_languages == 0) { printf("%s\tNULL\t%s\n", language, address); continue; } char *top_lang = response->languages[0]; if (string_equals(top_lang, language)) { correct++; } else { printf("%s\t%s\t%s\n", language, top_lang, address); } total++; language_classifier_response_destroy(response); } log_info("total=%zu\n", total); trie_destroy(label_ids); return (double) correct / total; }
/*
 * Looks up `feature` in the CRF's state-feature trie.
 * Forwards to trie_get_data() and returns its result; the looked-up id is
 * delivered through `feature_id`.
 */
static inline bool crf_get_feature_id(crf_t *self, char *feature, uint32_t *feature_id) {
    bool found = trie_get_data(self->state_features, feature, feature_id);
    return found;
}
/*
 * Looks up `feature` in the averaged perceptron's feature trie.
 * Forwards to trie_get_data() and returns its result; the looked-up id is
 * delivered through `feature_id`.
 */
static inline bool averaged_perceptron_get_feature_id(averaged_perceptron_t *self, char *feature, uint32_t *feature_id) {
    bool found = trie_get_data(self->features, feature, feature_id);
    return found;
}
/*
 * Registers `expansion` for the phrase `name` in the global address
 * dictionary.
 *
 * The trie key is built as:
 *   [language NAMESPACE_SEPARATOR_CHAR] + body
 * where body is
 *   - TRIE_PREFIX_CHAR + name            if any dictionary is a separable
 *                                        concatenated prefix or an elision,
 *   - TRIE_SUFFIX_CHAR + reversed(name)  otherwise, if any dictionary is a
 *                                        concatenated suffix,
 *   - name                               otherwise.
 *
 * If the key already exists, the expansion is appended to the existing value
 * and its address components are OR-ed in; otherwise a new value is created,
 * appended to the values array, and its index is stored in the trie.
 *
 * Returns true on success, false if the dictionary is not set up, `name` is
 * NULL, an allocation fails, the stored index is invalid, or the key could
 * not be added to the trie.
 */
bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) {
    if (address_dict == NULL || address_dict->values == NULL) {
        log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
        return false;
    }

    if (name == NULL) return false;

    bool is_prefix = false;
    bool is_suffix = false;

    for (size_t i = 0; i < expansion.num_dictionaries; i++) {
        dictionary_type_t dict = expansion.dictionary_ids[i];
        if (dict == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
            dict == DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE) {
            is_suffix = true;
        } else if (dict == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE ||
                   dict == DICTIONARY_ELISION) {
            is_prefix = true;
        }
    }

    char_array *array = char_array_new_size(strlen(name));
    if (array == NULL) {
        return false;
    }

    if (language != NULL) {
        char_array_cat(array, language);
        char_array_cat(array, NAMESPACE_SEPARATOR_CHAR);
    }

    /* Prefix takes precedence when both prefix and suffix flags are set. */
    if (is_prefix) {
        char_array_cat(array, TRIE_PREFIX_CHAR);
        char_array_cat(array, name);
    } else if (is_suffix) {
        /* Suffixes are stored reversed. */
        char_array_cat(array, TRIE_SUFFIX_CHAR);
        char_array_cat_reversed(array, name);
    } else {
        char_array_cat(array, name);
    }

    /* From here on `key` is owned by this function and freed before return. */
    char *key = char_array_to_string(array);
    if (key == NULL) {
        return false;
    }

    log_debug("key=%s\n", key);

    bool added = false;
    uint32_t expansion_index;
    address_expansion_value_t *value;

    if (trie_get_data(address_dict->trie, key, &expansion_index)) {
        /* Guard against a stale/corrupt index or a NULL slot before use. */
        if (expansion_index >= address_dict->values->n) {
            log_warn("Key %s has an out-of-range expansion index\n", key);
            goto exit_key_created;
        }
        value = address_dict->values->a[expansion_index];
        if (value == NULL) {
            log_warn("Key %s has no expansion value\n", key);
            goto exit_key_created;
        }
        value->components |= expansion.address_components;
        address_expansion_array_push(value->expansions, expansion);
    } else {
        value = address_expansion_value_new_with_expansion(expansion);
        if (value == NULL) {
            /* Never store a NULL value: later lookups would dereference it. */
            log_error("Error creating expansion value for key %s\n", key);
            goto exit_key_created;
        }
        expansion_index = (uint32_t)address_dict->values->n;
        address_expansion_value_array_push(address_dict->values, value);

        if (!trie_add(address_dict->trie, key, expansion_index)) {
            /* NOTE(review): the value pushed above stays in the values array
             * without a trie entry pointing at it. */
            log_warn("Key %s could not be added to trie\n", key);
            goto exit_key_created;
        }
    }

    added = true;

exit_key_created:
    free(key);
    return added;
}