/**
 * Read a class definition file and add every class it defines to a
 * language model.
 *
 * @param model      Language model that receives the classes.
 * @param file_name  Path of the class definition file to read.
 * @return 0 if the file was read and every class was added; -1 if
 *         read_classdef_file() or any ngram_model_add_class() call failed.
 */
int32
ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
{
    hash_table_t *classes;
    glist_t hl;
    gnode_t *gn;
    int32 read_rv;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    read_rv = read_classdef_file(classes, file_name);

    /* Snapshot the table contents before checking the result: a failed
     * read may already have left entries in the table, and those have to
     * be released by the common cleanup below just like on success. */
    hl = hash_table_tolist(classes, NULL);
    if (read_rv < 0)
        goto error_out;

    /* Create a new class in the language model for each classdef. */
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    /* The table's keys and classdef values are freed here, entry by
     * entry, before the table itself is freed. */
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);

        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
/**
 * Build a language model set from an LM control file.
 *
 * The control file is consumed as whitespace-separated tokens (each at
 * most 1023 characters): an optional "{ classdef-file ... }" group,
 * followed by any number of "lm-file lm-name" pairs, each of which may be
 * followed by a "{ class-name ... }" group naming previously defined
 * classes to add to that LM.  Relative paths are resolved against the
 * directory part of lmctlfile, when it has one.
 *
 * @param config     Configuration handed to ngram_model_read() and
 *                   ngram_model_set_init().
 * @param lmctlfile  Path of the LM control file.
 * @param lmath      Log-math object handed to ngram_model_read().
 * @return The value returned by ngram_model_set_init(), or NULL if the
 *         control file cannot be opened, a classdef file or LM fails to
 *         load, an LM name is missing, a "{" group is not closed, an
 *         unknown class is named, or adding a class fails.
 */
ngram_model_t *
ngram_model_set_read(cmd_ln_t * config,
                     const char *lmctlfile, logmath_t * lmath)
{
    FILE *ctlfp;
    glist_t lms = NULL;
    glist_t lmnames = NULL;
    __BIGSTACKVARIABLE__ char str[1024];
    ngram_model_t *set = NULL;
    hash_table_t *classes;
    char *basedir = NULL;
    char *c;
    gnode_t *gn;
    glist_t hlist;

    /* Open the control file before allocating anything, so the early
     * return has nothing to release. */
    if ((ctlfp = fopen(lmctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", lmctlfile);
        return NULL;
    }

    /* Class definition files are read first to accumulate a mapping of
     * classnames to definitions. */
    classes = hash_table_new(0, FALSE);

    /* Try to find the base directory to append to relative paths in
     * the lmctl file. */
    if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) {
        /* Include the trailing slash; calloc supplies the terminator. */
        basedir = ckd_calloc(c - lmctlfile + 2, 1);
        memcpy(basedir, lmctlfile, c - lmctlfile + 1);
    }

    E_INFO("Reading LM control file '%s'\n", lmctlfile);
    if (basedir)
        E_INFO("Will prepend '%s' to unqualified paths\n", basedir);

    if (fscanf(ctlfp, "%1023s", str) == 1) {
        if (strcmp(str, "{") == 0) {
            /* Load LMclass files */
            while ((fscanf(ctlfp, "%1023s", str) == 1)
                   && (strcmp(str, "}") != 0)) {
                char *deffile;
                int32 read_rv;

                if (basedir && !path_is_absolute(str))
                    deffile = string_join(basedir, str, NULL);
                else
                    deffile = ckd_salloc(str);
                E_INFO("Reading classdef from '%s'\n", deffile);
                read_rv = read_classdef_file(classes, deffile);
                ckd_free(deffile);
                if (read_rv < 0)
                    goto cleanup;
            }

            /* The loop also ends when fscanf fails; only "}" is valid. */
            if (strcmp(str, "}") != 0) {
                E_ERROR("Unexpected EOF in %s\n", lmctlfile);
                goto cleanup;
            }

            /* This might be the first LM name. */
            if (fscanf(ctlfp, "%1023s", str) != 1)
                str[0] = '\0';
        }
    }
    else
        str[0] = '\0';

    /* Read in one LM at a time and add classes to them as necessary.
     * On entry to each iteration str holds the next LM file name; an
     * empty string means the control file is exhausted. */
    while (str[0] != '\0') {
        char *lmfile;
        ngram_model_t *lm;

        /* Keep the leading-separator test and additionally honour
         * path_is_absolute(), which is what the classdef paths above
         * use, so both kinds of path are resolved consistently. */
        if (basedir && str[0] != '/' && str[0] != '\\'
            && !path_is_absolute(str))
            lmfile = string_join(basedir, str, NULL);
        else
            lmfile = ckd_salloc(str);
        E_INFO("Reading lm from '%s'\n", lmfile);
        lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath);
        if (lm == NULL) {
            ckd_free(lmfile);
            goto cleanup;
        }

        /* Put the model on the list immediately so that every later
         * failure releases it through the common cleanup. */
        lms = glist_add_ptr(lms, lm);

        if (fscanf(ctlfp, "%1023s", str) != 1) {
            E_ERROR("LMname missing after LMFileName '%s'\n", lmfile);
            ckd_free(lmfile);
            goto cleanup;
        }
        ckd_free(lmfile);
        lmnames = glist_add_ptr(lmnames, ckd_salloc(str));

        if (fscanf(ctlfp, "%1023s", str) == 1) {
            if (strcmp(str, "{") == 0) {
                /* LM uses classes; read their names */
                while ((fscanf(ctlfp, "%1023s", str) == 1)
                       && (strcmp(str, "}") != 0)) {
                    void *val;
                    classdef_t *classdef;

                    if (hash_table_lookup(classes, str, &val) == -1) {
                        E_ERROR("Unknown class %s in control file\n", str);
                        goto cleanup;
                    }
                    classdef = val;
                    if (ngram_model_add_class(lm, str, 1.0,
                                              classdef->words,
                                              classdef->weights,
                                              classdef->n_words) < 0) {
                        goto cleanup;
                    }
                    E_INFO("Added class %s containing %d words\n",
                           str, classdef->n_words);
                }
                if (strcmp(str, "}") != 0) {
                    E_ERROR("Unexpected EOF in %s\n", lmctlfile);
                    goto cleanup;
                }
                if (fscanf(ctlfp, "%1023s", str) != 1)
                    str[0] = '\0';
            }
            /* Otherwise str already holds the next LM file name. */
        }
        else
            str[0] = '\0';
    }

    /* Now construct arrays out of lms and lmnames, and build an
     * ngram_model_set.  The lists were built by prepending, so reverse
     * them to restore control-file order. */
    lms = glist_reverse(lms);
    lmnames = glist_reverse(lmnames);
    {
        int32 n_models;
        ngram_model_t **lm_array;
        char **name_array;
        gnode_t *lm_node, *name_node;
        int32 i;

        n_models = glist_count(lms);
        lm_array = ckd_calloc(n_models, sizeof(*lm_array));
        name_array = ckd_calloc(n_models, sizeof(*name_array));
        lm_node = lms;
        name_node = lmnames;
        for (i = 0; i < n_models; ++i) {
            lm_array[i] = gnode_ptr(lm_node);
            name_array[i] = gnode_ptr(name_node);
            lm_node = gnode_next(lm_node);
            name_node = gnode_next(name_node);
        }
        set = ngram_model_set_init(config, lm_array, name_array,
                                   NULL, n_models);
        ckd_free(lm_array);
        ckd_free(name_array);
    }

cleanup:
    /* Reached on success and on every failure after the file is open. */
    fclose(ctlfp);

    /* Release this function's reference to each model exactly once,
     * whether or not a set was built.
     * NOTE(review): this assumes ngram_model_set_init() keeps its own
     * reference to every model it accepts -- confirm. */
    for (gn = lms; gn; gn = gnode_next(gn)) {
        ngram_model_free(gnode_ptr(gn));
    }
    glist_free(lms);

    for (gn = lmnames; gn; gn = gnode_next(gn)) {
        ckd_free(gnode_ptr(gn));
    }
    glist_free(lmnames);

    /* Free the class table's keys and definitions, then the table. */
    hlist = hash_table_tolist(classes, NULL);
    for (gn = hlist; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);

        ckd_free((char *) he->key);
        classdef_free(he->val);
    }
    glist_free(hlist);
    hash_table_free(classes);
    ckd_free(basedir);

    return set;
}
/**
 * Exercise class-based language model operations on an already loaded
 * model.
 *
 * The routine loads class definitions from LMDIR "/100.probdef", checks
 * word IDs and unigram/bigram/trigram scores for class and in-class
 * words, then adds words to an existing class and creates a new class.
 * The steps mutate the model and the expected word IDs depend on the
 * order in which they run.
 *
 * @param lmath  Log-math object used to convert the expected
 *               probabilities into the model's log domain.
 * @param model  Model under test; asserted to be non-NULL.
 */
void
run_tests(logmath_t *lmath, ngram_model_t *model)
{
    int32 rv, i;

    TEST_ASSERT(model);

    TEST_EQUAL(ngram_wid(model, "scylla"), 285);
    TEST_EQUAL(strcmp(ngram_word(model, 285), "scylla"), 0);

    rv = ngram_model_read_classdef(model, LMDIR "/100.probdef");
    TEST_EQUAL(rv, 0);

    /* Verify that class word IDs remain the same. */
    TEST_EQUAL(ngram_wid(model, "scylla"), 285);
    TEST_EQUAL(strcmp(ngram_word(model, 285), "scylla"), 0);

    /* Verify in-class word IDs. */
    TEST_EQUAL(ngram_wid(model, "scylla:scylla"), 0x80000000 | 400);

    /* Verify in-class and out-class unigram scores. */
    TEST_EQUAL_LOG(ngram_score(model, "scylla:scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.4));
    TEST_EQUAL_LOG(ngram_score(model, "scooby:scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.1));
    TEST_EQUAL_LOG(ngram_score(model, "scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884));
    TEST_EQUAL_LOG(ngram_score(model, "oh:zero", NULL),
                   logmath_log10_to_log(lmath, -1.9038) + logmath_log(lmath, 0.7));
    TEST_EQUAL_LOG(ngram_score(model, "zero", NULL),
                   logmath_log10_to_log(lmath, -1.9038));

    /* Verify class bigram scores. */
    TEST_EQUAL_LOG(ngram_score(model, "scylla", "on", NULL),
                   logmath_log10_to_log(lmath, -1.2642));
    TEST_EQUAL_LOG(ngram_score(model, "scylla:scylla", "on", NULL),
                   logmath_log10_to_log(lmath, -1.2642) + logmath_log(lmath, 0.4));
    TEST_EQUAL_LOG(ngram_score(model, "apparently", "scylla", NULL),
                   logmath_log10_to_log(lmath, -0.5172));
    TEST_EQUAL_LOG(ngram_score(model, "apparently", "karybdis:scylla", NULL),
                   logmath_log10_to_log(lmath, -0.5172));
    TEST_EQUAL_LOG(ngram_score(model, "apparently", "scooby:scylla", NULL),
                   logmath_log10_to_log(lmath, -0.5172));

    /* Verify class trigram scores. */
    TEST_EQUAL_LOG(ngram_score(model, "zero", "be", "will", NULL),
                   logmath_log10_to_log(lmath, -0.5725));
    TEST_EQUAL_LOG(ngram_score(model, "oh:zero", "be", "will", NULL),
                   logmath_log10_to_log(lmath, -0.5725) + logmath_log(lmath, 0.7));
    TEST_EQUAL_LOG(ngram_score(model, "should", "variance", "zero", NULL),
                   logmath_log10_to_log(lmath, -0.9404));
    TEST_EQUAL_LOG(ngram_score(model, "should", "variance", "zero:zero", NULL),
                   logmath_log10_to_log(lmath, -0.9404));

    /* Add words to classes. */
    rv = ngram_model_add_class_word(model, "scylla", "scrappy:scylla", 1.0);
    TEST_ASSERT(rv >= 0);
    TEST_EQUAL(ngram_wid(model, "scrappy:scylla"), 0x80000196);
    TEST_EQUAL_LOG(ngram_score(model, "scrappy:scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.2));
    /* Casts make the arguments match the %x / %d conversions exactly. */
    printf("scrappy:scylla %08x %d %f\n",
           (unsigned)ngram_wid(model, "scrappy:scylla"),
           (int)ngram_score(model, "scrappy:scylla", NULL),
           logmath_exp(lmath, ngram_score(model, "scrappy:scylla", NULL)));

    /* Add a lot of words to a class. */
    for (i = 0; i < 129; ++i) {
        char word[32];

        /* Bounded formatting: never writes past the end of word. */
        snprintf(word, sizeof(word), "%d:scylla", i);
        rv = ngram_model_add_class_word(model, "scylla", word, 1.0);
        printf("%s %08x %d %f\n", word,
               (unsigned)ngram_wid(model, word),
               (int)ngram_score(model, word, NULL),
               logmath_exp(lmath, ngram_score(model, word, NULL)));
        TEST_ASSERT(rv >= 0);
        /* New in-class IDs are expected to follow "scrappy:scylla". */
        TEST_EQUAL(ngram_wid(model, word), 0x80000197 + i);
    }

    /* Add a new class. */
    {
        const char *words[] = { "blatz:foobie", "hurf:foobie" };
        float32 weights[] = { 0.6, 0.4 };
        int32 foobie_prob;

        rv = ngram_model_add_class(model, "[foobie]", 1.0,
                                   words, weights, 2);
        TEST_ASSERT(rv >= 0);
        /* Each member should score as the class score plus its weight. */
        foobie_prob = ngram_score(model, "[foobie]", NULL);
        TEST_EQUAL_LOG(ngram_score(model, "blatz:foobie", NULL),
                       foobie_prob + logmath_log(lmath, 0.6));
        TEST_EQUAL_LOG(ngram_score(model, "hurf:foobie", NULL),
                       foobie_prob + logmath_log(lmath, 0.4));
    }
}