/**
 * Build the merged vocabulary and per-submodel word-ID mapping for a
 * model set.
 *
 * Collects the union of all submodel word strings into a hash table
 * (first occurrence wins on collision), guarantees "<UNK>" is present,
 * initializes the base model with the resulting unigram count, fills
 * base->word_str with pointers borrowed from the submodels (hence
 * base->writable = FALSE), sorts them, and finally builds both the
 * master string->wid hash and set->widmap[wid][model] -> submodel wid.
 *
 * @param base  Model set viewed as a base n-gram model (modified in place).
 * @param lmath Log-math object passed through to ngram_model_init().
 * @param n     Maximum n-gram order for the merged model.
 */
static void build_widmap(ngram_model_t * base, logmath_t * lmath, int32 n)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    ngram_model_t **models = set->lms;
    hash_table_t *vocab;
    glist_t hlist;
    gnode_t *gn;
    int32 i;

    /* Construct a merged vocabulary and a set of word-ID mappings. */
    vocab = hash_table_new(models[0]->n_words, FALSE);
    /* Create the set of merged words. */
    for (i = 0; i < set->n_models; ++i) {
        int32 j;
        for (j = 0; j < models[i]->n_words; ++j) {
            /* Ignore collisions: the first model to define a word wins. */
            (void) hash_table_enter_int32(vocab, models[i]->word_str[j],
                                          j);
        }
    }
    /* Ensure "<UNK>" exists; hash_table_lookup() returns 0 on success,
     * so a non-zero return means it was absent and must be added. */
    if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
        (void) hash_table_enter_int32(vocab, "<UNK>", 0);
    /* Now we know the number of unigrams, initialize the base model. */
    ngram_model_init(base, &ngram_model_set_funcs, lmath, n,
                     hash_table_inuse(vocab));

    /* The word strings are aliases of the submodels' strings (or hash
     * keys), so the base model must not try to free them. */
    base->writable = FALSE;
    i = 0;
    hlist = hash_table_tolist(vocab, NULL);
    for (gn = hlist; gn; gn = gnode_next(gn)) {
        hash_entry_t *ent = gnode_ptr(gn);
        /* Borrow the hash key pointer directly; no copy is made. */
        base->word_str[i++] = (char *) ent->key;
    }
    glist_free(hlist);
    /* Sort the merged word list (my_compare defined elsewhere in file). */
    qsort(base->word_str, base->n_words, sizeof(*base->word_str),
          my_compare);

    /* Now create the word ID mappings, replacing any previous ones. */
    if (set->widmap)
        ckd_free_2d((void **) set->widmap);
    set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
                                           sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i) {
        int32 j;
        /* Also create the master wid mapping (string -> merged wid). */
        (void) hash_table_enter_int32(base->wid, base->word_str[i], i);
        /* Map the merged wid to each submodel's own wid (which may be
         * the submodel's unknown-word id if absent there). */
        for (j = 0; j < set->n_models; ++j) {
            set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
        }
    }
    /* Only the table itself is freed; its keys live on in word_str. */
    hash_table_free(vocab);
}
/**
 * Replace the model set's vocabulary with an explicit word list.
 *
 * Discards the current word strings and widmap, then rebuilds them so
 * that merged word ID i corresponds to words[i].  The new strings are
 * private copies, so the base model becomes writable.
 *
 * @param base    Model set viewed as a base n-gram model.
 * @param words   Array of word strings to map (copied).
 * @param n_words Number of entries in words.
 */
void
ngram_model_set_map_words(ngram_model_t * base, const char **words,
                          int32 n_words)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 w;

    /* Tear down the existing word list and ID mapping. */
    if (base->writable) {
        for (w = 0; w < base->n_words; ++w)
            ckd_free(base->word_str[w]);
    }
    ckd_free(base->word_str);
    ckd_free_2d((void **) set->widmap);

    /* From now on the strings are our own copies. */
    base->writable = TRUE;
    base->n_words = base->n_1g_alloc = n_words;
    base->word_str = ckd_calloc(n_words, sizeof(*base->word_str));
    set->widmap = (int32 **) ckd_calloc_2d(n_words, set->n_models,
                                           sizeof(**set->widmap));
    hash_table_empty(base->wid);

    /* Rebuild word strings, the string->wid hash, and the per-submodel
     * wid mapping in a single pass. */
    for (w = 0; w < n_words; ++w) {
        int32 m;

        base->word_str[w] = ckd_salloc(words[w]);
        (void) hash_table_enter_int32(base->wid, base->word_str[w], w);
        for (m = 0; m < set->n_models; ++m)
            set->widmap[w][m] = ngram_wid(set->lms[m], base->word_str[w]);
    }
}
/**
 * Case-fold all word strings in a model, in place.
 *
 * Words beginning with '<' (tags) or '[' (classes) are left untouched.
 * Because folding changes the strings, the word->ID hash table is
 * rebuilt from scratch and swapped in.
 *
 * @param model The model whose vocabulary is folded.
 * @param kase  NGRAM_UPPER, NGRAM_LOWER, or anything else for no-op.
 * @return 0 (always succeeds).
 */
int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    hash_table_t *new_wid;
    int was_writable, i;

    was_writable = model->writable;
    /* Either way, we end up owning freshly written strings. */
    model->writable = TRUE;
    /* The folded strings need a brand-new word->ID mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        /* Fold in place when we own the string, else on a copy. */
        char *str = was_writable
            ? model->word_str[i]
            : ckd_salloc(model->word_str[i]);

        /* Don't case-fold <tags> or [classes]. */
        if (str[0] != '<' && str[0] != '[') {
            if (kase == NGRAM_UPPER)
                ucase(str);
            else if (kase == NGRAM_LOWER)
                lcase(str);
            /* any other kase: leave the string as-is */
        }
        model->word_str[i] = str;

        /* Folding can merge words that differed only by case; warn when
         * the new table already holds this string. */
        if (hash_table_enter_int32(new_wid, str, i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   str);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
/**
 * Add a word to the word string and ID mapping.
 *
 * Duplicate words are rejected with a warning (the existing ID is
 * returned).  The word string array grows in UG_ALLOC_STEP increments
 * as needed, and the stored string is always a fresh copy.
 *
 * @param model   Model to extend.
 * @param word    Word string (copied).
 * @param classid Class ID to encode into the returned wid, or negative
 *                for a plain word.
 * @return The (possibly class-encoded) word ID.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{
    int32 wid;
    char *entry;

    /* Reject words that are already present. */
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return wid;
    }

    /* Next free slot; encode the class ID into the wid if requested. */
    wid = model->n_words;
    if (classid >= 0)
        wid = NGRAM_CLASSWID(wid, classid);

    /* Grow the word string array when the slot would overflow it. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str =
            ckd_realloc(model->word_str,
                        sizeof(*model->word_str) * model->n_1g_alloc);
    }

    /* Store a private copy of the string and index it by wid. */
    entry = ckd_salloc(word);
    model->word_str[model->n_words] = entry;
    if (hash_table_enter_int32(model->wid, entry, wid) != wid) {
        /* We just checked for duplicates, so this cannot happen. */
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             entry, (void *) (long) (wid));
    }

    ++model->n_words;
    return wid;
}
/**
 * Read a finite-state grammar from a text stream.
 *
 * Parses, in order: the FSG_BEGIN header (with optional grammar name),
 * the number of states, the start and final state declarations, then a
 * sequence of transition lines terminated by FSG_END.  Transition words
 * are interned in a temporary hash table and handed over to the new
 * model's vocabulary; null (word-less) transitions are collected and
 * transitively closed at the end.
 *
 * @param fp    Input stream positioned at (or before) FSG_BEGIN.
 * @param lmath Log-math object used to convert probabilities.
 * @param lw    Language weight applied to transition log-probs.
 * @return Newly allocated fsg_model_t, or NULL on any parse error
 *         (in which case all intermediate allocations are released).
 */
fsg_model_t *
fsg_model_read(FILE * fp, logmath_t * lmath, float32 lw)
{
    fsg_model_t *fsg;
    hash_table_t *vocab;
    hash_iter_t *itor;
    int32 lastwid;
    char **wordptr;
    char *lineptr;
    char *fsgname;
    int32 lineno;
    int32 n, i, j;
    int n_state, n_trans, n_null_trans;
    glist_t nulls;
    float32 p;

    lineno = 0;
    vocab = hash_table_new(32, FALSE);
    wordptr = NULL;
    lineptr = NULL;
    nulls = NULL;
    fsgname = NULL;
    fsg = NULL;

    /* Scan upto FSG_BEGIN header */
    for (;;) {
        n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
        if (n < 0) {
            /* EOF before the header was found. */
            E_ERROR("%s declaration missing\n", FSG_MODEL_BEGIN_DECL);
            goto parse_error;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_BEGIN_DECL) == 0)) {
            /* Header may carry at most one extra token (the name). */
            if (n > 2) {
                E_ERROR("Line[%d]: malformed FSG_BEGIN declaration\n",
                        lineno);
                goto parse_error;
            }
            break;
        }
    }

    /* Save FSG name, or it will get clobbered below :(.
     * If name is missing, try the default. */
    if (n == 2) {
        fsgname = ckd_salloc(wordptr[1]);
    }
    else {
        E_WARN("FSG name is missing\n");
        fsgname = ckd_salloc("unknown");
    }

    /* Read #states (either short or long declaration keyword). */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_N_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_NUM_STATES_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &n_state) != 1)
        || (n_state <= 0)) {
        E_ERROR
            ("Line[%d]: #states declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Now create the FSG. */
    fsg = fsg_model_init(fsgname, lmath, lw, n_state);
    /* fsg_model_init copied the name; our copy is no longer needed. */
    ckd_free(fsgname);
    fsgname = NULL;

    /* Read start state */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_S_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_START_STATE_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &(fsg->start_state)) != 1)
        || (fsg->start_state < 0)
        || (fsg->start_state >= fsg->n_state)) {
        E_ERROR
            ("Line[%d]: start state declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Read final state */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_F_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_FINAL_STATE_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &(fsg->final_state)) != 1)
        || (fsg->final_state < 0)
        || (fsg->final_state >= fsg->n_state)) {
        E_ERROR
            ("Line[%d]: final state declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Read transitions until FSG_END. */
    lastwid = 0;
    n_trans = n_null_trans = 0;
    for (;;) {
        int32 wid, tprob;

        n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
        if (n <= 0) {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    lineno);
            goto parse_error;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_END_DECL) == 0)) {
            break;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_T_DECL) == 0)
            || (strcmp(wordptr[0], FSG_MODEL_TRANSITION_DECL) == 0)) {
            /* 4 tokens = null transition, 5 tokens = word transition. */
            if (((n != 4) && (n != 5))
                || (sscanf(wordptr[1], "%d", &i) != 1)
                || (sscanf(wordptr[2], "%d", &j) != 1)
                || (i < 0) || (i >= fsg->n_state)
                || (j < 0) || (j >= fsg->n_state)) {
                E_ERROR
                    ("Line[%d]: transition spec malformed; Expecting: from-state to-state trans-prob [word]\n",
                     lineno);
                goto parse_error;
            }

            p = atof_c(wordptr[3]);
            if ((p <= 0.0) || (p > 1.0)) {
                E_ERROR
                    ("Line[%d]: transition spec malformed; Expecting float as transition probability\n",
                     lineno);
                goto parse_error;
            }
        }
        else {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    lineno);
            goto parse_error;
        }

        /* Convert probability to a language-weighted log value. */
        tprob = (int32) (logmath_log(lmath, p) * fsg->lw);

        /* Add word to "dictionary": intern the word string, assigning
         * sequential IDs on first sight. */
        if (n > 4) {
            if (hash_table_lookup_int32(vocab, wordptr[4], &wid) < 0) {
                (void) hash_table_enter_int32(vocab,
                                              ckd_salloc(wordptr[4]),
                                              lastwid);
                wid = lastwid;
                ++lastwid;
            }
            fsg_model_trans_add(fsg, i, j, tprob, wid);
            ++n_trans;
        }
        else {
            /* Null transition: remember it for transitive closure. */
            if (fsg_model_null_trans_add(fsg, i, j, tprob) == 1) {
                ++n_null_trans;
                nulls =
                    glist_add_ptr(nulls,
                                  fsg_model_null_trans(fsg, i, j));
            }
        }
    }

    E_INFO("FSG: %d states, %d unique words, %d transitions (%d null)\n",
           fsg->n_state, hash_table_inuse(vocab), n_trans, n_null_trans);

    /* Now create a string table from the "dictionary"; the hash keys
     * are transferred (not copied) into fsg->vocab. */
    fsg->n_word = hash_table_inuse(vocab);
    fsg->n_word_alloc = fsg->n_word + 10;       /* Pad it a bit. */
    fsg->vocab = ckd_calloc(fsg->n_word_alloc, sizeof(*fsg->vocab));
    for (itor = hash_table_iter(vocab); itor;
         itor = hash_table_iter_next(itor)) {
        char const *word = hash_entry_key(itor->ent);
        int32 wid = (int32) (long) hash_entry_val(itor->ent);
        fsg->vocab[wid] = (char *) word;
    }
    hash_table_free(vocab);

    /* Do transitive closure on null transitions */
    nulls = fsg_model_null_trans_closure(fsg, nulls);
    glist_free(nulls);

    ckd_free(lineptr);
    ckd_free(wordptr);
    return fsg;

  parse_error:
    /* On error the interned words were never handed to fsg->vocab, so
     * their keys must be freed here before the table itself. */
    for (itor = hash_table_iter(vocab); itor;
         itor = hash_table_iter_next(itor))
        ckd_free((char *) hash_entry_key(itor->ent));
    glist_free(nulls);
    hash_table_free(vocab);
    ckd_free(fsgname);
    ckd_free(lineptr);
    ckd_free(wordptr);
    fsg_model_free(fsg);
    return NULL;
}
/**
 * Add a word (with optional pronunciation) to the dictionary.
 *
 * The word is entered into the string->wid hash table and, when its
 * string carries an alternative-pronunciation suffix, linked into the
 * base word's alt chain.
 *
 * @param d    Dictionary to extend (grown by S3DICT_INC_SZ as needed).
 * @param word Word string (copied; freed in dict_free).
 * @param p    CI phone pronunciation, or NULL for none.
 * @param np   Number of phones in p.
 * @return New word ID, or BAD_S3WID if the word is a duplicate or its
 *         base word is missing.
 */
s3wid_t
dict_add_word(dict_t * d, char const *word, s3cipid_t const *p, int32 np)
{
    int32 len;
    dictword_t *wordp;
    s3wid_t newwid;
    char *wword;

    /* Grow the word table if it is full. */
    if (d->n_word >= d->max_words) {
        E_INFO("Reallocating to %d KiB for word entries\n",
               /* cast: the expression is size_t but the format is %d */
               (int) ((d->max_words + S3DICT_INC_SZ)
                      * sizeof(dictword_t) / 1024));
        d->word =
            (dictword_t *) ckd_realloc(d->word,
                                       (d->max_words +
                                        S3DICT_INC_SZ) *
                                       sizeof(dictword_t));
        d->max_words = d->max_words + S3DICT_INC_SZ;
    }

    wordp = d->word + d->n_word;
    wordp->word = (char *) ckd_salloc(word);    /* Freed in dict_free */

    /* Associate word string with d->n_word in hash table */
    if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) !=
        d->n_word) {
        /* Duplicate: the table kept the existing entry, ours was never
         * inserted, so only our copy of the string needs freeing. */
        ckd_free(wordp->word);
        wordp->word = NULL;
        return BAD_S3WID;
    }

    /* Fill in word entry, and set defaults */
    if (p && (np > 0)) {
        wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t));      /* Freed in dict_free */
        memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t));
        wordp->pronlen = np;
    }
    else {
        wordp->ciphone = NULL;
        wordp->pronlen = 0;
    }
    wordp->alt = BAD_S3WID;
    wordp->basewid = d->n_word;

    /* Determine base/alt wids */
    wword = ckd_salloc(word);
    if ((len = dict_word2basestr(wword)) > 0) {
        int32 w;

        /* Truncated to a baseword string; find its ID */
        if (hash_table_lookup_int32(d->ht, wword, &w) < 0) {
            E_ERROR("Missing base word for: %s\n", word);
            /* BUGFIX: the word was already entered into d->ht above;
             * remove that entry before freeing its key, otherwise the
             * hash table is left holding a dangling pointer. */
            hash_table_delete(d->ht, wordp->word);
            /* BUGFIX: also release the pronunciation allocated above. */
            ckd_free(wordp->ciphone);
            wordp->ciphone = NULL;
            wordp->pronlen = 0;
            ckd_free(wword);
            ckd_free(wordp->word);
            wordp->word = NULL;
            return BAD_S3WID;
        }

        /* Link into alt list */
        wordp->basewid = w;
        wordp->alt = d->word[w].alt;
        d->word[w].alt = d->n_word;
    }
    ckd_free(wword);

    newwid = d->n_word++;

    return newwid;
}
/**
 * Recode all word strings in a model from one character encoding to
 * another using iconv(3).
 *
 * A single scratch buffer is grown (doubled) whenever iconv reports
 * E2BIG; each word's conversion is restarted from scratch after a
 * grow.  Because the strings change, the word->ID hash table is
 * rebuilt and swapped in at the end.
 *
 * @param model Model whose vocabulary is recoded in place.
 * @param from  Source encoding name (as accepted by iconv_open).
 * @param to    Destination encoding name.
 * @return 0 on success, -1 on failure (iconv_open or conversion error).
 */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
     * unlike wcstombs(3).  So we have to either call it over and over
     * again until our buffer is big enough, or call it with a huge
     * buffer and then copy things back to the output.  We will use a
     * mix of these two approaches here.  We'll keep a single big
     * buffer around, and expand it as necessary. */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

      start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft))
               == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                /* BUGFIX: don't leak the conversion descriptor. */
                iconv_close(ic);
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger and restart this word. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* BUGFIX: reset outleft to the new buffer size.  The
             * original code left it at the old (nearly exhausted)
             * count, so the retry would hit E2BIG again and keep
             * doubling maxlen forever, and the later
             * `result = maxlen - outleft` would be wrong. */
            outleft = maxlen;
            /* Reset the input pointers. */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft))
            == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                /* BUGFIX: don't leak the conversion descriptor. */
                iconv_close(ic);
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Be very evil: restart this word's conversion entirely
             * (start_conversion re-initializes in/out/outleft). */
            goto start_conversion;
        }

        /* Number of bytes actually produced for this word. */
        result = maxlen - outleft;

        /* Okay, that was hard, now let's go shopping. */
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i],
                                             result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested.,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}