/**
 * Look up the ID stored under a phone name in the model's ciphone table.
 *
 * @param m   Model definition; its ciphone_ht hash table is searched.
 * @param ci  Phone name used as the lookup key.
 * @return The ID found for ci, or -1 when hash_table_lookup_int32()
 *         returns a negative value.
 */
int
mdef_ciphone_id(mdef_t * m, char *ci)
{
    int32 phone_id;
    int found;

    found = (hash_table_lookup_int32(m->ciphone_ht, ci, &phone_id) >= 0);
    return found ? phone_id : -1;
}
/**
 * Map a word string to its ID in the language model.
 *
 * @param model  Model whose wid hash table is consulted.
 * @param word   Word string to look up.
 * @return The stored ID; when the lookup returns -1, the value of
 *         ngram_unknown_wid(model) is returned instead.
 */
int32
ngram_wid(ngram_model_t * model, const char *word)
{
    int32 found_id;

    if (hash_table_lookup_int32(model->wid, word, &found_id) != -1)
        return found_id;

    /* Not in the table: fall back to the unknown-word ID. */
    return ngram_unknown_wid(model);
}
/**
 * Return the ID registered for the "<UNK>" entry of the model.
 *
 * FIXME: This could be memoized for speed if necessary.
 *
 * @param model  Model whose wid hash table is consulted.
 * @return The ID stored under "<UNK>", or NGRAM_INVALID_WID when the
 *         lookup returns -1.
 */
int32
ngram_unknown_wid(ngram_model_t * model)
{
    int32 unk_id;

    if (hash_table_lookup_int32(model->wid, "<UNK>", &unk_id) != -1)
        return unk_id;

    return NGRAM_INVALID_WID;
}
/**
 * Look up the ID of a word in the dictionary.
 *
 * Both arguments are asserted to be non-NULL.
 *
 * @param d     Dictionary; its ht hash table is searched.
 * @param word  Word string used as the lookup key.
 * @return The stored ID, or BAD_S3WID when hash_table_lookup_int32()
 *         returns a negative value.
 */
s3wid_t
dict_wordid(dict_t *d, const char *word)
{
    int32 found;

    assert(d);
    assert(word);

    if (hash_table_lookup_int32(d->ht, word, &found) >= 0)
        return found;

    return (BAD_S3WID);
}
/**
 * Register a word in the model's string table and string-to-ID mapping.
 *
 * If the word is already present in model->wid, a warning is logged and
 * the existing ID is returned without modifying the model.  Otherwise
 * the word is copied, appended to model->word_str (growing the array by
 * UG_ALLOC_STEP entries when it is full), entered into the hash table
 * and model->n_words is incremented.
 *
 * @param model    Model to extend.
 * @param word     Word string; a private copy is stored.
 * @param classid  When >= 0, the new ID is produced by
 *                 NGRAM_CLASSWID(n_words, classid); otherwise the ID is
 *                 simply the current word count.
 * @return The ID now associated with the word.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{
    int32 new_wid;
    char *stored;

    /* Already known?  Report it and hand back the existing ID. */
    if (hash_table_lookup_int32(model->wid, word, &new_wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return new_wid;
    }

    /* The next free slot number is the basis of the new ID. */
    new_wid = model->n_words;
    if (classid >= 0)
        new_wid = NGRAM_CLASSWID(new_wid, classid);

    /* Make room in the string table when it is full. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str =
            ckd_realloc(model->word_str,
                        sizeof(*model->word_str) * model->n_1g_alloc);
    }

    /* The string table always owns a freshly allocated copy. */
    stored = ckd_salloc(word);
    model->word_str[model->n_words] = stored;

    /* The hash table key is the copy held by the string table. */
    if (hash_table_enter_int32(model->wid, stored, new_wid) != new_wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             stored, (void *) (long) (new_wid));
    }

    ++model->n_words;
    return new_wid;
}
/**
 * Parse a finite-state grammar description from an open text stream.
 *
 * The expected layout, as enforced by the checks below, is:
 *   1. a begin declaration with an optional grammar name (a missing
 *      name is replaced by "unknown" after a warning);
 *   2. a state-count declaration with a positive integer;
 *   3. a start-state declaration with a valid state index;
 *   4. a final-state declaration with a valid state index;
 *   5. any number of transition lines
 *      "from-state to-state trans-prob [word]" with the probability in
 *      (0, 1];
 *   6. an end declaration.
 *
 * Word transitions get consecutive IDs in order of first appearance;
 * lines without a word become null transitions, over which a transitive
 * closure is computed at the end.
 *
 * @param fp     Stream to read from.
 * @param lmath  Log-math object used to convert transition probabilities
 *               and passed to fsg_model_init().
 * @param lw     Weight passed to fsg_model_init(); each log probability
 *               is multiplied by the model's lw field.
 * @return The newly built model, or NULL after a parse error (every
 *         intermediate allocation made here is released in that case).
 */
fsg_model_t *
fsg_model_read(FILE * fp, logmath_t * lmath, float32 lw)
{
    fsg_model_t *model = NULL;
    hash_table_t *words = hash_table_new(32, FALSE);
    hash_iter_t *it;
    glist_t null_list = NULL;
    char **tokens = NULL;
    char *line = NULL;
    char *name = NULL;
    int32 line_no = 0;
    int32 next_wid;
    int32 n_tok, from, to;
    int n_state, n_trans, n_null_trans;
    float32 prob;
    int ok;

    /* Skip everything up to the begin declaration. */
    do {
        n_tok = nextline_str2words(fp, &line_no, &line, &tokens);
        if (n_tok < 0) {
            E_ERROR("%s declaration missing\n", FSG_MODEL_BEGIN_DECL);
            goto parse_error;
        }
    } while (strcmp(tokens[0], FSG_MODEL_BEGIN_DECL) != 0);

    if (n_tok > 2) {
        E_ERROR("Line[%d]: malformed FSG_BEGIN declaration\n", line_no);
        goto parse_error;
    }

    /* Copy the name now: the token buffer is reused by the next read. */
    if (n_tok != 2) {
        E_WARN("FSG name is missing\n");
        name = ckd_salloc("unknown");
    }
    else {
        name = ckd_salloc(tokens[1]);
    }

    /* Number of states. */
    n_tok = nextline_str2words(fp, &line_no, &line, &tokens);
    ok = (n_tok == 2)
        && ((strcmp(tokens[0], FSG_MODEL_N_DECL) == 0)
            || (strcmp(tokens[0], FSG_MODEL_NUM_STATES_DECL) == 0))
        && (sscanf(tokens[1], "%d", &n_state) == 1)
        && (n_state > 0);
    if (!ok) {
        E_ERROR
            ("Line[%d]: #states declaration line missing or malformed\n",
             line_no);
        goto parse_error;
    }

    /* The model can be created once the state count is known. */
    model = fsg_model_init(name, lmath, lw, n_state);
    ckd_free(name);
    name = NULL;

    /* Start state. */
    n_tok = nextline_str2words(fp, &line_no, &line, &tokens);
    ok = (n_tok == 2)
        && ((strcmp(tokens[0], FSG_MODEL_S_DECL) == 0)
            || (strcmp(tokens[0], FSG_MODEL_START_STATE_DECL) == 0))
        && (sscanf(tokens[1], "%d", &(model->start_state)) == 1)
        && (model->start_state >= 0)
        && (model->start_state < model->n_state);
    if (!ok) {
        E_ERROR
            ("Line[%d]: start state declaration line missing or malformed\n",
             line_no);
        goto parse_error;
    }

    /* Final state. */
    n_tok = nextline_str2words(fp, &line_no, &line, &tokens);
    ok = (n_tok == 2)
        && ((strcmp(tokens[0], FSG_MODEL_F_DECL) == 0)
            || (strcmp(tokens[0], FSG_MODEL_FINAL_STATE_DECL) == 0))
        && (sscanf(tokens[1], "%d", &(model->final_state)) == 1)
        && (model->final_state >= 0)
        && (model->final_state < model->n_state);
    if (!ok) {
        E_ERROR
            ("Line[%d]: final state declaration line missing or malformed\n",
             line_no);
        goto parse_error;
    }

    /* Transitions, terminated by the end declaration. */
    next_wid = 0;
    n_trans = n_null_trans = 0;
    for (;;) {
        int32 wid, tprob;
        int is_trans;

        n_tok = nextline_str2words(fp, &line_no, &line, &tokens);
        if (n_tok <= 0) {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    line_no);
            goto parse_error;
        }

        if (strcmp(tokens[0], FSG_MODEL_END_DECL) == 0)
            break;

        is_trans = (strcmp(tokens[0], FSG_MODEL_T_DECL) == 0)
            || (strcmp(tokens[0], FSG_MODEL_TRANSITION_DECL) == 0);
        if (!is_trans) {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    line_no);
            goto parse_error;
        }

        ok = ((n_tok == 4) || (n_tok == 5))
            && (sscanf(tokens[1], "%d", &from) == 1)
            && (sscanf(tokens[2], "%d", &to) == 1)
            && (from >= 0) && (from < model->n_state)
            && (to >= 0) && (to < model->n_state);
        if (!ok) {
            E_ERROR
                ("Line[%d]: transition spec malformed; Expecting: from-state to-state trans-prob [word]\n",
                 line_no);
            goto parse_error;
        }

        prob = atof_c(tokens[3]);
        if ((prob <= 0.0) || (prob > 1.0)) {
            E_ERROR
                ("Line[%d]: transition spec malformed; Expecting float as transition probability\n",
                 line_no);
            goto parse_error;
        }

        tprob = (int32) (logmath_log(lmath, prob) * model->lw);

        if (n_tok <= 4) {
            /* No word on the line: this is a null transition. */
            if (fsg_model_null_trans_add(model, from, to, tprob) == 1) {
                ++n_null_trans;
                null_list =
                    glist_add_ptr(null_list,
                                  fsg_model_null_trans(model, from, to));
            }
            continue;
        }

        /* Word transition: assign a new ID on first sight of the word. */
        if (hash_table_lookup_int32(words, tokens[4], &wid) < 0) {
            (void) hash_table_enter_int32(words, ckd_salloc(tokens[4]),
                                          next_wid);
            wid = next_wid++;
        }
        fsg_model_trans_add(model, from, to, tprob, wid);
        ++n_trans;
    }

    E_INFO("FSG: %d states, %d unique words, %d transitions (%d null)\n",
           model->n_state, hash_table_inuse(words), n_trans, n_null_trans);

    /* Turn the temporary word table into the model's string table. */
    model->n_word = hash_table_inuse(words);
    model->n_word_alloc = model->n_word + 10;   /* Pad it a bit. */
    model->vocab = ckd_calloc(model->n_word_alloc, sizeof(*model->vocab));
    it = hash_table_iter(words);
    while (it) {
        char const *key = hash_entry_key(it->ent);
        int32 id = (int32) (long) hash_entry_val(it->ent);

        /* The key strings are handed over to the model, not copied. */
        model->vocab[id] = (char *) key;
        it = hash_table_iter_next(it);
    }
    hash_table_free(words);

    /* Transitive closure over the null transitions collected above. */
    null_list = fsg_model_null_trans_closure(model, null_list);
    glist_free(null_list);

    ckd_free(line);
    ckd_free(tokens);

    return model;

  parse_error:
    /* The word strings were never handed to the model; free them here. */
    for (it = hash_table_iter(words); it; it = hash_table_iter_next(it))
        ckd_free((char *) hash_entry_key(it->ent));
    glist_free(null_list);
    hash_table_free(words);
    ckd_free(name);
    ckd_free(line);
    ckd_free(tokens);
    fsg_model_free(model);
    return NULL;
}
/**
 * Read one N-gram entry from the next line of an ARPA file.
 *
 * The line iterator is advanced first.  The line is trimmed and split
 * into words: the first is the log10 probability, the next `order`
 * words are the N-gram itself and, for orders below order_max, an
 * optional trailing word is the log10 backoff weight.
 *
 * On success raw_ngram->weights is allocated with one entry (highest
 * order) or two entries (probability, backoff) and raw_ngram->words is
 * allocated with `order` word IDs stored in reverse order (the last
 * word of the N-gram at index 0).  Positive probabilities are reported
 * and replaced by zero.
 *
 * If the iterator is exhausted an error is logged; if the line has too
 * few fields a warning is logged (unless the line is empty).  In both
 * cases raw_ngram is left untouched.
 *
 * @param li         In/out line iterator; set to NULL at end of input.
 * @param wid        Word-string to word-ID hash table.
 * @param lmath      Log-math object used for log10 conversion.
 * @param order      Order of the N-gram to read.
 * @param order_max  Highest order of the model.
 * @param raw_ngram  Output record.
 */
static void
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];

    /* A previous call may already have hit end of input and left *li
     * NULL; do not hand a NULL iterator to lineiter_next(). */
    if (*li)
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return;
    }

    string_trim((*li)->buf, STRING_BOTH);
    n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1);
    if (n < order + 1) {
        /* Need the probability plus `order` words; blank lines are
         * skipped silently. */
        if ((*li)->buf[0] != '\0') {
            E_WARN("Format error; %d-gram ignored: %s\n", order,
                   (*li)->buf);
        }
        return;
    }

    if (order == order_max) {
        /* Highest order: probability only, no backoff weight. */
        raw_ngram->weights =
            (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
        raw_ngram->weights[0] = atof_c(wptr[0]);
        if (raw_ngram->weights[0] > 0) {
            E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
                   order, wptr[1]);
            raw_ngram->weights[0] = 0.0f;
        }
        raw_ngram->weights[0] =
            logmath_log10_to_log_float(lmath, raw_ngram->weights[0]);
    }
    else {
        float weight, backoff;

        raw_ngram->weights =
            (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
                   order, wptr[1]);
            raw_ngram->weights[0] = 0.0f;
        }
        else {
            raw_ngram->weights[0] =
                logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            /* No backoff column on this line. */
            raw_ngram->weights[1] = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->weights[1] =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }

    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    /* Store the IDs reversed: wptr[1] goes to words[order - 1] and
     * wptr[order] to words[0].  Indexing keeps every address inside the
     * array; a down-counting pointer would step before its start.
     * NOTE(review): the lookup result is ignored, as before; presumably
     * a word missing from `wid` keeps the zero written by ckd_calloc --
     * confirm against the hash table API. */
    for (i = 0; i < order; i++) {
        hash_table_lookup_int32(wid, wptr[i + 1],
                                (int32 *) &raw_ngram->words[order - 1 - i]);
    }
}
/**
 * Append a word and its pronunciation to the dictionary.
 *
 * The word array grows by S3DICT_INC_SZ entries when full.  If
 * dict_word2basestr() reports that the word has a base form, that base
 * form must already be in the dictionary; the new entry is then linked
 * at the head of the base word's list of alternatives.
 *
 * The base form is resolved BEFORE the word is entered into the hash
 * table.  A failed base lookup therefore leaves no hash entry behind
 * whose key string has been freed, and no phone array is leaked.
 *
 * @param d     Dictionary to extend.
 * @param word  Word string; a private copy is stored (freed in
 *              dict_free).
 * @param p     Phone ID sequence, may be NULL.
 * @param np    Number of entries in p.
 * @return The ID of the new word, or BAD_S3WID if the base word is
 *         missing or hash_table_enter_int32() does not return the new
 *         ID (in both cases the slot's word pointer is set to NULL and
 *         the word count is unchanged).
 */
s3wid_t
dict_add_word(dict_t * d, char const *word, s3cipid_t const * p, int32 np)
{
    dictword_t *wordp;
    s3wid_t newwid;
    char *wword;
    int32 base_w = 0;
    int has_base = 0;

    if (d->n_word >= d->max_words) {
        /* The size expression has type size_t; cast it to match %d. */
        E_INFO("Reallocating to %d KiB for word entries\n",
               (int) ((d->max_words +
                       S3DICT_INC_SZ) * sizeof(dictword_t) / 1024));
        d->word =
            (dictword_t *) ckd_realloc(d->word,
                                       (d->max_words +
                                        S3DICT_INC_SZ) * sizeof(dictword_t));
        d->max_words = d->max_words + S3DICT_INC_SZ;
    }

    wordp = d->word + d->n_word;

    /* Determine the base word, if any, before touching the hash table.
     * NOTE(review): this assumes the base string produced by
     * dict_word2basestr() always differs from `word`, so looking it up
     * before or after inserting `word` gives the same result -- confirm. */
    wword = ckd_salloc(word);
    if (dict_word2basestr(wword) > 0) {
        /* Truncated to a baseword string; find its ID */
        if (hash_table_lookup_int32(d->ht, wword, &base_w) < 0) {
            E_ERROR("Missing base word for: %s\n", word);
            ckd_free(wword);
            wordp->word = NULL;
            return BAD_S3WID;
        }
        has_base = 1;
    }
    ckd_free(wword);

    wordp->word = (char *) ckd_salloc(word);    /* Freed in dict_free */

    /* Associate word string with d->n_word in hash table */
    if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) != d->n_word) {
        ckd_free(wordp->word);
        wordp->word = NULL;
        return BAD_S3WID;
    }

    /* Fill in word entry */
    if (p && (np > 0)) {
        wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t));      /* Freed in dict_free */
        memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t));
        wordp->pronlen = np;
    }
    else {
        wordp->ciphone = NULL;
        wordp->pronlen = 0;
    }

    if (has_base) {
        /* Link into the head of the base word's alt list */
        wordp->basewid = base_w;
        wordp->alt = d->word[base_w].alt;
        d->word[base_w].alt = d->n_word;
    }
    else {
        /* A word without a base form is its own base. */
        wordp->alt = BAD_S3WID;
        wordp->basewid = d->n_word;
    }

    newwid = d->n_word++;

    return newwid;
}
/**
 * Read one N-gram entry from the next line of an ARPA file.
 *
 * The line iterator is advanced (when it is not already NULL) and the
 * line is split into words: the first is the log10 probability, the
 * next `order` words are the N-gram itself and, for orders below
 * order_max, an optional trailing word is the log10 backoff weight.
 *
 * On success raw_ngram->order and raw_ngram->prob are set,
 * raw_ngram->backoff is set for orders below order_max (it is not
 * written for the highest order), and raw_ngram->words is allocated
 * with `order` word IDs stored in reverse order (the last word of the
 * N-gram at index 0).  Positive probabilities are reported and replaced
 * by zero.
 *
 * @param li         In/out line iterator; set to NULL at end of input.
 * @param wid        Word-string to word-ID hash table.
 * @param lmath      Log-math object used for log10 conversion.
 * @param order      Order of the N-gram to read.
 * @param order_max  Highest order of the model.
 * @param raw_ngram  Output record.
 * @return 0 on success, -1 at end of input or when the line has fewer
 *         than order + 1 fields (raw_ngram is untouched in that case).
 */
static int
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int words_expected;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];

    if (*li)
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return -1;
    }

    /* Probability plus `order` words are the minimum. */
    words_expected = order + 1;
    n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1);
    if (n < words_expected) {
        E_ERROR("Format error; %d-gram ignored: %s\n", order,
                (*li)->buf);
        return -1;
    }

    raw_ngram->order = order;

    if (order == order_max) {
        /* Highest order: probability only, no backoff weight. */
        raw_ngram->prob = atof_c(wptr[0]);
        if (raw_ngram->prob > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order,
                   wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        raw_ngram->prob =
            logmath_log10_to_log_float(lmath, raw_ngram->prob);
    }
    else {
        float weight, backoff;

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order,
                   wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        else {
            raw_ngram->prob = logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            /* No backoff column on this line. */
            raw_ngram->backoff = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->backoff =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }

    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    /* Store the IDs reversed: wptr[1] goes to words[order - 1] and
     * wptr[order] to words[0].  Indexing keeps every address inside the
     * array; a down-counting pointer would step before its start.
     * NOTE(review): the lookup result is ignored, as before; presumably
     * a word missing from `wid` keeps the zero written by ckd_calloc --
     * confirm against the hash table API. */
    for (i = 0; i < order; i++) {
        hash_table_lookup_int32(wid, wptr[i + 1],
                                (int32 *) &raw_ngram->words[order - 1 - i]);
    }

    return 0;
}