/**
 * Map the given ngram string to an array of word IDs of the individual
 * words in the ngram.
 *
 * args:
 * ngram  - the ngram string to map; it is handed to str2words() for
 *          splitting
 * length - upper bound on the number of words to extract; it is forwarded
 *          to str2words() as its limit argument (clamped to the size of the
 *          internal pointer buffer, see below)
 * w      - the word ID array; receives one LM word ID per word.
 *          NOTE(review): its capacity is not visible here -- the caller
 *          presumably provides at least `length` entries; confirm.
 * lm     - the language model to use
 *
 * returns:
 * the number of words in the ngram string, or 0 if the string contains an
 * unknown word (an error is logged for the first unknown word).  A negative
 * result from str2words() is treated as fatal.
 */
int
ngram2wid(char *ngram, int length, s3lmwid32_t * w, lm_t * lm)
{
    char *word[1024];
    int max_words = (int) (sizeof(word) / sizeof(word[0]));
    int nwd;
    int i;

    /*
     * Never let the splitter write more pointers than word[] can hold,
     * whatever limit the caller asked for.  Without the clamp a `length`
     * above 1024 would allow a write past the end of the stack buffer.
     */
    if (length < max_words)
        max_words = length;

    if ((nwd = str2words(ngram, word, max_words)) < 0)
        E_FATAL("Increase word[] and w[] arrays size\n");

    for (i = 0; i < nwd; i++) {
        w[i] = lm_wid(lm, word[i]);
        if (NOT_LMWID(lm, w[i])) {
            E_ERROR("Unknown word: %s\n", word[i]);
            return 0;
        }
    }

    return nwd;
}
/*
 * Load the LM context (two predecessor words and one successor word) for
 * the given utterance from a corpus.
 *
 * The corpus entry for uttid is parsed as exactly three whitespace-separated
 * tokens: predecessor 0, predecessor 1, successor.  A token of "-" means
 * "no word" and is stored as BAD_WID.  Any other token must be in the
 * dictionary; it is reduced to its base word ID.  If a predecessor is a
 * compound word (n_comp > 0) it is replaced by its trailing component
 * word(s).  A successor, if given, must be FINISH_WORD.
 *
 * args:
 * corp  - corpus to look the utterance up in
 * uttid - utterance ID used as the lookup key
 * pred  - output; pred[0] and pred[1] receive the two predecessor word IDs
 * succ  - output; *succ receives the successor word ID
 *
 * All malformed input is reported through E_FATAL: missing entry, fewer or
 * more than three tokens, out-of-dictionary or out-of-LM words, an illegal
 * successor, or a first predecessor without a second.
 */
void
lmcontext_load(corpus_t *corp, char *uttid, s3wid_t *pred, s3wid_t *succ)
{
    char *str, wd[4096], *strp;
    s3wid_t w[3];
    int32 i, n;
    dict_t *dict;
    s3lmwid_t lwid;

    if ((str = corpus_lookup(corp, uttid)) == NULL)
        E_FATAL("Couldn't find LM context for %s\n", uttid);

    dict = dict_getdict();

    /*
     * Scan one token more than w[] can hold so that trailing garbage after
     * the third token is detected rather than silently ignored.
     */
    strp = str;
    for (i = 0; i < 4; i++) {
        /* Field width keeps the token within wd[4096] (4095 chars + NUL). */
        if (sscanf(strp, "%4095s%n", wd, &n) != 1) {
            if (i < 3)
                E_FATAL("Bad LM context spec for %s: %s\n", uttid, str);
            else
                break;
        }

        /*
         * A fourth token is illegal.  Reject it before touching w[], which
         * has only three elements; previously a fourth token was written to
         * w[3], out of bounds.
         */
        if (i >= 3)
            E_FATAL("Bad LM context spec for %s: %s\n", uttid, str);

        strp += n;

        if (strcmp(wd, "-") == 0) {
            w[i] = BAD_WID;
        }
        else {
            w[i] = dict_wordid(wd);
            if (NOT_WID(w[i]))
                E_FATAL("LM context word (%s) for %s not in dictionary\n",
                        wd, uttid);
            w[i] = dict_basewid(w[i]);

            /* From here on n is reused as the compound component count. */
            switch (i) {
            case 0:
                /* Compound word: keep only its last component. */
                if ((n = dict->word[w[0]].n_comp) > 0)
                    w[0] = dict->word[w[0]].comp[n - 1].wid;
                break;
            case 1:
                /*
                 * Compound word: its last two components become the two
                 * predecessors, overriding whatever token 0 supplied.
                 * NOTE(review): comp[n-2] assumes n_comp >= 2 whenever it
                 * is non-zero -- confirm against the dictionary loader.
                 */
                if ((n = dict->word[w[1]].n_comp) > 0) {
                    w[0] = dict->word[w[1]].comp[n - 2].wid;
                    w[1] = dict->word[w[1]].comp[n - 1].wid;
                }
                break;
            case 2:
                if (w[2] != dict_wordid(FINISH_WORD))
                    E_FATAL("Illegal successor LM context for %s: %s\n",
                            uttid, str);
                break;
            default:
                assert(0);      /* Should never get here */
                break;
            }
        }
    }

    /* A first predecessor without a second one is not a valid history. */
    if (IS_WID(w[0]) && NOT_WID(w[1]))
        E_FATAL("Bad LM context spec for %s: %s\n", uttid, str);

    /* Every real context word must also be known to the LM. */
    for (i = 0; i < 3; i++) {
        if (IS_WID(w[i])) {
            lwid = lm_lmwid(w[i]);
            /*
             * NOTE(review): wd still holds the last token scanned above,
             * which is not necessarily the word for w[i], so this message
             * can name the wrong word.
             */
            if (NOT_LMWID(lwid))
                E_FATAL("LM context word (%s) for %s not in LM\n", wd,
                        uttid);
        }
    }

    pred[0] = w[0];
    pred[1] = w[1];
    *succ = w[2];
}