lmset_t * lmset_read_ctl(const char *ctlfile, dict_t * dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath) { FILE *ctlfp; FILE *tmp; char lmfile[4096], lmname[4096], str[4096]; lmclass_set_t *lmclass_set; lmclass_t **lmclass, *cl; int32 n_lmclass, n_lmclass_used; int32 i; lm_t *lm; lmset_t *lms = NULL; tmp = NULL; E_INFO("Reading LM control file '%s'\n", ctlfile); if ((ctlfp = fopen(ctlfile, "r")) == NULL) { E_ERROR_SYSTEM("Failed to open LM control file"); return NULL; } lmclass_set = lmclass_newset(); lms = (lmset_t *) ckd_calloc(1, sizeof(lmset_t)); lms->n_lm = 0; lms->n_alloc_lm = 0; if (fscanf(ctlfp, "%s", str) == 1) { if (strcmp(str, "{") == 0) { /* Load LMclass files */ while ((fscanf(ctlfp, "%s", str) == 1) && (strcmp(str, "}") != 0)) lmclass_set = lmclass_loadfile(lmclass_set, str, logmath); if (strcmp(str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", ctlfile); if (fscanf(ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; /* Fill in dictionary word id information for each LMclass word */ for (cl = lmclass_firstclass(lmclass_set); lmclass_isclass(cl); cl = lmclass_nextclass(lmclass_set, cl)) { /* For every words in the class, set the dictwid correctly The following piece of code replace s2's kb_init_lmclass_dictwid (cl); doesn't do any checking even the id is a bad dict id. This only sets the information in the lmclass_set, but not lm-2-dict or dict-2-lm map. In Sphinx 3, they are done in wid_dict_lm_map in wid.c. */ lmclass_word_t *w; int32 wid; for (w = lmclass_firstword(cl); lmclass_isword(w); w = lmclass_nextword(cl, w)) { wid = dict_wordid(dict, lmclass_getword(w)); #if 0 E_INFO("In class %s, Word %s, wid %d\n", cl->name, lmclass_getword(w), wid); #endif lmclass_set_dictwid(w, wid); } } /* At this point if str[0] != '\0', we have an LM filename */ n_lmclass = lmclass_get_nclass(lmclass_set); lmclass = (lmclass_t **) ckd_calloc(n_lmclass, sizeof(lmclass_t *)); E_INFO("Number of LM class specified %d in file %s\n", n_lmclass, ctlfile); /* Read in one LM at a time */ while (str[0] != '\0') { strcpy(lmfile, str); if (fscanf(ctlfp, "%s", lmname) != 1) E_FATAL("LMname missing after LMFileName '%s'\n", lmfile); n_lmclass_used = 0; if (fscanf(ctlfp, "%s", str) == 1) { if (strcmp(str, "{") == 0) { while ((fscanf(ctlfp, "%s", str) == 1) && (strcmp(str, "}") != 0)) { if (n_lmclass_used >= n_lmclass) { E_FATAL("Too many LM classes specified for '%s'\n", lmfile); } lmclass[n_lmclass_used] = lmclass_get_lmclass(lmclass_set, str); if (!(lmclass_isclass(lmclass[n_lmclass_used]))) E_FATAL("LM class '%s' not found\n", str); n_lmclass_used++; } if (strcmp(str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", ctlfile); if (fscanf(ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; lm = (lm_t *) lm_read_advance(lmfile, lmname, lw, wip, uw, dict_size(dict), NULL, 1, logmath, FALSE, FALSE); if (n_lmclass_used > 0) { E_INFO("Did I enter here?\n"); lm_build_lmclass_info(lm, lw, uw, wip, n_lmclass_used, lmclass); } if (lms->n_lm == lms->n_alloc_lm) { lms->lmarray = (lm_t **) ckd_realloc(lms->lmarray, (lms->n_alloc_lm + LM_ALLOC_BLOCK) * sizeof(lm_t *)); lms->n_alloc_lm += LM_ALLOC_BLOCK; } lms->lmarray[lms->n_lm] = lm; lms->n_lm += 1; E_INFO("%d %d\n", lms->n_alloc_lm, lms->n_lm); } assert(lms); assert(lms->lmarray); E_INFO("No. of LM set allocated %d, no. of LM %d \n", lms->n_alloc_lm, lms->n_lm); if (dict != NULL) { for (i = 0; i < lms->n_lm; i++) { assert(lms->lmarray[i]); assert(dict); if ((lms->lmarray[i]->dict2lmwid = wid_dict_lm_map(dict, lms->lmarray[i], lw)) == NULL) E_FATAL ("Dict/LM word-id mapping failed for LM index %d, named %s\n", i, lmset_idx_to_name(lms, i)); } } else { E_FATAL ("Dict is specified to be NULL (dict_init is not called before lmset_read_lm?), dict2lmwid is not built inside lmset_read_lm\n"); } ckd_free(lmclass_set); ckd_free(lmclass); fclose(ctlfp); return lms; }
void kb (int argc, char *argv[], float ip, /* word insertion penalty */ float lw, /* langauge weight */ float pip) /* phone insertion penalty */ { char *pname = argv[0]; char hmm_file_name[256]; int32 num_phones, num_ci_phones; int32 i, use_darpa_lm; /* FIXME: This is evil. But if we do it, let's prototype it somewhere, OK? */ unlimit (); /* Remove memory size limits */ language_weight = lw; insertion_penalty = ip; phone_insertion_penalty = pip; pconf (argc, argv, kb_param, 0, 0, 0); if ((phone_file_name == 0) || (dict_file_name == 0)) pusage (pname, (Config_t *)kb_param); log_info("%s(%d): Reading phone file [%s]\n", __FILE__, __LINE__, phone_file_name); if (phone_read (phone_file_name)) exit (-1); if (useWDPhonesOnly) phone_add_diphones(); num_ci_phones = phoneCiCount(); /* Read the distribution map file */ log_info("%s(%d): Reading map file [%s]\n", __FILE__, __LINE__, mapFileName); read_map (mapFileName, TRUE /* useCiTrans compress */); log_info("%s(%d): Reading dict file [%s]\n", __FILE__, __LINE__, dict_file_name); word_dict = dict_new (); if (dict_read (word_dict, dict_file_name, phrase_dict_file_name, noise_dict_file_name, !useWDPhonesOnly)) exit (-1); use_darpa_lm = TRUE; if (use_darpa_lm) { lmSetStartSym (lm_start_sym); lmSetEndSym (lm_end_sym); /* * Read control file describing multiple LMs, if specified. * File format (optional stuff is indicated by enclosing in []): * * [{ LMClassFileName LMClassFilename ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * ... * (There should be whitespace around the { and } delimiters.) * * This is an extension of the older format that had only TrigramLMFilenName * and LMName pairs. The new format allows a set of LMClass files to be read * in and referred to by the trigram LMs. (Incidentally, if one wants to use * LM classes in a trigram LM, one MUST use the -lmctlfn flag. It is not * possible to read in a class-based trigram LM using the -lmfn flag.) * * No "comments" allowed in this file. */ if (lm_ctl_filename) { FILE *ctlfp; char lmfile[4096], lmname[4096], str[4096]; lmclass_set_t lmclass_set; lmclass_t *lmclass, cl; int32 n_lmclass, n_lmclass_used; lmclass_set = lmclass_newset(); E_INFO("Reading LM control file '%s'\n", lm_ctl_filename); ctlfp = CM_fopen (lm_ctl_filename, "r"); if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { /* Load LMclass files */ while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) lmclass_set = lmclass_loadfile (lmclass_set, str); if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", lm_ctl_filename); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; /* Fill in dictionary word id information for each LMclass word */ for (cl = lmclass_firstclass(lmclass_set); lmclass_isclass(cl); cl = lmclass_nextclass(lmclass_set, cl)) { kb_init_lmclass_dictwid (cl); } /* At this point if str[0] != '\0', we have an LM filename */ n_lmclass = lmclass_get_nclass(lmclass_set); lmclass = (lmclass_t *) CM_calloc (n_lmclass, sizeof(lmclass_t)); /* Read in one LM at a time */ while (str[0] != '\0') { strcpy (lmfile, str); if (fscanf (ctlfp, "%s", lmname) != 1) E_FATAL("LMname missing after LMFileName '%s'\n", lmfile); n_lmclass_used = 0; if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { /* LM uses classes; read their names */ while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) { if (n_lmclass_used >= n_lmclass) E_FATAL("Too many LM classes specified for '%s'\n", lmfile); lmclass[n_lmclass_used] = lmclass_get_lmclass (lmclass_set, str); if (! (lmclass_isclass(lmclass[n_lmclass_used]))) E_FATAL("LM class '%s' not found\n", str); n_lmclass_used++; } if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", lm_ctl_filename); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; if (n_lmclass_used > 0) lm_read_clm (lmfile, lmname, language_weight, unigramWeight, insertion_penalty, lmclass, n_lmclass_used); else lm_read (lmfile, lmname, language_weight, unigramWeight, insertion_penalty); } fclose (ctlfp); NoLangModel = FALSE; } /* Read "base" LM file, if specified */ if (lm_file_name) { lmSetStartSym (lm_start_sym); lmSetEndSym (lm_end_sym); lm_read (lm_file_name, "", language_weight, unigramWeight, insertion_penalty); /* Make initial OOV list known to this base LM */ lm_init_oov (); NoLangModel = FALSE; } #ifdef USE_ILM /* Init ILM module (non-std-Darpa LM, eg ug/bg cache LM) */ ilm_init (); #endif } #if 0 /* Compute the phrase lm probabilities */ computePhraseLMProbs (); #endif num_phones = phone_count (); numSmds = hmm_num_sseq(); smds = (SMD *) CM_calloc (numSmds, sizeof (SMD)); /* * Read the hmm's into the SMD structures */ if (useBigHmmFiles) { for (i = 0; i < num_ci_phones; i++) { sprintf (hmm_file_name, "%s.%s", phone_from_id (i), hmm_ext); hmm_tied_read_big_bin (hmm_dir_list, hmm_file_name, smds, transSmooth, NUMOFCODEENTRIES, TRUE, transWeight); } } else { for (i = 0; i < num_phones; i++) { if ((!useCiTrans) || (phone_id_to_base_id(i) == i)) { sprintf (hmm_file_name, "%s.%s", phone_from_id (i), hmm_ext); hmm_tied_read_bin (hmm_dir_list, hmm_file_name, &smds[hmm_pid2sid(i)], transSmooth, NUMOFCODEENTRIES, TRUE, transWeight); } } } /* * Use Ci transitions ? */ if (useCiTrans) { for (i = 0; i < num_phones; i++) { if (hmm_pid2sid(phone_id_to_base_id(i)) != hmm_pid2sid(i)) { /* * Just make a copy of the CI phone transitions */ memcpy (&smds[hmm_pid2sid(i)], &smds[hmm_pid2sid(phone_id_to_base_id(i))], sizeof (SMD)); } } } /* * Read the distributions */ read_dists (hmm_dir, code1_ext, code2_ext, code3_ext, code4_ext, NUMOFCODEENTRIES, hmm_smooth_min, useCiPhonesOnly); if (Use8BitSenProb) SCVQSetSenoneCompression (8); /* * Map the distributions to the correct locations */ remap (smds); }