lmclass_set_t lmclass_newset ( void ) { lmclass_set_t set; set = (lmclass_set_t) CM_calloc (1, sizeof(struct lmclass_set_s)); set->lmclass_list = NULL; return set; }
static void mk_phone_map (void) { int32 numPhones = phone_count(); int32 numPhoneMappings = 0; /* will be bogus without VERBOSE_PHONE_MAP! */ int32 pid; #ifdef VERBOSE_PHONE_MAP int32 pid1, pid2; int32 numCiPhones = phoneCiCount(); int32 numVocPhones = sizeof(voc)/sizeof(char *); int32 numUstPhones = sizeof(ust)/sizeof(char *); int32 op_id, res_id; char op[32], res[32]; #endif /* * In case we get called twice. */ if (PhoneMap) free (PhoneMap); PhoneMap = (int32 *) CM_calloc (numPhones, sizeof(int32)); for (pid = 0; pid < numPhones; pid++) PhoneMap[pid] = pid; #ifdef VERBOSE_PHONE_MAP for (pid = 0; pid < numCiPhones; pid++) { sprintf (op, "TD(%s,Y)e", phone_from_id(pid)); sprintf (res, "JH(%s,Y)e", phone_from_id(pid)); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "DD(%s,Y)e", phone_from_id(pid)); sprintf (res, "JH(%s,Y)e", phone_from_id(pid)); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } } for (pid1 = 0; pid1 < numVocPhones; pid1++) { for (pid2 = 0; pid2 < numUstPhones; pid2++) { sprintf (op, "TD(%s,%s)e", voc[pid1], ust[pid2]); sprintf (res, "DX(%s,%s)", voc[pid1], ust[pid2]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "DD(%s,%s)e", voc[pid1], ust[pid2]); sprintf (res, "D(%s,%s)", voc[pid1], ust[pid2]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } } for (pid2 = 0; pid2 < numCiPhones; pid2++) { sprintf (op, "TD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "T(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "DD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "D(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "BD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "B(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "GD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "G(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "KD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "K(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } sprintf (op, "PD(%s,%s)e", phone_from_id(pid2), voc[pid1]); sprintf (res, "P(%s,%s)", phone_from_id(pid2), voc[pid1]); op_id = phone_to_id (op, FALSE); res_id = phone_to_id (res, FALSE); if ((op_id != NO_PHONE) && (res_id != NO_PHONE)) { PhoneMap[op_id] = res_id; numPhoneMappings++; } } } #endif /* VERBOSE_PHONE_MAP */ E_INFO ("Using %d phonological mappings\n", numPhoneMappings); }
void kb (int argc, char *argv[], float ip, /* word insertion penalty */ float lw, /* langauge weight */ float pip) /* phone insertion penalty */ { char *pname = argv[0]; char hmm_file_name[256]; int32 num_phones, num_ci_phones; int32 i, use_darpa_lm; /* FIXME: This is evil. But if we do it, let's prototype it somewhere, OK? */ unlimit (); /* Remove memory size limits */ language_weight = lw; insertion_penalty = ip; phone_insertion_penalty = pip; pconf (argc, argv, kb_param, 0, 0, 0); if ((phone_file_name == 0) || (dict_file_name == 0)) pusage (pname, (Config_t *)kb_param); log_info("%s(%d): Reading phone file [%s]\n", __FILE__, __LINE__, phone_file_name); if (phone_read (phone_file_name)) exit (-1); if (useWDPhonesOnly) phone_add_diphones(); num_ci_phones = phoneCiCount(); /* Read the distribution map file */ log_info("%s(%d): Reading map file [%s]\n", __FILE__, __LINE__, mapFileName); read_map (mapFileName, TRUE /* useCiTrans compress */); log_info("%s(%d): Reading dict file [%s]\n", __FILE__, __LINE__, dict_file_name); word_dict = dict_new (); if (dict_read (word_dict, dict_file_name, phrase_dict_file_name, noise_dict_file_name, !useWDPhonesOnly)) exit (-1); use_darpa_lm = TRUE; if (use_darpa_lm) { lmSetStartSym (lm_start_sym); lmSetEndSym (lm_end_sym); /* * Read control file describing multiple LMs, if specified. * File format (optional stuff is indicated by enclosing in []): * * [{ LMClassFileName LMClassFilename ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * ... * (There should be whitespace around the { and } delimiters.) * * This is an extension of the older format that had only TrigramLMFilenName * and LMName pairs. The new format allows a set of LMClass files to be read * in and referred to by the trigram LMs. (Incidentally, if one wants to use * LM classes in a trigram LM, one MUST use the -lmctlfn flag. It is not * possible to read in a class-based trigram LM using the -lmfn flag.) * * No "comments" allowed in this file. */ if (lm_ctl_filename) { FILE *ctlfp; char lmfile[4096], lmname[4096], str[4096]; lmclass_set_t lmclass_set; lmclass_t *lmclass, cl; int32 n_lmclass, n_lmclass_used; lmclass_set = lmclass_newset(); E_INFO("Reading LM control file '%s'\n", lm_ctl_filename); ctlfp = CM_fopen (lm_ctl_filename, "r"); if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { /* Load LMclass files */ while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) lmclass_set = lmclass_loadfile (lmclass_set, str); if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", lm_ctl_filename); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; /* Fill in dictionary word id information for each LMclass word */ for (cl = lmclass_firstclass(lmclass_set); lmclass_isclass(cl); cl = lmclass_nextclass(lmclass_set, cl)) { kb_init_lmclass_dictwid (cl); } /* At this point if str[0] != '\0', we have an LM filename */ n_lmclass = lmclass_get_nclass(lmclass_set); lmclass = (lmclass_t *) CM_calloc (n_lmclass, sizeof(lmclass_t)); /* Read in one LM at a time */ while (str[0] != '\0') { strcpy (lmfile, str); if (fscanf (ctlfp, "%s", lmname) != 1) E_FATAL("LMname missing after LMFileName '%s'\n", lmfile); n_lmclass_used = 0; if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { /* LM uses classes; read their names */ while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) { if (n_lmclass_used >= n_lmclass) E_FATAL("Too many LM classes specified for '%s'\n", lmfile); lmclass[n_lmclass_used] = lmclass_get_lmclass (lmclass_set, str); if (! (lmclass_isclass(lmclass[n_lmclass_used]))) E_FATAL("LM class '%s' not found\n", str); n_lmclass_used++; } if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", lm_ctl_filename); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; if (n_lmclass_used > 0) lm_read_clm (lmfile, lmname, language_weight, unigramWeight, insertion_penalty, lmclass, n_lmclass_used); else lm_read (lmfile, lmname, language_weight, unigramWeight, insertion_penalty); } fclose (ctlfp); NoLangModel = FALSE; } /* Read "base" LM file, if specified */ if (lm_file_name) { lmSetStartSym (lm_start_sym); lmSetEndSym (lm_end_sym); lm_read (lm_file_name, "", language_weight, unigramWeight, insertion_penalty); /* Make initial OOV list known to this base LM */ lm_init_oov (); NoLangModel = FALSE; } #ifdef USE_ILM /* Init ILM module (non-std-Darpa LM, eg ug/bg cache LM) */ ilm_init (); #endif } #if 0 /* Compute the phrase lm probabilities */ computePhraseLMProbs (); #endif num_phones = phone_count (); numSmds = hmm_num_sseq(); smds = (SMD *) CM_calloc (numSmds, sizeof (SMD)); /* * Read the hmm's into the SMD structures */ if (useBigHmmFiles) { for (i = 0; i < num_ci_phones; i++) { sprintf (hmm_file_name, "%s.%s", phone_from_id (i), hmm_ext); hmm_tied_read_big_bin (hmm_dir_list, hmm_file_name, smds, transSmooth, NUMOFCODEENTRIES, TRUE, transWeight); } } else { for (i = 0; i < num_phones; i++) { if ((!useCiTrans) || (phone_id_to_base_id(i) == i)) { sprintf (hmm_file_name, "%s.%s", phone_from_id (i), hmm_ext); hmm_tied_read_bin (hmm_dir_list, hmm_file_name, &smds[hmm_pid2sid(i)], transSmooth, NUMOFCODEENTRIES, TRUE, transWeight); } } } /* * Use Ci transitions ? */ if (useCiTrans) { for (i = 0; i < num_phones; i++) { if (hmm_pid2sid(phone_id_to_base_id(i)) != hmm_pid2sid(i)) { /* * Just make a copy of the CI phone transitions */ memcpy (&smds[hmm_pid2sid(i)], &smds[hmm_pid2sid(phone_id_to_base_id(i))], sizeof (SMD)); } } } /* * Read the distributions */ read_dists (hmm_dir, code1_ext, code2_ext, code3_ext, code4_ext, NUMOFCODEENTRIES, hmm_smooth_min, useCiPhonesOnly); if (Use8BitSenProb) SCVQSetSenoneCompression (8); /* * Map the distributions to the correct locations */ remap (smds); }