/**
 * Read one weight table from a DMP file and use it to resolve the weight
 * indexes stored in raw n-grams.
 *
 * The table is read as a 32-bit entry count followed by that many
 * dmp_weight_t records. Each record is byte-swapped if requested and then
 * converted with logmath_log10_to_log_float(). Afterwards, for every raw
 * n-gram, weights[weight_idx] (which holds a table index at this point) is
 * replaced by the corresponding converted table value.
 *
 * @param fp          stream positioned at the start of the weight table
 * @param lmath       log-math object used for the conversion
 * @param do_swap     non-zero if the 32-bit values need byte swapping
 * @param counts      number of entries in raw_ngrams
 * @param raw_ngrams  n-grams whose weight slot is rewritten in place
 * @param weight_idx  which slot of each n-gram's weights array to resolve
 *
 * The function cannot return a status. On a short read or an invalid table
 * size it logs an error and leaves raw_ngrams untouched; an n-gram whose
 * index falls outside the table is logged and left unresolved.
 */
static void
read_dmp_weight_array(FILE * fp, logmath_t * lmath, uint8 do_swap,
                      int32 counts, ngram_raw_t * raw_ngrams,
                      int weight_idx)
{
    int32 i;
    int32 k;
    dmp_weight_t *tmp_weight_arr;

    if (fread(&k, sizeof(k), 1, fp) != 1) {
        E_ERROR("Failed to read weight array size\n");
        return;
    }
    if (do_swap)
        SWAP_INT32(&k);
    /* The size comes from the file; a negative value is never valid. */
    if (k < 0) {
        E_ERROR("Invalid weight array size %d\n", k);
        return;
    }

    tmp_weight_arr =
        (dmp_weight_t *) ckd_calloc(k, sizeof(*tmp_weight_arr));
    if (fread(tmp_weight_arr, sizeof(*tmp_weight_arr), k, fp) !=
        (size_t) k) {
        E_ERROR("Failed to read %d weights\n", k);
        ckd_free(tmp_weight_arr);
        return;
    }

    for (i = 0; i < k; i++) {
        if (do_swap)
            SWAP_INT32(&tmp_weight_arr[i].l);
        /* Convert values to log. */
        tmp_weight_arr[i].f =
            logmath_log10_to_log_float(lmath, tmp_weight_arr[i].f);
    }

    /* Replace indexes with real values in the raw n-grams. */
    for (i = 0; i < counts; i++) {
        int idx = (int) raw_ngrams[i].weights[weight_idx];

        /* The index is file-derived; never read outside the table. */
        if (idx < 0 || idx >= k) {
            E_ERROR("Weight index %d out of range (table size %d)\n",
                    idx, k);
            continue;
        }
        raw_ngrams[i].weights[weight_idx] = tmp_weight_arr[idx].f;
    }

    ckd_free(tmp_weight_arr);
}
/**
 * Advance to the next line of an ARPA file and parse it as one n-gram.
 *
 * The line is trimmed and split into words: the first word is the log10
 * probability, the next `order` words are the n-gram's words, and an
 * optional trailing word is the log10 backoff weight.
 *
 * @param li         in/out line iterator; advanced by one line, and set to
 *                   NULL when the input is exhausted
 * @param wid        hash table used to look up the id of each word
 * @param lmath      log-math object used to convert log10 values
 * @param order      order of the n-gram being read
 * @param order_max  highest order in the model; n-grams of that order get
 *                   a single weight (no backoff slot)
 * @param raw_ngram  output; on success its weights and words arrays are
 *                   allocated here with ckd_calloc
 *
 * On end of input an error is logged; on a line with too few words a
 * warning is logged unless the line is empty. In both cases raw_ngram is
 * left unmodified.
 */
static void
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];

    /* An earlier call may already have run off the end of the file;
     * do not hand a NULL iterator to lineiter_next(). */
    if (*li != NULL)
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return;
    }
    string_trim((*li)->buf, STRING_BOTH);

    /* Probability plus `order` words is the minimum acceptable line. */
    n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1);
    if (n < order + 1) {
        if ((*li)->buf[0] != '\0') {
            E_WARN("Format error; %d-gram ignored: %s\n", order,
                   (*li)->buf);
        }
        return;
    }

    if (order == order_max) {
        /* Highest order: probability only. */
        raw_ngram->weights =
            (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
        raw_ngram->weights[0] = atof_c(wptr[0]);
        if (raw_ngram->weights[0] > 0) {
            E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
                   order, wptr[1]);
            raw_ngram->weights[0] = 0.0f;
        }
        raw_ngram->weights[0] =
            logmath_log10_to_log_float(lmath, raw_ngram->weights[0]);
    }
    else {
        /* Lower orders: probability in slot 0, backoff in slot 1. */
        float weight, backoff;

        raw_ngram->weights =
            (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
                   order, wptr[1]);
            raw_ngram->weights[0] = 0.0f;
        }
        else {
            raw_ngram->weights[0] =
                logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            /* No backoff column on this line. */
            raw_ngram->weights[1] = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->weights[1] =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }

    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    /* Words are stored in reverse: wptr[1] goes to the last slot and
     * wptr[order] to slot 0. An index loop is used so that no pointer is
     * ever formed before the start of the array.
     * NOTE(review): the lookup result is ignored; presumably a word that
     * is missing from wid leaves its slot at the value ckd_calloc gave
     * it -- confirm whether unknown words should be reported. */
    for (i = 0; i < order; i++) {
        hash_table_lookup_int32(wid, wptr[i + 1],
                                (int32 *) &raw_ngram->words[order - 1 - i]);
    }
}
/**
 * Read the unigram section of an ARPA file.
 *
 * Skips forward to the "\1-grams:" marker, then reads `count` lines. Each
 * line must split into at least two words (log10 probability and word); a
 * third word, when present, is the log10 backoff weight. Converted values
 * are stored in unigrams[i], a copy of the word in base->word_str[i], and
 * finally every word is entered into base->wid with its index as value.
 *
 * @param li        in/out line iterator over the ARPA file
 * @param count     number of unigrams to read
 * @param base      model providing lmath, word_str and wid
 * @param unigrams  output array with room for `count` entries
 * @return 0 on success, -1 if the marker is missing, the input ends early
 *         or a line has too few words
 */
static int
read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
                 unigram_t * unigrams)
{
    char *fields[3];
    uint32 idx;

    /* Advance until the current line is the section marker. */
    for (; *li != NULL; *li = lineiter_next(*li)) {
        if (strcmp((*li)->buf, "\\1-grams:") == 0)
            break;
    }
    if (*li == NULL) {
        E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
        return -1;
    }

    for (idx = 0; idx < count; ++idx) {
        unigram_t *entry;
        int n_fields;

        *li = lineiter_next(*li);
        if (*li == NULL) {
            E_ERROR
                ("Unexpected end of ARPA file. Failed to read %dth unigram\n",
                 idx + 1);
            return -1;
        }

        n_fields = str2words((*li)->buf, fields, 3);
        if (n_fields < 2) {
            E_ERROR("Format error at line %s, Failed to read unigrams\n",
                    (*li)->buf);
            return -1;
        }

        entry = &unigrams[idx];
        entry->prob =
            logmath_log10_to_log_float(base->lmath, atof_c(fields[0]));
        if (entry->prob > 0) {
            E_WARN("Unigram '%s' has positive probability\n", fields[1]);
            entry->prob = 0;
        }

        /* The backoff column is optional. */
        if (n_fields != 3) {
            entry->bo = 0.0f;
        }
        else {
            entry->bo =
                logmath_log10_to_log_float(base->lmath,
                                           atof_c(fields[2]));
        }

        /* TODO: classify float with fpclassify and warn if bad value occurred */
        base->word_str[idx] = ckd_salloc(fields[1]);
    }

    /* Fill the hash table that maps unigram names to their word ids. A
     * return value other than the id just passed in is reported as a
     * duplicate word. */
    for (idx = 0; idx < count; ++idx) {
        void *id = (void *) (long) idx;

        if (hash_table_enter(base->wid, base->word_str[idx], id) != id) {
            E_WARN("Duplicate word in dictionary: %s\n",
                   base->word_str[idx]);
        }
    }

    return 0;
}
ngram_model_t * ngram_model_trie_read_dmp(cmd_ln_t * config, const char *file_name, logmath_t * lmath) { uint8 do_swap; int32 is_pipe; int32 k; uint32 j; int32 vn, ts; int32 count; uint32 counts[3]; uint32 fixed_counts[3]; uint32 *unigram_next; int i, order; char str[1024]; FILE *fp; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; E_INFO("Trying to read LM in DMP format\n"); if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { E_ERROR("Dump file %s not found\n", file_name); return NULL; } do_swap = FALSE; fread(&k, sizeof(k), 1, fp); if (k != strlen(dmp_hdr) + 1) { SWAP_INT32(&k); if (k != strlen(dmp_hdr) + 1) { E_ERROR ("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); return NULL; } do_swap = 1; } if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read header\n"); return NULL; } if (strncmp(str, dmp_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr); return NULL; } if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read LM filename in header\n"); return NULL; } /* read version#, if present (must be <= 0) */ if (fread(&vn, sizeof(vn), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ if (fread(&ts, sizeof(ts), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Failed to read word\n"); return NULL; } } /* read model->ucount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[0] = count; } else { counts[0] = vn; } /* read model->bcount, tcount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[1] = count; if 
(fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[2] = count; E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]); model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; if (counts[2] > 0) order = 3; else if (counts[1] > 0) order = 2; else order = 1; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); model->trie = lm_trie_create(counts[0], order); unigram_next = (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next)); for (j = 0; j <= (int32) counts[0]; j++) { int32 bigrams; dmp_weight_t weight; /* Skip over the mapping ID, we don't care about it. */ fread(&bigrams, sizeof(int32), 1, fp); /* Read the weights from actual unigram structure. */ fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].prob = weight.f; fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].bo = weight.f; //store pointer to dmp next to recognize wid fread(&bigrams, sizeof(int32), 1, fp); if (do_swap) SWAP_INT32(&bigrams); model->trie->unigrams[j].next = bigrams; unigram_next[j] = bigrams; } if (order > 1) { raw_ngrams = ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next, do_swap); ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order); for (i = 0; i < order; i++) { base->n_counts[i] = fixed_counts[i]; } //build reversed trie lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts, order); lm_trie_build(model->trie, raw_ngrams, counts, order); counts[1]++; //free raw ngrams ngrams_raw_free(raw_ngrams, counts, order); } ckd_free(unigram_next); /* read ascii word strings */ read_word_str(base, fp); fclose_comp(fp, is_pipe); return base; }
/**
 * Advance to the next line of an ARPA file and parse it as one n-gram.
 *
 * The line is split into words: the first word is the log10 probability,
 * the next `order` words are the n-gram's words, and an optional trailing
 * word is the log10 backoff weight.
 *
 * @param li         in/out line iterator; advanced by one line, and set to
 *                   NULL when the input is exhausted
 * @param wid        hash table used to look up the id of each word
 * @param lmath      log-math object used to convert log10 values
 * @param order      order of the n-gram being read
 * @param order_max  highest order in the model; n-grams of that order have
 *                   no backoff weight assigned here
 * @param raw_ngram  output; order, prob, backoff (for lower orders) and a
 *                   words array allocated with ckd_calloc are filled in
 * @return 0 on success, -1 on end of input or a line with too few words
 */
static int
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];

    if (*li)
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return -1;
    }

    /* Probability plus `order` words is the minimum acceptable line. */
    n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1);
    if (n < order + 1) {
        E_ERROR("Format error; %d-gram ignored: %s\n", order,
                (*li)->buf);
        return -1;
    }

    raw_ngram->order = order;

    if (order == order_max) {
        /* Highest order: probability only. */
        raw_ngram->prob = atof_c(wptr[0]);
        if (raw_ngram->prob > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order,
                   wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        raw_ngram->prob =
            logmath_log10_to_log_float(lmath, raw_ngram->prob);
    }
    else {
        float weight, backoff;

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order,
                   wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        else {
            raw_ngram->prob = logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            /* No backoff column on this line. */
            raw_ngram->backoff = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->backoff =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }

    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    /* Words are stored in reverse: wptr[1] goes to the last slot and
     * wptr[order] to slot 0. An index loop is used so that no pointer is
     * ever formed before the start of the array.
     * NOTE(review): the lookup result is ignored; presumably a word that
     * is missing from wid leaves its slot at the value ckd_calloc gave
     * it -- confirm whether unknown words should be reported. */
    for (i = 0; i < order; i++) {
        hash_table_lookup_int32(wid, wptr[i + 1],
                                (int32 *) &raw_ngram->words[order - 1 - i]);
    }

    return 0;
}