/**
 * Open a possibly-compressed file for reading, falling back to the
 * alternate (compressed or uncompressed) spelling of its name.
 *
 * The compression type is guessed from the suffix of the name: ".gz"/".GZ"
 * everywhere, and additionally ".Z"/".z" on non-Windows builds.  If the
 * named file cannot be stat()ed:
 *   - a compressed name is retried with its suffix stripped;
 *   - an uncompressed name is retried with ".gz" appended, and then (on
 *     non-Windows builds) with ".Z" appended.
 *
 * @param file   Path to open.
 * @param ispipe Output; set from the suffix of the name and then handed
 *               to fopen_comp().
 * @return The stream returned by fopen_comp(), or NULL if neither the
 *         named file nor any alternate exists.
 */
FILE *
fopen_compchk(char *file, int32 *ispipe)
{
    char tmpfile[16384];
    size_t k;
    int32 isgz;
    struct stat statbuf;

    k = strlen(file);

#if (WIN32)
    *ispipe = (k > 3) && ((strcmp(file + k - 3, ".gz") == 0)
                          || (strcmp(file + k - 3, ".GZ") == 0));
    isgz = *ispipe;
#else
    *ispipe = 0;
    isgz = 0;
    if ((k > 2) && ((strcmp(file + k - 2, ".Z") == 0)
                    || (strcmp(file + k - 2, ".z") == 0))) {
        *ispipe = 1;
    }
    else if ((k > 3) && ((strcmp(file + k - 3, ".gz") == 0)
                         || (strcmp(file + k - 3, ".GZ") == 0))) {
        *ispipe = 1;
        isgz = 1;
    }
#endif

    /* A name that does not fit the scratch buffer cannot be rewritten
     * safely; open it as given and skip the alternate-name search. */
    if (k >= sizeof(tmpfile))
        return fopen_comp(file, "r", ispipe);

    strcpy(tmpfile, file);
    if (stat(tmpfile, &statbuf) != 0) {
        /* File doesn't exist; try other compressed/uncompressed form, as appropriate */
        E_ERROR_SYSTEM("stat(%s) failed\n", tmpfile);

        if (*ispipe) {
            /* Strip the compression suffix. */
            if (isgz)
                tmpfile[k - 3] = '\0';
            else
                tmpfile[k - 2] = '\0';

            if (stat(tmpfile, &statbuf) != 0)
                return NULL;
        }
        else {
            /* Appending the longest suffix (".gz" plus NUL) must still fit. */
            if (k + sizeof(".gz") > sizeof(tmpfile))
                return NULL;

            strcpy(tmpfile + k, ".gz");
            if (stat(tmpfile, &statbuf) != 0) {
#if (! WIN32)
                strcpy(tmpfile + k, ".Z");
                if (stat(tmpfile, &statbuf) != 0)
                    return NULL;
#else
                return NULL;
#endif
            }
        }

        E_WARN("Using %s instead of %s\n", tmpfile, file);
    }

    return (fopen_comp(tmpfile, "r", ispipe));
}
/**
 * Read a language model stored in the trie binary format.
 *
 * Layout as consumed here: the trie_hdr string, a uint8 n-gram order,
 * `order` uint32 n-gram counts, the trie itself (lm_trie_read_bin) and
 * finally the word strings (read_word_str).
 *
 * @param config Unused by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object passed to ngram_model_init().
 * @return The base model on success; NULL if the file cannot be opened,
 *         the header does not match, or the order/count fields are
 *         missing or out of range.
 */
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config, const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    char *hdr;
    int cmp_res;
    uint8 i, order = 0;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    /* Compare the leading bytes against the expected header.  The buffer
     * is zero-filled, so a short read leaves a valid (shorter) string
     * that simply fails the comparison. */
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    cmp_res = (fread(hdr, sizeof(*hdr), hdr_size, fp) != hdr_size)
        || (strcmp(hdr, trie_hdr) != 0);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    /* The order comes from the file and indexes counts[], so it must be
     * validated before use. */
    if (fread(&order, sizeof(order), 1, fp) != 1) {
        E_ERROR("Cannot read n-gram order from %s\n", path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    if (order < 1 || order > NGRAM_MAX_ORDER) {
        E_ERROR("Invalid n-gram order %d in %s\n", (int) order, path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    for (i = 0; i < order; i++) {
        if (fread(&counts[i], sizeof(counts[i]), 1, fp) != 1) {
            E_ERROR("Cannot read %d-gram count from %s\n", (int) i + 1, path);
            fclose_comp(fp, is_pipe);
            return NULL;
        }
    }

    /* Header fields are sane; build the model. */
    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
/**
 * Write a trie language model in the trie binary format.
 *
 * Emits, in order: the trie_hdr string (without terminator), the model
 * order (base.n), one count per order (base.n_counts[]), the trie
 * (lm_trie_write_bin) and the word strings (write_word_str).
 *
 * @param base Model to write; treated as an ngram_model_trie_t.
 * @param path Destination file; opened through fopen_comp().
 * @return 0 on success, -1 if the file cannot be opened or a write error
 *         is detected on the stream.
 */
int
ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
{
    int i;
    int rv = -1;
    int32 is_pipe;
    size_t hdr_len;
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;
    FILE *fp = fopen_comp(path, "wb", &is_pipe);

    if (!fp) {
        E_ERROR("Unable to open %s to write binary trie LM\n", path);
        return -1;
    }

    hdr_len = strlen(trie_hdr);
    if (fwrite(trie_hdr, sizeof(*trie_hdr), hdr_len, fp) != hdr_len)
        goto done;
    if (fwrite(&model->base.n, sizeof(model->base.n), 1, fp) != 1)
        goto done;
    for (i = 0; i < model->base.n; i++) {
        if (fwrite(&model->base.n_counts[i],
                   sizeof(model->base.n_counts[i]), 1, fp) != 1)
            goto done;
    }

    lm_trie_write_bin(model->trie, base->n_counts[0], fp);
    write_word_str(fp, base);

    /* The two helpers above report nothing to this function, so consult
     * the stream's error indicator for failures inside them. */
    if (!ferror(fp))
        rv = 0;

  done:
    if (rv != 0)
        E_ERROR("Failed to write binary trie LM to %s\n", path);
    fclose_comp(fp, is_pipe);
    return rv;
}
/**
 * Open a file for reading via fopen_comp(), retrying with the alternate
 * compressed/uncompressed name if the given one cannot be opened.
 *
 * Without HAVE_POPEN this is a plain fopen_comp() call.  Otherwise, when
 * the first open fails, the compression type reported by guess_comptype()
 * selects the retry:
 *   - COMP_GZIP / COMP_BZIP2 / COMP_COMPRESS: strip the 3-, 4- or
 *     2-character suffix and open the bare name;
 *   - COMP_NONE: try the name with ".gz", ".bz2" and ".Z" appended, in
 *     that order.
 * A warning is logged when an alternate name is the one actually opened.
 *
 * @param file   Path to open.
 * @param ispipe Output passed through to fopen_comp().
 * @return The opened stream, or NULL if no variant could be opened.
 */
FILE *
fopen_compchk(const char *file, int32 * ispipe)
{
#ifndef HAVE_POPEN
    *ispipe = 0; /* No popen() on WinCE */
    /* And therefore the rest of this function is useless. */
    return (fopen_comp(file, "r", ispipe));
#else /* HAVE_POPEN */
    static const char *const suffixes[] = { ".gz", ".bz2", ".Z" };
    int32 isgz;
    FILE *fh;
    char *tmpfile;
    size_t k, s;

    /* First just try to fopen_comp() it */
    if ((fh = fopen_comp(file, "r", ispipe)) != NULL)
        return fh;

    /* File doesn't exist; try other compressed/uncompressed form, as appropriate */
    guess_comptype(file, ispipe, &isgz);
    k = strlen(file);
    /* Room for the name, the longest suffix (".bz2") and the terminator. */
    tmpfile = ckd_calloc(k + 5, 1);
    strcpy(tmpfile, file);

    switch (isgz) {
    case COMP_GZIP:
        tmpfile[k - 3] = '\0';
        break;
    case COMP_BZIP2:
        tmpfile[k - 4] = '\0';
        break;
    case COMP_COMPRESS:
        tmpfile[k - 2] = '\0';
        break;
    case COMP_NONE:
        for (s = 0; s < sizeof(suffixes) / sizeof(suffixes[0]); ++s) {
            strcpy(tmpfile + k, suffixes[s]);
            if ((fh = fopen_comp(tmpfile, "r", ispipe)) != NULL) {
                E_WARN("Using %s instead of %s\n", tmpfile, file);
                ckd_free(tmpfile);
                return fh;
            }
        }
        ckd_free(tmpfile);
        return NULL;
    default:
        /* Unrecognised type: retry with the unmodified name below. */
        break;
    }

    /* Compressed name given: open the stripped name and hand the stream
     * back to the caller (NULL if that fails too). */
    fh = fopen_comp(tmpfile, "r", ispipe);
    if (fh != NULL)
        E_WARN("Using %s instead of %s\n", tmpfile, file);
    ckd_free(tmpfile);
    return fh;
#endif /* HAVE_POPEN */
}
/**
 * Parse a class definition file and enter each class into a hash table.
 *
 * Lines are split into at most two words.  Outside a class, a line
 * "LMCLASS <name>" opens a class and everything else is ignored.  Inside
 * a class, "END <name>" closes it (the name must match the one that
 * opened it); any other line is a class member "<word> [<prob>]", with
 * the probability defaulting to 1.0 when absent.
 *
 * @param classes   Hash table receiving name -> classdef_t entries.
 * @param file_name File to read; opened through fopen_comp().
 * @return 0 on success, -1 if the file cannot be opened, an END marker
 *         does not match, or the hash table entry cannot be added.
 */
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass = FALSE;     /**< Are we currently reading a list of class words? */
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    fp = fopen_comp(file_name, "r", &is_pipe);
    if (fp == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    for (;;) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (feof(fp))
            break;
        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (!inclass) {
            /* Only an LMCLASS marker matters here; other lines are junk. */
            if (n_words == 2 && strcmp(wptr[0], "LMCLASS") == 0) {
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            continue;
        }

        if (n_words == 2 && strcmp(wptr[0], "END") == 0) {
            /* End-of-class marker: it has to name the class we opened. */
            classdef_t *classdef;
            gnode_t *word, *weight;
            int32 idx;

            if (classname == NULL || strcmp(wptr[1], classname) != 0)
                goto error_out;
            inclass = FALSE;

            /* The lists were built by prepending; restore file order. */
            classwords = glist_reverse(classwords);
            classprobs = glist_reverse(classprobs);

            /* Copy the collected words and weights into a new class. */
            classdef = ckd_calloc(1, sizeof(*classdef));
            classdef->n_words = glist_count(classwords);
            classdef->words = ckd_calloc(classdef->n_words,
                                         sizeof(*classdef->words));
            classdef->weights = ckd_calloc(classdef->n_words,
                                           sizeof(*classdef->weights));
            for (idx = 0, word = classwords, weight = classprobs;
                 idx < classdef->n_words;
                 ++idx, word = gnode_next(word), weight = gnode_next(weight)) {
                classdef->words[idx] = gnode_ptr(word);
                classdef->weights[idx] = gnode_float32(weight);
            }

            /* Register the class under its name. */
            if (hash_table_enter(classes, classname, classdef) != classdef) {
                /* NOTE(review): classdef->words[] still aliases the strings
                 * held in classwords, which error_out frees as well.  If
                 * classdef_free() releases those strings this is a double
                 * free -- confirm against classdef_free(). */
                classdef_free(classdef);
                goto error_out;
            }

            /* The word strings and the name are now referenced by the
             * class and the table; drop only the list nodes. */
            glist_free(classwords);
            glist_free(classprobs);
            classwords = NULL;
            classprobs = NULL;
            classname = NULL;
        }
        else {
            /* A class member: word plus optional probability. */
            float32 fprob;

            fprob = (n_words == 2) ? atof_c(wptr[1]) : 1.0f;
            classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
            classprobs = glist_add_float32(classprobs, fprob);
        }
    }

    rv = 0;                     /* Success. */

  error_out:
    /* Release whatever is still owned locally. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
ngram_model_t * ngram_model_trie_read_dmp(cmd_ln_t * config, const char *file_name, logmath_t * lmath) { uint8 do_swap; int32 is_pipe; int32 k; uint32 j; int32 vn, ts; int32 count; uint32 counts[3]; uint32 fixed_counts[3]; uint32 *unigram_next; int i, order; char str[1024]; FILE *fp; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; E_INFO("Trying to read LM in DMP format\n"); if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { E_ERROR("Dump file %s not found\n", file_name); return NULL; } do_swap = FALSE; fread(&k, sizeof(k), 1, fp); if (k != strlen(dmp_hdr) + 1) { SWAP_INT32(&k); if (k != strlen(dmp_hdr) + 1) { E_ERROR ("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); return NULL; } do_swap = 1; } if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read header\n"); return NULL; } if (strncmp(str, dmp_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr); return NULL; } if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read LM filename in header\n"); return NULL; } /* read version#, if present (must be <= 0) */ if (fread(&vn, sizeof(vn), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ if (fread(&ts, sizeof(ts), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Failed to read word\n"); return NULL; } } /* read model->ucount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[0] = count; } else { counts[0] = vn; } /* read model->bcount, tcount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[1] = count; if 
(fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[2] = count; E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]); model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; if (counts[2] > 0) order = 3; else if (counts[1] > 0) order = 2; else order = 1; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); model->trie = lm_trie_create(counts[0], order); unigram_next = (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next)); for (j = 0; j <= (int32) counts[0]; j++) { int32 bigrams; dmp_weight_t weight; /* Skip over the mapping ID, we don't care about it. */ fread(&bigrams, sizeof(int32), 1, fp); /* Read the weights from actual unigram structure. */ fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].prob = weight.f; fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].bo = weight.f; //store pointer to dmp next to recognize wid fread(&bigrams, sizeof(int32), 1, fp); if (do_swap) SWAP_INT32(&bigrams); model->trie->unigrams[j].next = bigrams; unigram_next[j] = bigrams; } if (order > 1) { raw_ngrams = ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next, do_swap); ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order); for (i = 0; i < order; i++) { base->n_counts[i] = fixed_counts[i]; } //build reversed trie lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts, order); lm_trie_build(model->trie, raw_ngrams, counts, order); counts[1]++; //free raw ngrams ngrams_raw_free(raw_ngrams, counts, order); } ckd_free(unigram_next); /* read ascii word strings */ read_word_str(base, fp); fclose_comp(fp, is_pipe); return base; }
/**
 * Read a language model in ARPA text format into a trie model.
 *
 * Reads the n-gram counts, then the unigrams, and for orders above one
 * reads the raw higher-order n-grams, fixes their counts and builds the
 * trie from them.  The resulting model is marked writable.
 *
 * @param config Unused by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object passed to ngram_model_init().
 * @return The base model on success; NULL if the file cannot be opened
 *         or the counts or unigram sections cannot be read.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config, const char *path, logmath_t * lmath)
{
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;
    lineiter_t *li;
    FILE *fp;
    int32 is_pipe;
    int order;
    int n;

    E_INFO("Trying to read LM in arpa format\n");
    fp = fopen_comp(path, "r", &is_pipe);
    if (fp == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    li = lineiter_start_clean(fp);

    /* Read n-gram counts from file */
    if (read_counts_arpa(&li, counts, &order) == -1)
        goto fail;

    E_INFO("LM of order %d\n", order);
    for (n = 0; n < order; n++)
        E_INFO("#%d-grams: %d\n", n + 1, counts[n]);

    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0)
        goto fail;

    if (order > 1) {
        /* Higher orders go through the raw n-gram representation first. */
        raw_ngrams = ngrams_raw_read_arpa(&li, base->lmath, counts, order,
                                          base->wid);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (n = 0; n < order; n++)
            base->n_counts[n] = fixed_counts[n];

        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return base;

  fail:
    /* Shared cleanup for both failure points above. */
    ckd_free(model);
    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return NULL;
}