Ejemplo n.º 1
0
/*
 * Open FILE for reading, falling back to its alternative compressed or
 * uncompressed form when the name as given cannot be stat()ed.
 *
 * The compression type is guessed from the file-name suffix: ".gz"/".GZ"
 * everywhere, plus ".Z"/".z" on non-WIN32 builds.  If the named file does
 * not exist and the name has a compression suffix, the name without the
 * suffix is tried; if the name has no such suffix, ".gz" (and, on non-WIN32
 * builds, ".Z") is appended and tried.
 *
 * file:   name of the file to open.
 * ispipe: out-parameter; set from the suffix guess here and then passed on
 *         to fopen_comp(), which receives it by pointer.
 *
 * Returns the stream from fopen_comp(), or NULL when no candidate exists or
 * when the name is too long for the internal scratch buffer.
 */
FILE *fopen_compchk (char *file, int32 *ispipe)
{
    char tmpfile[16384];
    size_t len;
    int32 k, isgz;
    struct stat statbuf;

    len = strlen (file);
    k = (int32) len;

#if (WIN32)
    *ispipe = (k > 3) &&
        ((strcmp (file+k-3, ".gz") == 0) || (strcmp (file+k-3, ".GZ") == 0));
    isgz = *ispipe;
#else
    *ispipe = 0;
    isgz = 0;
    if ((k > 2) && ((strcmp (file+k-2, ".Z") == 0) || (strcmp (file+k-2, ".z") == 0))) {
        *ispipe = 1;
    } else {
        if ((k > 3) &&
            ((strcmp (file+k-3, ".gz") == 0) || (strcmp (file+k-3, ".GZ") == 0))) {
            *ispipe = 1;
            isgz = 1;
        }
    }
#endif

    /*
     * The scratch buffer must hold the name plus the longest suffix that may
     * be appended below (".gz", 3 chars) plus the terminating NUL.  Refuse
     * longer names instead of overflowing the stack buffer.
     */
    if (len + 4 > sizeof (tmpfile)) {
        E_WARN("File name too long (%lu bytes), not opening it\n",
               (unsigned long) len);
        return NULL;
    }

    strcpy (tmpfile, file);
    if (stat (tmpfile, &statbuf) != 0) {
        /* File doesn't exist; try other compressed/uncompressed form, as appropriate */
        E_ERROR_SYSTEM("stat(%s) failed\n", tmpfile);

        if (*ispipe) {
            /* Strip the suffix that triggered the compression guess. */
            if (isgz)
                tmpfile[k-3] = '\0';
            else
                tmpfile[k-2] = '\0';

            if (stat (tmpfile, &statbuf) != 0)
                return NULL;
        } else {
            strcpy (tmpfile+k, ".gz");
            if (stat (tmpfile, &statbuf) != 0) {
#if (! WIN32)
                strcpy (tmpfile+k, ".Z");
                if (stat (tmpfile, &statbuf) != 0)
                    return NULL;
#else
                return NULL;
#endif
            }
        }

        E_WARN("Using %s instead of %s\n", tmpfile, file);
    }

    return (fopen_comp (tmpfile, "r", ispipe));
}
Ejemplo n.º 2
0
/**
 * Read a language model stored in the trie binary format.
 *
 * The file layout consumed here is: the magic string trie_hdr, a one-byte
 * n-gram order, `order` 32-bit counts, the trie body (read by
 * lm_trie_read_bin()) and finally the word strings (read by read_word_str()).
 *
 * @param config Not referenced by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object handed to ngram_model_init().
 * @return The base model on success; NULL if the file cannot be opened, the
 *         header does not match, or the order/count fields are missing or
 *         out of range.
 */
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config,
                          const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    size_t n_read;
    char *hdr;
    int cmp_res;
    uint8 i, order;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    /* Compare the magic string; a short read counts as a mismatch. */
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    n_read = fread(hdr, sizeof(*hdr), hdr_size, fp);
    cmp_res = (n_read != hdr_size) || (strcmp(hdr, trie_hdr) != 0);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    /*
     * The order comes straight from the file and indexes counts[] below, so
     * it must be validated before use: 0 would leave counts[0] unset and
     * anything above NGRAM_MAX_ORDER would write past the array.
     */
    if (fread(&order, sizeof(order), 1, fp) != 1
        || order < 1 || order > NGRAM_MAX_ORDER) {
        E_ERROR("Missing or invalid n-gram order in %s\n", path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    for (i = 0; i < order; i++) {
        if (fread(&counts[i], sizeof(counts[i]), 1, fp) != 1) {
            E_ERROR("Cannot read n-gram counts from %s\n", path);
            fclose_comp(fp, is_pipe);
            return NULL;
        }
    }

    /* Allocate only once the fixed-size header fields are known to be sane. */
    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
Ejemplo n.º 3
0
/**
 * Write a trie language model to PATH in the trie binary format.
 *
 * Output order: the magic string trie_hdr, the model order (base.n), one
 * count per order, the trie body (lm_trie_write_bin()) and the word strings
 * (write_word_str()).
 *
 * @param base Model to write; treated as an ngram_model_trie_t.
 * @param path Destination file; opened through fopen_comp().
 * @return 0 on success, -1 if the file cannot be opened or a write fails.
 */
int
ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
{
    int i;
    int32 is_pipe;
    size_t hdr_len;
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;
    FILE *fp = fopen_comp(path, "wb", &is_pipe);
    if (!fp) {
        E_ERROR("Unable to open %s to write binary trie LM\n", path);
        return -1;
    }

    hdr_len = strlen(trie_hdr);
    if (fwrite(trie_hdr, sizeof(*trie_hdr), hdr_len, fp) != hdr_len)
        goto write_failed;
    if (fwrite(&model->base.n, sizeof(model->base.n), 1, fp) != 1)
        goto write_failed;
    for (i = 0; i < model->base.n; i++) {
        if (fwrite(&model->base.n_counts[i],
                   sizeof(model->base.n_counts[i]), 1, fp) != 1)
            goto write_failed;
    }

    lm_trie_write_bin(model->trie, base->n_counts[0], fp);
    write_word_str(fp, base);

    /* The two helpers report nothing here; use the stream's error flag. */
    if (ferror(fp))
        goto write_failed;

    fclose_comp(fp, is_pipe);
    return 0;

  write_failed:
    E_ERROR("Failed to write binary trie LM to %s\n", path);
    fclose_comp(fp, is_pipe);
    return -1;
}
Ejemplo n.º 4
0
/**
 * Open FILE for reading, trying its alternative compressed/uncompressed
 * name when the name as given cannot be opened.
 *
 * Without HAVE_POPEN the call is a plain fopen_comp().  Otherwise:
 *  - the name is first opened as given;
 *  - if that fails and guess_comptype() reports a compression suffix
 *    (gzip, bzip2 or compress), the suffix is stripped and the shorter
 *    name is opened;
 *  - if that fails and no compression suffix is reported, ".gz", ".bz2"
 *    and ".Z" are appended in turn and the first name that opens is used.
 *
 * @param file   Name of the file to open.
 * @param ispipe Out-parameter forwarded to fopen_comp().
 * @return The opened stream, or NULL if no candidate could be opened.
 */
FILE *
fopen_compchk(const char *file, int32 * ispipe)
{
#ifndef HAVE_POPEN
    *ispipe = 0; /* No popen() on WinCE */
    /* And therefore the rest of this function is useless. */
    return (fopen_comp(file, "r", ispipe));
#else /* HAVE_POPEN */
    int32 isgz;
    FILE *fh;

    /* First just try to fopen_comp() it */
    if ((fh = fopen_comp(file, "r", ispipe)) != NULL)
        return fh;
    else {
        static const char *const alt_suffix[] = { ".gz", ".bz2", ".Z" };
        char *tmpfile;
        size_t s;
        int k;

        /* File doesn't exist; try other compressed/uncompressed form, as appropriate */
        guess_comptype(file, ispipe, &isgz);
        k = (int) strlen(file);
        /* Room for the name, the longest suffix (".bz2") and the NUL. */
        tmpfile = ckd_calloc(k+5, 1);
        strcpy(tmpfile, file);
        switch (isgz) {
        case COMP_GZIP:
            tmpfile[k - 3] = '\0';
            break;
        case COMP_BZIP2:
            tmpfile[k - 4] = '\0';
            break;
        case COMP_COMPRESS:
            tmpfile[k - 2] = '\0';
            break;
        case COMP_NONE:
            for (s = 0; s < sizeof(alt_suffix) / sizeof(alt_suffix[0]); ++s) {
                strcpy(tmpfile + k, alt_suffix[s]);
                if ((fh = fopen_comp(tmpfile, "r", ispipe)) != NULL) {
                    E_WARN("Using %s instead of %s\n", tmpfile, file);
                    ckd_free(tmpfile);
                    return fh;
                }
            }
            ckd_free(tmpfile);
            return NULL;
        default:
            /* Unknown type: fall through and retry the unmodified copy. */
            break;
        }

        /*
         * Suffix stripped: open the uncompressed name and hand the stream
         * back to the caller.  Only announce the substitution when the open
         * actually succeeded.
         */
        fh = fopen_comp(tmpfile, "r", ispipe);
        if (fh != NULL)
            E_WARN("Using %s instead of %s\n", tmpfile, file);
        ckd_free(tmpfile);
        return fh;
    }
#endif /* HAVE_POPEN */
}
Ejemplo n.º 5
0
/**
 * Parse a class definition file and enter every complete class into CLASSES.
 *
 * A class starts at a two-word line "LMCLASS <name>" and ends at a two-word
 * line "END <name>" whose name matches.  Each line in between contributes a
 * word and, when a second token is present, its weight (1.0 otherwise).
 * Lines outside a class that are not an LMCLASS marker are ignored, as are
 * lines for which str2words() returns a non-positive count.
 *
 * @param classes   Hash table receiving one classdef_t per class, keyed by
 *                  class name.
 * @param file_name File to read; opened through fopen_comp().
 * @return 0 on success; -1 if the file cannot be opened, an END marker does
 *         not match the open class, or the hash table entry fails.
 */
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    char buf[512];
    char *tok[2];
    int ntok;
    int in_class = FALSE;       /* Currently collecting words of a class? */
    int32 status = -1;
    int32 is_pipe;
    FILE *fp;
    gnode_t *node;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    fp = fopen_comp(file_name, "r", &is_pipe);
    if (fp == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    while (!feof(fp) && fgets(buf, sizeof(buf), fp) != NULL) {
        classdef_t *def;
        gnode_t *w, *p;
        int32 idx;

        ntok = str2words(buf, tok, 2);
        if (ntok <= 0)
            continue;

        if (!in_class) {
            /* Outside a class only the LMCLASS marker matters. */
            if (ntok == 2 && strcmp(tok[0], "LMCLASS") == 0) {
                in_class = TRUE;
                classname = ckd_salloc(tok[1]);
            }
            continue;
        }

        if (ntok != 2 || strcmp(tok[0], "END") != 0) {
            /* Ordinary member line: word plus optional weight. */
            float32 prob = (ntok == 2) ? atof_c(tok[1]) : 1.0f;

            classwords = glist_add_ptr(classwords, ckd_salloc(tok[0]));
            classprobs = glist_add_float32(classprobs, prob);
            continue;
        }

        /* END marker: it must close the class that is currently open. */
        if (classname == NULL || strcmp(tok[1], classname) != 0)
            goto cleanup;
        in_class = FALSE;

        /* The lists are reversed before being copied into the arrays. */
        def = ckd_calloc(1, sizeof(*def));
        classwords = glist_reverse(classwords);
        classprobs = glist_reverse(classprobs);
        def->n_words = glist_count(classwords);
        def->words = ckd_calloc(def->n_words, sizeof(*def->words));
        def->weights = ckd_calloc(def->n_words, sizeof(*def->weights));
        for (idx = 0, w = classwords, p = classprobs;
             idx < def->n_words;
             ++idx, w = gnode_next(w), p = gnode_next(p)) {
            def->words[idx] = gnode_ptr(w);
            def->weights[idx] = gnode_float32(p);
        }

        if (hash_table_enter(classes, classname, def) != def) {
            classdef_free(def);
            goto cleanup;
        }

        /*
         * The word strings now live in def and classname was passed to the
         * hash table as the key, so only the list nodes are released and
         * the local pointers are cleared rather than freed.
         */
        glist_free(classwords);
        glist_free(classprobs);
        classwords = NULL;
        classprobs = NULL;
        classname = NULL;
    }
    status = 0;                 /* Reached end of input without error. */

  cleanup:
    /* Release whatever is still held locally. */
    fclose_comp(fp, is_pipe);
    for (node = classwords; node; node = gnode_next(node))
        ckd_free(gnode_ptr(node));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return status;
}
Ejemplo n.º 6
0
ngram_model_t *
ngram_model_trie_read_dmp(cmd_ln_t * config,
                          const char *file_name, logmath_t * lmath)
{
    uint8 do_swap;
    int32 is_pipe;
    int32 k;
    uint32 j;
    int32 vn, ts;
    int32 count;
    uint32 counts[3];
    uint32 fixed_counts[3];
    uint32 *unigram_next;
    int i, order;
    char str[1024];
    FILE *fp;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;

    E_INFO("Trying to read LM in DMP format\n");
    if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
        E_ERROR("Dump file %s not found\n", file_name);
        return NULL;
    }

    do_swap = FALSE;
    fread(&k, sizeof(k), 1, fp);
    if (k != strlen(dmp_hdr) + 1) {
        SWAP_INT32(&k);
        if (k != strlen(dmp_hdr) + 1) {
            E_ERROR
                ("Wrong magic header size number %x: %s is not a dump file\n",
                 k, file_name);
            return NULL;
        }
        do_swap = 1;
    }
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read header\n");
        return NULL;
    }
    if (strncmp(str, dmp_hdr, k) != 0) {
        E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
        return NULL;
    }

    if (fread(&k, sizeof(k), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&k);
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read LM filename in header\n");
        return NULL;
    }

    /* read version#, if present (must be <= 0) */
    if (fread(&vn, sizeof(vn), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&vn);
    if (vn <= 0) {
        /* read and don't compare timestamps (we don't care) */
        if (fread(&ts, sizeof(ts), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&ts);

        /* read and skip format description */
        for (;;) {
            if (fread(&k, sizeof(k), 1, fp) != 1)
                return NULL;
            if (do_swap)
                SWAP_INT32(&k);
            if (k == 0)
                break;
            if (fread(str, 1, k, fp) != (size_t) k) {
                E_ERROR("Failed to read word\n");
                return NULL;
            }
        }
        /* read model->ucount */
        if (fread(&count, sizeof(count), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&count);
        counts[0] = count;
    }
    else {
        counts[0] = vn;
    }
    /* read model->bcount, tcount */
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[1] = count;
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[2] = count;
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    if (counts[2] > 0)
        order = 3;
    else if (counts[1] > 0)
        order = 2;
    else
        order = 1;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);

    model->trie = lm_trie_create(counts[0], order);

    unigram_next =
        (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
    for (j = 0; j <= (int32) counts[0]; j++) {
        int32 bigrams;
        dmp_weight_t weight;
        /* Skip over the mapping ID, we don't care about it. */
        fread(&bigrams, sizeof(int32), 1, fp);
        /* Read the weights from actual unigram structure. */
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].prob = weight.f;
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].bo = weight.f;
        //store pointer to dmp next to recognize wid
        fread(&bigrams, sizeof(int32), 1, fp);
        if (do_swap)
            SWAP_INT32(&bigrams);
        model->trie->unigrams[j].next = bigrams;
        unigram_next[j] = bigrams;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
                                do_swap);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }

        //build reversed trie
        lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts,
                            order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        counts[1]++;

        //free raw ngrams
        ngrams_raw_free(raw_ngrams, counts, order);
    }
    ckd_free(unigram_next);

    /* read ascii word strings */
    read_word_str(base, fp);

    fclose_comp(fp, is_pipe);
    return base;
}
Ejemplo n.º 7
0
/**
 * Read a language model in ARPA text format into a trie model.
 *
 * The n-gram counts are read first and determine the model order, then the
 * unigrams are read straight into the trie.  For orders above one the
 * remaining n-grams are read in raw form, the counts are corrected, and the
 * trie is allocated and built from them.  The returned model is marked
 * writable.
 *
 * @param config Not referenced by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object handed to ngram_model_init().
 * @return The base model on success; NULL if the file cannot be opened or
 *         the counts or unigram section cannot be read.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    uint32 raw_counts[NGRAM_MAX_ORDER];
    uint32 adjusted[NGRAM_MAX_ORDER];
    ngram_model_trie_t *trie_model;
    ngram_model_t *result;
    ngram_raw_t **ngrams;
    lineiter_t *line_it;
    int32 piped;
    FILE *in;
    int n;
    int lvl;

    E_INFO("Trying to read LM in arpa format\n");
    in = fopen_comp(path, "r", &piped);
    if (in == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    trie_model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*trie_model));
    line_it = lineiter_start_clean(in);

    /* The counts section fixes the order of the model. */
    if (read_counts_arpa(&line_it, raw_counts, &n) == -1)
        goto bail;

    E_INFO("LM of order %d\n", n);
    lvl = 0;
    while (lvl < n) {
        E_INFO("#%d-grams: %d\n", lvl + 1, raw_counts[lvl]);
        ++lvl;
    }

    result = &trie_model->base;
    ngram_model_init(result, &ngram_model_trie_funcs, lmath, n,
                     (int32) raw_counts[0]);
    result->writable = TRUE;

    trie_model->trie = lm_trie_create(raw_counts[0], n);
    if (read_1grams_arpa(&line_it, raw_counts[0], result,
                         trie_model->trie->unigrams) < 0)
        goto bail;

    if (n > 1) {
        /* Higher orders go through the raw representation first. */
        ngrams = ngrams_raw_read_arpa(&line_it, result->lmath, raw_counts,
                                      n, result->wid);
        ngrams_raw_fix_counts(ngrams, raw_counts, adjusted, n);
        for (lvl = 0; lvl < n; ++lvl)
            result->n_counts[lvl] = adjusted[lvl];
        lm_trie_alloc_ngram(trie_model->trie, adjusted, n);
        lm_trie_build(trie_model->trie, ngrams, raw_counts, n);
        ngrams_raw_free(ngrams, raw_counts, n);
    }

    lineiter_free(line_it);
    fclose_comp(in, piped);
    return result;

  bail:
    /* Shared failure exit: same three releases as each original error path. */
    ckd_free(trie_model);
    lineiter_free(line_it);
    fclose_comp(in, piped);
    return NULL;
}