static void
build_widmap(ngram_model_t * base, logmath_t * lmath, int32 n)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    ngram_model_t **models = set->lms;
    hash_table_t *vocab;
    glist_t hlist;
    gnode_t *gn;
    int32 i;

    /* Construct a merged vocabulary and a set of word-ID mappings. */
    vocab = hash_table_new(models[0]->n_words, FALSE);
    /* Create the set of merged words. */
    for (i = 0; i < set->n_models; ++i) {
        int32 j;
        for (j = 0; j < models[i]->n_words; ++j) {
            /* Ignore collisions. */
            (void) hash_table_enter_int32(vocab, models[i]->word_str[j],
                                          j);
        }
    }
    /* Create the array of words, then sort it. */
    if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
        (void) hash_table_enter_int32(vocab, "<UNK>", 0);
    /* Now we know the number of unigrams, initialize the base model. */
    ngram_model_init(base, &ngram_model_set_funcs, lmath, n,
                     hash_table_inuse(vocab));
    base->writable = FALSE;     /* We will reuse the pointers from the submodels. */
    i = 0;
    hlist = hash_table_tolist(vocab, NULL);
    for (gn = hlist; gn; gn = gnode_next(gn)) {
        hash_entry_t *ent = gnode_ptr(gn);
        base->word_str[i++] = (char *) ent->key;
    }
    glist_free(hlist);
    qsort(base->word_str, base->n_words, sizeof(*base->word_str),
          my_compare);

    /* Now create the word ID mappings. */
    if (set->widmap)
        ckd_free_2d((void **) set->widmap);
    set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
                                           sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i) {
        int32 j;
        /* Also create the master wid mapping. */
        (void) hash_table_enter_int32(base->wid, base->word_str[i], i);
        /* printf("%s: %d => ", base->word_str[i], i); */
        for (j = 0; j < set->n_models; ++j) {
            set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
            /* printf("%d ", set->widmap[i][j]); */
        }
        /* printf("\n"); */
    }
    hash_table_free(vocab);
}
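/*
 * Usage sketch (not part of the original source): build_widmap() is a
 * static helper that runs when a model set is created or its members
 * change, so the merged vocabulary is reached through the public set
 * API.  A minimal example, assuming the SphinxBase headers and two LM
 * files at hypothetical paths:
 */
#include <stdio.h>
#include <sphinxbase/logmath.h>
#include <sphinxbase/ngram_model.h>

static void
widmap_usage_example(void)
{
    logmath_t *lmath = logmath_init(1.0001, 0, 0);
    ngram_model_t *lms[2];
    char *names[] = { "general", "domain" };
    ngram_model_t *set;

    lms[0] = ngram_model_read(NULL, "general.lm", NGRAM_AUTO, lmath);
    lms[1] = ngram_model_read(NULL, "domain.lm", NGRAM_AUTO, lmath);
    /* ngram_model_set_init() calls build_widmap() to merge the vocabularies. */
    set = ngram_model_set_init(NULL, lms, names, NULL, 2);
    /* Word IDs in the set index the merged, sorted unigram array. */
    printf("wid(hello) = %d\n", ngram_wid(set, "hello"));
    ngram_model_free(set);
    logmath_free(lmath);
}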
void
ngram_model_set_map_words(ngram_model_t * base,
                          const char **words, int32 n_words)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 i;

    /* Recreate the word mapping. */
    if (base->writable) {
        for (i = 0; i < base->n_words; ++i) {
            ckd_free(base->word_str[i]);
        }
    }
    ckd_free(base->word_str);
    ckd_free_2d((void **) set->widmap);
    base->writable = TRUE;
    base->n_words = base->n_1g_alloc = n_words;
    base->word_str = ckd_calloc(n_words, sizeof(*base->word_str));
    set->widmap =
        (int32 **) ckd_calloc_2d(n_words, set->n_models,
                                 sizeof(**set->widmap));
    hash_table_empty(base->wid);
    for (i = 0; i < n_words; ++i) {
        int32 j;
        base->word_str[i] = ckd_salloc(words[i]);
        (void) hash_table_enter_int32(base->wid, base->word_str[i], i);
        for (j = 0; j < set->n_models; ++j) {
            set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]);
        }
    }
}
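/*
 * Usage sketch (not part of the original source): remap the set's
 * vocabulary to a caller-supplied word list, e.g. the words of a decoder
 * dictionary.  The word list below is purely illustrative.
 */
#include <sphinxbase/ngram_model.h>

static void
map_words_example(ngram_model_t *set)
{
    const char *words[] = { "<s>", "</s>", "hello", "world" };

    ngram_model_set_map_words(set, words, 4);
    /* ngram_wid() on the set now answers against the new vocabulary;
     * words unknown to a submodel map to that submodel's unknown-word ID. */
}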
int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                break;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
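/*
 * Usage sketch (not part of the original source): fold a model's
 * vocabulary to lower case so lookups match a lower-case dictionary.
 * NGRAM_LOWER/NGRAM_UPPER come from ngram_model.h; the LM file name is
 * hypothetical.
 */
#include <sphinxbase/err.h>
#include <sphinxbase/ngram_model.h>

static void
casefold_example(logmath_t *lmath)
{
    ngram_model_t *model;

    model = ngram_model_read(NULL, "mixed_case.lm", NGRAM_AUTO, lmath);
    if (model == NULL)
        return;
    if (ngram_model_casefold(model, NGRAM_LOWER) < 0)
        E_ERROR("Case folding failed\n");
    ngram_model_free(model);
}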
/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{

    /* Check for hash collisions. */
    int32 wid;
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return wid;
    }

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) *
                                      model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32
        (model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             model->word_str[model->n_words], (void *) (long) (wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}
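/*
 * Usage sketch (not part of the original source): applications normally
 * reach this helper through the public ngram_model_add_word(), which also
 * assigns the new unigram a probability.  The word below is illustrative;
 * a weight of 1.0 gives it an average unigram probability.
 */
#include <sphinxbase/err.h>
#include <sphinxbase/ngram_model.h>

static void
ngram_add_word_example(ngram_model_t *model)
{
    int32 wid;

    wid = ngram_model_add_word(model, "scooby", 1.0f);
    if (wid == NGRAM_INVALID_WID)
        E_ERROR("Failed to add word\n");
}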
fsg_model_t *
fsg_model_read(FILE * fp, logmath_t * lmath, float32 lw)
{
    fsg_model_t *fsg;
    hash_table_t *vocab;
    hash_iter_t *itor;
    int32 lastwid;
    char **wordptr;
    char *lineptr;
    char *fsgname;
    int32 lineno;
    int32 n, i, j;
    int n_state, n_trans, n_null_trans;
    glist_t nulls;
    float32 p;

    lineno = 0;
    vocab = hash_table_new(32, FALSE);
    wordptr = NULL;
    lineptr = NULL;
    nulls = NULL;
    fsgname = NULL;
    fsg = NULL;

    /* Scan up to the FSG_BEGIN header */
    for (;;) {
        n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
        if (n < 0) {
            E_ERROR("%s declaration missing\n", FSG_MODEL_BEGIN_DECL);
            goto parse_error;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_BEGIN_DECL) == 0)) {
            if (n > 2) {
                E_ERROR("Line[%d]: malformed FSG_BEGIN declaration\n",
                        lineno);
                goto parse_error;
            }
            break;
        }
    }
    /* Save the FSG name now, or it will get clobbered below.
     * If the name is missing, fall back to a default.
     */
    if (n == 2) {
        fsgname = ckd_salloc(wordptr[1]);
    }
    else {
        E_WARN("FSG name is missing\n");
        fsgname = ckd_salloc("unknown");
    }

    /* Read #states */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_N_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_NUM_STATES_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &n_state) != 1)
        || (n_state <= 0)) {
        E_ERROR
            ("Line[%d]: #states declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Now create the FSG. */
    fsg = fsg_model_init(fsgname, lmath, lw, n_state);
    ckd_free(fsgname);
    fsgname = NULL;

    /* Read start state */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_S_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_START_STATE_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &(fsg->start_state)) != 1)
        || (fsg->start_state < 0)
        || (fsg->start_state >= fsg->n_state)) {
        E_ERROR
            ("Line[%d]: start state declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Read final state */
    n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
    if ((n != 2)
        || ((strcmp(wordptr[0], FSG_MODEL_F_DECL) != 0)
            && (strcmp(wordptr[0], FSG_MODEL_FINAL_STATE_DECL) != 0))
        || (sscanf(wordptr[1], "%d", &(fsg->final_state)) != 1)
        || (fsg->final_state < 0)
        || (fsg->final_state >= fsg->n_state)) {
        E_ERROR
            ("Line[%d]: final state declaration line missing or malformed\n",
             lineno);
        goto parse_error;
    }

    /* Read transitions */
    lastwid = 0;
    n_trans = n_null_trans = 0;
    for (;;) {
        int32 wid, tprob;

        n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
        if (n <= 0) {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    lineno);
            goto parse_error;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_END_DECL) == 0)) {
            break;
        }

        if ((strcmp(wordptr[0], FSG_MODEL_T_DECL) == 0)
            || (strcmp(wordptr[0], FSG_MODEL_TRANSITION_DECL) == 0)) {


            if (((n != 4) && (n != 5))
                || (sscanf(wordptr[1], "%d", &i) != 1)
                || (sscanf(wordptr[2], "%d", &j) != 1)
                || (i < 0) || (i >= fsg->n_state)
                || (j < 0) || (j >= fsg->n_state)) {
                E_ERROR
                    ("Line[%d]: transition spec malformed; Expecting: from-state to-state trans-prob [word]\n",
                     lineno);
                goto parse_error;
            }

            p = atof_c(wordptr[3]);
            if ((p <= 0.0) || (p > 1.0)) {
                E_ERROR
                    ("Line[%d]: transition spec malformed; Expecting float as transition probability\n",
                     lineno);
                goto parse_error;
            }
        }
        else {
            E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
                    lineno);
            goto parse_error;
        }

        tprob = (int32) (logmath_log(lmath, p) * fsg->lw);
        /* Add word to "dictionary". */
        if (n > 4) {
            if (hash_table_lookup_int32(vocab, wordptr[4], &wid) < 0) {
                (void) hash_table_enter_int32(vocab,
                                              ckd_salloc(wordptr[4]),
                                              lastwid);
                wid = lastwid;
                ++lastwid;
            }
            fsg_model_trans_add(fsg, i, j, tprob, wid);
            ++n_trans;
        }
        else {
            if (fsg_model_null_trans_add(fsg, i, j, tprob) == 1) {
                ++n_null_trans;
                nulls =
                    glist_add_ptr(nulls, fsg_model_null_trans(fsg, i, j));
            }
        }
    }

    E_INFO("FSG: %d states, %d unique words, %d transitions (%d null)\n",
           fsg->n_state, hash_table_inuse(vocab), n_trans, n_null_trans);


    /* Now create a string table from the "dictionary" */
    fsg->n_word = hash_table_inuse(vocab);
    fsg->n_word_alloc = fsg->n_word + 10;       /* Pad it a bit. */
    fsg->vocab = ckd_calloc(fsg->n_word_alloc, sizeof(*fsg->vocab));
    for (itor = hash_table_iter(vocab); itor;
         itor = hash_table_iter_next(itor)) {
        char const *word = hash_entry_key(itor->ent);
        int32 wid = (int32) (long) hash_entry_val(itor->ent);
        fsg->vocab[wid] = (char *) word;
    }
    hash_table_free(vocab);

    /* Do transitive closure on null transitions */
    nulls = fsg_model_null_trans_closure(fsg, nulls);
    glist_free(nulls);

    ckd_free(lineptr);
    ckd_free(wordptr);

    return fsg;

  parse_error:
    for (itor = hash_table_iter(vocab); itor;
         itor = hash_table_iter_next(itor))
        ckd_free((char *) hash_entry_key(itor->ent));
    glist_free(nulls);
    hash_table_free(vocab);
    ckd_free(fsgname);
    ckd_free(lineptr);
    ckd_free(wordptr);
    fsg_model_free(fsg);
    return NULL;
}
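/*
 * Usage sketch (not part of the original source): a tiny grammar in the
 * text format this parser accepts, followed by the call that loads it.
 * The file name and words are illustrative.
 *
 *     FSG_BEGIN greeting
 *     NUM_STATES 3
 *     START_STATE 0
 *     FINAL_STATE 2
 *     TRANSITION 0 1 1.0 hello
 *     TRANSITION 1 2 0.5 world
 *     TRANSITION 1 2 0.5
 *     FSG_END
 *
 * The last transition has no word, so it becomes a null transition.
 */
#include <stdio.h>
#include <sphinxbase/logmath.h>
#include <sphinxbase/fsg_model.h>

static fsg_model_t *
fsg_read_example(logmath_t *lmath)
{
    FILE *fp;
    fsg_model_t *fsg;

    if ((fp = fopen("greeting.fsg", "r")) == NULL)
        return NULL;
    fsg = fsg_model_read(fp, lmath, 1.0f /* language weight */);
    fclose(fp);
    return fsg;
}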
s3wid_t
dict_add_word(dict_t * d, char const *word, s3cipid_t const * p, int32 np)
{
    int32 len;
    dictword_t *wordp;
    s3wid_t newwid;
    char *wword;

    if (d->n_word >= d->max_words) {
        E_INFO("Reallocating to %d KiB for word entries\n",
               (d->max_words + S3DICT_INC_SZ) * sizeof(dictword_t) / 1024);
        d->word =
            (dictword_t *) ckd_realloc(d->word,
                                       (d->max_words +
                                        S3DICT_INC_SZ) * sizeof(dictword_t));
        d->max_words = d->max_words + S3DICT_INC_SZ;
    }

    wordp = d->word + d->n_word;
    wordp->word = (char *) ckd_salloc(word);    /* Freed in dict_free */

    /* Associate word string with d->n_word in hash table */
    if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) != d->n_word) {
        ckd_free(wordp->word);
        wordp->word = NULL;
        return BAD_S3WID;
    }

    /* Fill in word entry, and set defaults */
    if (p && (np > 0)) {
        wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t));      /* Freed in dict_free */
        memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t));
        wordp->pronlen = np;
    }
    else {
        wordp->ciphone = NULL;
        wordp->pronlen = 0;
    }
    wordp->alt = BAD_S3WID;
    wordp->basewid = d->n_word;

    /* Determine base/alt wids */
    wword = ckd_salloc(word);
    if ((len = dict_word2basestr(wword)) > 0) {
	int32 w;

        /* Truncated to a baseword string; find its ID */
        if (hash_table_lookup_int32(d->ht, wword, &w) < 0) {
            E_ERROR("Missing base word for: %s\n", word);
            ckd_free(wword);
            ckd_free(wordp->word);
            wordp->word = NULL;
            return BAD_S3WID;
        }

        /* Link into alt list */
        wordp->basewid = w;
        wordp->alt = d->word[w].alt;
        d->word[w].alt = d->n_word;
    }
    ckd_free(wword);

    newwid = d->n_word++;

    return newwid;
}
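/*
 * Usage sketch (not part of the original source): add a pronunciation at
 * run time.  The word and phone set are illustrative, and the ciphone
 * lookup through bin_mdef_ciphone_id() is an assumption about how the IDs
 * are obtained from the model definition the dictionary was built against.
 */
#include "dict.h"
#include "bin_mdef.h"

static s3wid_t
dict_add_word_example(dict_t *d, bin_mdef_t *mdef)
{
    s3cipid_t phones[3];

    phones[0] = bin_mdef_ciphone_id(mdef, "K");
    phones[1] = bin_mdef_ciphone_id(mdef, "AE");
    phones[2] = bin_mdef_ciphone_id(mdef, "T");
    return dict_add_word(d, "cat", phones, 3);
}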
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) won't accept a NULL output buffer, unlike wcstombs(3).
     * So we have to either call it over and over again until our buffer
     * is big enough, or call it with a huge buffer and then copy things
     * back to the output.  We use a mix of these two approaches here:
     * keep a single big buffer around, and expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Reset the input pointers. */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Restart the conversion of this word with the larger buffer. */
            goto start_conversion;
        }

        /* Number of bytes actually written to outbuf. */
        result = maxlen - outleft;
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
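/*
 * Usage sketch (not part of the original source): recode a legacy
 * Chinese model's vocabulary to UTF-8.  Encoding names are whatever the
 * local iconv(3) accepts; "gb2312" and "utf-8" are typical spellings.
 */
#include <sphinxbase/err.h>
#include <sphinxbase/ngram_model.h>

static int
recode_example(ngram_model_t *model)
{
    if (ngram_model_recode(model, "gb2312", "utf-8") < 0) {
        E_ERROR("Recoding to UTF-8 failed\n");
        return -1;
    }
    return 0;
}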