Exemple #1
0
/*
 * Score word `wid` against the history `hist`, falling back on the
 * weights stored in trie->backoff when a longer n-gram is missing.
 *
 * The unigram probability of `wid` is the starting score. Each of the
 * first n_hist - 1 history words is then looked up in the matching middle
 * level; a hit replaces the score with that level's probability and
 * increments *n_used. On the first miss, trie->backoff[level..n_hist-1]
 * are added to the current score and the result is returned. If every
 * middle lookup hits, the last history word is looked up in trie->longest.
 *
 * trie    language-model trie; trie->backoff is read, not written, here
 * wid     word to score
 * hist    history word ids; hist[0..n_hist-1] are read
 * n_hist  number of history words (0 means plain unigram score)
 * n_used  out: 1 plus the number of successful higher-order lookups
 *
 * Returns the selected probability plus any accumulated backoff.
 *
 * NOTE(review): the final longest_find() presumably requires n_hist to be
 * max_order - 1 -- confirm against callers.
 */
static float lm_trie_hist_score(lm_trie_t *trie, int32 wid, int32 *hist, int32 n_hist, int32 *n_used)
{
    node_range_t range;
    bitarr_address_t addr;
    float score;
    int level, k;

    *n_used = 1;
    score = unigram_find(trie->unigrams, wid, &range)->prob;
    if (n_hist == 0)
        return score;

    for (level = 0; level < n_hist - 1; level++) {
        addr = middle_find(&trie->middle_begin[level], hist[level], &range);
        if (addr.base != NULL) {
            /* Hit: the longer n-gram's probability replaces the shorter one. */
            (*n_used)++;
            score = lm_trie_quant_mpread(trie->quant, addr, level);
            continue;
        }

        /* Miss: charge the stored backoff of this and every longer context. */
        for (k = level; k < n_hist; k++)
            score += trie->backoff[k];
        return score;
    }

    addr = longest_find(trie->longest, hist[n_hist - 1], &range);
    if (addr.base == NULL)
        return score + trie->backoff[n_hist - 1];

    (*n_used)++;
    return lm_trie_quant_lpread(trie->quant, addr);
}
Exemple #2
0
/*
 * Recompute the backoff weights stored in trie->backoff for `hist` and
 * record the history in trie->prev_hist.
 *
 * trie->backoff is zeroed first. Slot 0 receives the unigram backoff of
 * hist[0]; slot d (d >= 1) receives the backoff read from middle level
 * d - 1 for hist[d]. The walk stops at the first missing entry, leaving
 * the remaining slots at zero.
 *
 * trie    language-model trie; backoff and prev_hist are overwritten
 * hist    history word ids; hist[0] is read unconditionally
 * n_hist  number of entries of hist to walk and to copy into prev_hist
 *
 * NOTE(review): presumably callers guarantee n_hist >= 1 and that
 * trie->prev_hist can hold n_hist entries -- confirm.
 */
static void update_backoff(lm_trie_t *trie, int32 *hist, int32 n_hist)
{
    node_range_t range;
    bitarr_address_t addr;
    int depth;

    memset(trie->backoff, 0, sizeof(trie->backoff));
    trie->backoff[0] = unigram_find(trie->unigrams, hist[0], &range)->bo;

    depth = 1;
    while (depth < n_hist) {
        addr = middle_find(&trie->middle_begin[depth - 1], hist[depth], &range);
        if (addr.base == NULL)
            break;              /* no longer context: later slots stay 0 */
        trie->backoff[depth] = lm_trie_quant_mboread(trie->quant, addr, depth - 1);
        depth++;
    }

    memcpy(trie->prev_hist, hist, n_hist * sizeof(*hist));
}
Exemple #3
0
/*
 * Sum the backoff weights found for `hist`, starting at order `start`.
 *
 * The unigram entry for hist[0] is always looked up, which also seeds the
 * node range used by the middle lookups. Its backoff is included only
 * when start <= 1, in which case start is treated as 2 from then on.
 * Middle-level backoffs are then added for hist[start - 1], hist[start],
 * ... until the history is exhausted or a lookup misses.
 *
 * trie    language-model trie
 * start   first order whose backoff should be counted
 * hist    history word ids
 * n_hist  number of entries in hist
 *
 * Returns the accumulated backoff (0.0f if nothing was added).
 */
static float get_available_backoff(lm_trie_t *trie, int32 start, int32 *hist, int32 n_hist)
{
    node_range_t range;
    bitarr_address_t addr;
    unigram_t *head;
    float total = 0.0f;
    int level;

    head = unigram_find(trie->unigrams, hist[0], &range);
    if (start <= 1) {
        total += head->bo;
        start = 2;
    }

    /* Middle level `level` is queried with history word hist[level + 1]. */
    for (level = start - 2; level + 1 < n_hist; level++) {
        addr = middle_find(&trie->middle_begin[level], hist[level + 1], &range);
        if (addr.base == NULL)
            break;
        total += lm_trie_quant_mboread(trie->quant, addr, level);
    }

    return total;
}
Exemple #4
0
/*
 * Find the probability of the longest n-gram ending in `wid` that the
 * trie contains for the given history.
 *
 * Starts with the unigram probability, then extends the match one history
 * word at a time through the middle levels. It stops and returns the best
 * probability so far when the history runs out, when the current node has
 * an empty child range (begin == end), or when a lookup misses. Once
 * max_order - 2 middle levels have matched, one final lookup is made in
 * trie->longest.
 *
 * trie       language-model trie
 * wid        word to score
 * hist       history word ids
 * max_order  highest n-gram order stored in the trie
 * n_hist     number of entries in hist
 * n_used     out: order of the n-gram whose probability is returned
 *
 * Returns the probability of the longest match found.
 */
static float get_available_prob(lm_trie_t *trie, int32 wid, int32 *hist, int max_order, int32 n_hist, int32 *n_used)
{
    node_range_t range;
    bitarr_address_t addr;
    float score;
    int depth;          /* middle level index == position in hist */
    int dead_end;       /* current node has no children to extend into */

    *n_used = 1;
    score = unigram_find(trie->unigrams, wid, &range)->prob;
    if (n_hist == 0)
        return score;

    /* Try to extend the match through the middle levels. */
    dead_end = (range.begin == range.end);
    for (depth = 0; ; depth++) {
        if (depth == n_hist || dead_end)
            return score;
        if (depth == max_order - 2)
            break;      /* next level up is the longest-order table */

        addr = middle_find(&trie->middle_begin[depth], hist[depth], &range);
        if (addr.base == NULL)
            return score;       /* no entry at this order */

        dead_end = (range.begin == range.end);
        score = lm_trie_quant_mpread(trie->quant, addr, depth);
        *n_used = depth + 2;
    }

    addr = longest_find(trie->longest, hist[depth], &range);
    if (addr.base != NULL) {
        score = lm_trie_quant_lpread(trie->quant, addr);
        *n_used = max_order;
    }
    return score;
}
Exemple #5
0
/*
 * Recursively walk the trie and emit every n-gram of length `order` into
 * raw_ngrams.
 *
 * hist[0..n_hist-1] holds the words on the path walked so far and `range`
 * is the span of child entries of the current node. Three cases:
 *   - n_hist == 0: iterate over all counts[0] unigrams and recurse.
 *   - n_hist < order - 1: decode each child in the middle level
 *     n_hist - 1 (its word and the child range bounded by the next
 *     entry's pointer) and recurse one level deeper.
 *   - n_hist == order - 1: decode each child as a finished n-gram and
 *     store it at raw_ngrams[*raw_ngram_idx], then advance the index.
 *
 * Each emitted n-gram gets a freshly ckd_calloc'ed `weights` array
 * (1 entry when order == max_order, otherwise 2: probability then
 * backoff, both passed through logmath_log_float_to_log10) and a `words`
 * array of `order` entries filled in the reverse of hist order. Nothing
 * allocated here is freed here.
 *
 * trie           language-model trie to read
 * lmath          log-math object used for the log10 conversion
 * raw_ngrams     output array
 * raw_ngram_idx  in/out: next free slot in raw_ngrams
 * counts         counts[0] is the number of unigrams to enumerate
 * range          child entries of the current node (ignored at the root)
 * hist           scratch path buffer; hist[n_hist] is written
 * n_hist         current depth
 * order          n-gram order to extract
 * max_order      highest order stored in the trie
 *
 * NOTE(review): raw_ngrams is presumably pre-sized by the caller to hold
 * all n-grams of this order -- confirm.
 */
static void
fill_raw_ngram(lm_trie_t * trie, logmath_t * lmath,
               ngram_raw_t * raw_ngrams, uint32 * raw_ngram_idx,
               uint32 * counts, node_range_t range, uint32 * hist,
               int n_hist, int order, int max_order)
{
    const int is_longest = (order == max_order);
    bitarr_address_t cursor;
    node_range_t children;
    uint32 entry;
    float prob, backoff;
    int k;

    /* Below the root, an empty child range means there is nothing to visit. */
    if (n_hist > 0 && range.begin == range.end)
        return;

    /* Root: start one walk from every unigram. */
    if (n_hist == 0) {
        for (entry = 0; entry < counts[0]; entry++) {
            unigram_find(trie->unigrams, entry, &children);
            hist[0] = entry;
            fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
                           children, hist, 1, order, max_order);
        }
        return;
    }

    /* Interior level: decode each child and descend. */
    if (n_hist < order - 1) {
        middle_t *mid = &trie->middle_begin[n_hist - 1];

        cursor.base = mid->base.base;
        for (entry = range.begin; entry < range.end; entry++) {
            cursor.offset = entry * mid->base.total_bits;
            hist[n_hist] =
                bitarr_read_int25(cursor, mid->base.word_bits,
                                  mid->base.word_mask);

            /* This entry's next-pointer is where its children begin... */
            cursor.offset += mid->base.word_bits + mid->quant_bits;
            children.begin =
                bitarr_read_int25(cursor, mid->next_mask.bits,
                                  mid->next_mask.mask);

            /* ...and the following entry's next-pointer is where they end. */
            cursor.offset =
                (entry + 1) * mid->base.total_bits +
                mid->base.word_bits + mid->quant_bits;
            children.end =
                bitarr_read_int25(cursor, mid->next_mask.bits,
                                  mid->next_mask.mask);

            fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
                           children, hist, n_hist + 1, order, max_order);
        }
        return;
    }

    /* Target level: every child completes an n-gram of the wanted order. */
    assert(n_hist == order - 1);
    for (entry = range.begin; entry < range.end; entry++) {
        ngram_raw_t *out = &raw_ngrams[*raw_ngram_idx];

        out->weights =
            (float *) ckd_calloc(is_longest ? 1 : 2,
                                 sizeof(*out->weights));
        if (is_longest) {
            /* Highest order: the entry carries a probability only. */
            longest_t *longest = trie->longest;

            cursor.base = longest->base.base;
            cursor.offset = entry * longest->base.total_bits;
            hist[n_hist] =
                bitarr_read_int25(cursor, longest->base.word_bits,
                                  longest->base.word_mask);
            cursor.offset += longest->base.word_bits;
            prob = lm_trie_quant_lpread(trie->quant, cursor);
        }
        else {
            /* Lower order: the entry carries a probability and a backoff. */
            middle_t *mid = &trie->middle_begin[n_hist - 1];

            cursor.base = mid->base.base;
            cursor.offset = entry * mid->base.total_bits;
            hist[n_hist] =
                bitarr_read_int25(cursor, mid->base.word_bits,
                                  mid->base.word_mask);
            cursor.offset += mid->base.word_bits;
            prob = lm_trie_quant_mpread(trie->quant, cursor, n_hist - 1);
            backoff = lm_trie_quant_mboread(trie->quant, cursor, n_hist - 1);
            out->weights[1] =
                (float) logmath_log_float_to_log10(lmath, backoff);
        }
        out->weights[0] = (float) logmath_log_float_to_log10(lmath, prob);

        /* Store the words in the reverse of the order they sit in hist. */
        out->words = (uint32 *) ckd_calloc(order, sizeof(*out->words));
        for (k = 0; k <= n_hist; k++)
            out->words[k] = hist[n_hist - k];

        (*raw_ngram_idx)++;
    }
}