Exemplo n.º 1
0
/**
 * Walk the section markers of a model dump and dispatch each section.
 *
 * Registers the "\end", "\1-gram" and "\2-gram" tags inside a pushed taglib
 * state, classifies the line currently held in the shared linebuf, and hands
 * "\1-gram" / "\2-gram" sections to parse_unigram() / parse_bigram().
 * Parsing stops at "\end" or when my_getline() reports -1.
 *
 * @param input         stream that my_getline() advances.
 * @param phrase_table  forwarded unchanged to the section parsers.
 * @param phrase_index  forwarded unchanged to the section parsers.
 * @param bigram        forwarded unchanged to the section parsers.
 * @return always true; unexpected lines trip assert().
 */
bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table,
                FacadePhraseIndex * phrase_index,
                KMixtureModelBigram * bigram){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */
    ok = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "");
    assert(ok);
    ok = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "");
    assert(ok);

    for (;;) {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch(line_type) {
        case END_LINE:
            goto end;
        case GRAM_1_LINE:
            my_getline(input);
            parse_unigram(input, phrase_table, phrase_index, bigram);
            /* re-classify the current line without fetching a new one;
               presumably the section parser stopped on the next marker */
            continue;
        case GRAM_2_LINE:
            my_getline(input);
            parse_bigram(input, phrase_table, phrase_index, bigram);
            continue;
        default:
            assert(false);
        }

        /* only reached when the default case did not abort */
        if (my_getline(input) == -1)
            break;
    }

 end:
    taglib_pop_state();
    return true;
}
/**
 * Walk the section markers of the input and echo each one to output.
 *
 * Registers the "\end", "\1-gram" and "\2-gram" tags inside a pushed taglib
 * state.  Every marker found in the shared linebuf is written to output
 * verbatim; the contents of the "\1-gram" / "\2-gram" sections are handled
 * by parse_unigram(input, output) / parse_bigram(input, output).
 *
 * @param input   stream that my_getline() advances.
 * @param output  stream receiving the section markers.
 * @return always true; unexpected lines trip assert().
 */
bool parse_body(FILE * input, FILE * output){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */
    ok = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "");
    assert(ok);
    ok = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "");
    assert(ok);

    for (;;) {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch(line_type) {
        case END_LINE:
            fprintf(output, "\\end\n");
            goto end;
        case GRAM_1_LINE:
            fprintf(output, "\\1-gram\n");
            my_getline(input);
            parse_unigram(input, output);
            /* re-classify the current line without fetching a new one */
            continue;
        case GRAM_2_LINE:
            fprintf(output, "\\2-gram\n");
            my_getline(input);
            parse_bigram(input, output);
            continue;
        default:
            assert(false);
        }

        /* only reached when the default case did not abort */
        if (my_getline(input) == -1)
            break;
    }

 end:
    taglib_pop_state();
    return true;
}
Exemplo n.º 3
0
/**
 * Parse the "\data" header line and copy its counters into the model.
 *
 * The line held in the shared linebuf must carry the tags model, count, N
 * and total_freq.  When model equals "k mixture model" the three numeric
 * values are stored in a zeroed KMixtureModelMagicHeader and handed to
 * bigram->set_magic_header().
 *
 * @param bigram  model receiving the magic header.
 * @return false (after printing a message to stderr) when taglib_read()
 *         fails or the model name differs; true otherwise.
 */
bool parse_headline(KMixtureModelBigram * bigram){
    /* enter "\data" line */
    /* The call is made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated and the tag would not be registered. */
    bool tag_added = taglib_add_tag(BEGIN_LINE, "\\data", 0,
                                    "model:count:N:total_freq", "");
    assert(tag_added);
    (void) tag_added; /* keeps NDEBUG builds warning-free */

    /* read "\data" line */
    if ( !taglib_read(linebuf, line_type, values, required) ) {
        fprintf(stderr, "error: k mixture model expected.\n");
        return false;
    }

    assert(line_type == BEGIN_LINE);

    /* check header; the macro introduces the local named by its 2nd argument */
    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
    if ( strcmp("k mixture model", model) != 0 ) {
        fprintf(stderr, "error: k mixture model expected.\n");
        return false;
    }

    TAGLIB_GET_TAGVALUE(glong, count, atol);
    TAGLIB_GET_TAGVALUE(glong, N, atol);
    TAGLIB_GET_TAGVALUE(glong, total_freq, atol);

    /* start from an all-zero header so unset fields stay zero */
    KMixtureModelMagicHeader magic_header;
    memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
    magic_header.m_WC = count;
    magic_header.m_N = N;
    magic_header.m_total_freq = total_freq;
    bigram->set_magic_header(magic_header);

    return true;
}
Exemplo n.º 4
0
/**
 * Parse the "\item" lines of a "\1-gram" section.
 *
 * For each item line the first positional value is converted to a token
 * with taglib_string_to_token() and the value of its "count" tag is passed
 * to phrase_index->add_unigram_frequency().  The loop ends on an END_LINE,
 * GRAM_1_LINE or GRAM_2_LINE line, or when my_getline() reports -1.
 *
 * @param input         stream that my_getline() advances.
 * @param phrases       table used to map the phrase string to a token.
 * @param phrase_index  index whose unigram frequency is updated.
 * @return always true; unexpected lines trip assert().
 */
bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                   FacadePhraseIndex * phrase_index){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch (line_type) {
        case GRAM_1_ITEM_LINE:{
            /* handle \item in \1-gram */
            const char * string = (const char *) g_ptr_array_index(values, 0);
            phrase_token_t token = taglib_string_to_token(phrases, string);

            /* the lookup fills `value`; it must run even in NDEBUG builds,
               otherwise atol() below would receive NULL */
            gpointer value = NULL;
            ok = g_hash_table_lookup_extended(required, "count", NULL, &value);
            assert(ok);

            glong count = atol((const char *)value);
            phrase_index->add_unigram_frequency(token, count);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    taglib_pop_state();
    return true;
}
Exemplo n.º 5
0
/**
 * Parse the "\item" lines of a "\1-gram" section of a k mixture model.
 *
 * Each item line carries a token, its phrase string and the tags count and
 * freq.  The two numbers are stored in a zeroed KMixtureModelArrayHeader
 * that is passed to bigram->set_array_header() for that token.  The loop
 * ends on an END_LINE, GRAM_1_LINE or GRAM_2_LINE line, or when
 * my_getline() reports -1.
 *
 * @param input         stream that my_getline() advances.
 * @param phrase_table  not used by this function.
 * @param phrase_index  used to cross-check token against phrase string.
 * @param bigram        model receiving the per-token array header.
 * @return always true; unexpected lines trip assert().
 */
bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table,
                   FacadePhraseIndex * phrase_index,
                   KMixtureModelBigram * bigram){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch (line_type) {
        case GRAM_1_ITEM_LINE:{
            /* handle \item in \1-gram */
            TAGLIB_GET_TOKEN(token, 0);
            TAGLIB_GET_PHRASE_STRING(word, 1);
            /* consistency check only (presumably side-effect free),
               so it may stay inside assert() */
            assert(taglib_validate_token_with_string
                   (phrase_index, token, word));

            TAGLIB_GET_TAGVALUE(glong, count, atol);
            TAGLIB_GET_TAGVALUE(glong, freq, atol);

            /* start from an all-zero header so unset fields stay zero */
            KMixtureModelArrayHeader array_header;
            memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
            array_header.m_WC = count;
            array_header.m_freq = freq;
            bigram->set_array_header(token, array_header);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    taglib_pop_state();
    return true;
}
/**
 * Check the "\data" header of the input and write the converted header.
 *
 * The line held in the shared linebuf must carry a model tag equal to
 * "k mixture model" (count, N and total_freq are registered in the last
 * tag list passed to taglib_add_tag()).  On success the fixed line
 * "\data model interpolation" is written to output.
 *
 * @param input   not read by this function.
 * @param output  stream receiving the new header line.
 * @return false (after printing a message to stderr) when taglib_read()
 *         fails or the model name differs; true otherwise.
 */
bool parse_headline(FILE * input, FILE * output) {
    /* enter "\data" line */
    /* The call is made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated and the tag would not be registered. */
    bool tag_added = taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
                                    "count:N:total_freq");
    assert(tag_added);
    (void) tag_added; /* keeps NDEBUG builds warning-free */

    /* read "\data" line */
    if ( !taglib_read(linebuf, line_type, values, required) ) {
        fprintf(stderr, "error: k mixture model expected.\n");
        return false;
    }

    assert(line_type == BEGIN_LINE);

    /* the macro introduces the local named by its 2nd argument */
    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
    if ( strcmp("k mixture model", model) != 0 ){
        fprintf(stderr, "error: k mixture model expected.\n");
        return false;
    }

    /* print header */
    fprintf(output, "\\data model interpolation\n");

    return true;
}
/**
 * Convert the "\item" lines of a "\1-gram" section.
 *
 * Each item line carries a token, its phrase string and a freq tag.  Items
 * whose token equals sentence_start and items whose freq is zero are
 * dropped; every other item is written to output as
 * "\item <token> <word> count <freq>".  The loop ends on an END_LINE,
 * GRAM_1_LINE or GRAM_2_LINE line, or when my_getline() reports -1.
 *
 * @param input   stream that my_getline() advances.
 * @param output  stream receiving the converted items.
 * @return always true; unexpected lines trip assert().
 */
bool parse_unigram(FILE * input, FILE * output){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch(line_type) {
        case GRAM_1_ITEM_LINE: {
            /* handle \item in \1-gram */
            TAGLIB_GET_TOKEN(token, 0);
            TAGLIB_GET_PHRASE_STRING(word, 1);

            /* remove the "<start>" in the uni-gram of interpolation model */
            if ( sentence_start == token )
                break;

            TAGLIB_GET_TAGVALUE(glong, freq, atol);

            /* ignore zero unigram freq item */
            if ( 0 != freq )
                fprintf(output, "\\item %d %s count %ld\n", token, word, freq);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    taglib_pop_state();
    return true;
}
/**
 * Convert the "\item" lines of a "\2-gram" section.
 *
 * Each item line carries two token / phrase-string pairs and a count tag
 * (T, N_n_0, n_1 and Mr are registered in the last tag list passed to
 * taglib_add_tag() and are not read here).  Every item is written to
 * output as "\item <token1> <word1> <token2> <word2> count <count>".
 * The loop ends on an END_LINE, GRAM_1_LINE or GRAM_2_LINE line, or when
 * my_getline() reports -1.
 *
 * @param input   stream that my_getline() advances.
 * @param output  stream receiving the converted items.
 * @return always true; unexpected lines trip assert().
 */
bool parse_bigram(FILE * input, FILE * output){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
                             "count", "T:N_n_0:n_1:Mr");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch (line_type) {
        case GRAM_2_ITEM_LINE:{
            /* handle \item in \2-gram */
            /* two strings */
            TAGLIB_GET_TOKEN(token1, 0);
            TAGLIB_GET_PHRASE_STRING(word1, 1);

            TAGLIB_GET_TOKEN(token2, 2);
            TAGLIB_GET_PHRASE_STRING(word2, 3);

            TAGLIB_GET_TAGVALUE(glong, count, atol);
            fprintf(output, "\\item %d %s %d %s count %ld\n",
                    token1, word1, token2, word2, count);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    taglib_pop_state();
    return true;
}
Exemplo n.º 9
0
/**
 * Parse the "\item" lines of a "\2-gram" section of a k mixture model.
 *
 * Each item line carries two token / phrase-string pairs and the tags
 * count, T, N_n_0, n_1 and Mr.  The numbers are stored in a zeroed
 * KMixtureModelArrayItem that is inserted under token2 into the single
 * gram belonging to token1.  The single gram of the most recent token1 is
 * kept between iterations; when token1 changes (and once more at the end)
 * it is written back with bigram->store() and deleted.  The loop ends on
 * an END_LINE, GRAM_1_LINE or GRAM_2_LINE line, or when my_getline()
 * reports -1.
 *
 * @param input         stream that my_getline() advances.
 * @param phrase_table  not used by this function.
 * @param phrase_index  used to cross-check tokens against phrase strings.
 * @param bigram        model the single grams are loaded from / stored to.
 * @return always true; unexpected lines trip assert().
 */
bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
                  FacadePhraseIndex * phrase_index,
                  KMixtureModelBigram * bigram){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
                             "count:T:N_n_0:n_1:Mr", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    phrase_token_t last_token = null_token;
    KMixtureModelSingleGram * last_single_gram = NULL;
    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch (line_type) {
        case GRAM_2_ITEM_LINE:{
            /* handle \item in \2-gram */
            /* two tokens */
            TAGLIB_GET_TOKEN(token1, 0);
            TAGLIB_GET_PHRASE_STRING(word1, 1);
            /* consistency check only (presumably side-effect free),
               so it may stay inside assert() */
            assert(taglib_validate_token_with_string
                   (phrase_index, token1, word1));

            TAGLIB_GET_TOKEN(token2, 2);
            TAGLIB_GET_PHRASE_STRING(word2, 3);
            assert(taglib_validate_token_with_string
                   (phrase_index, token2, word2));

            TAGLIB_GET_TAGVALUE(glong, count, atol);
            TAGLIB_GET_TAGVALUE(glong, T, atol);
            assert(count == T);
            TAGLIB_GET_TAGVALUE(glong, N_n_0, atol);
            TAGLIB_GET_TAGVALUE(glong, n_1, atol);
            TAGLIB_GET_TAGVALUE(glong, Mr, atol);

            /* start from an all-zero item so unset fields stay zero */
            KMixtureModelArrayItem array_item;
            memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
            array_item.m_WC = count;
            array_item.m_N_n_0 = N_n_0;
            array_item.m_n_1 = n_1;
            array_item.m_Mr = Mr;

            if ( last_token != token1 ) {
                /* token1 changed: write back the previous single gram */
                if ( last_token && last_single_gram ) {
                    bigram->store(last_token, last_single_gram);
                    delete last_single_gram;
                    /* safe guard */
                    last_token = null_token;
                    last_single_gram = NULL;
                }
                KMixtureModelSingleGram * single_gram = NULL;
                bigram->load(token1, single_gram);

                /* create the new single gram */
                if ( single_gram == NULL )
                    single_gram = new KMixtureModelSingleGram;
                last_token = token1;
                last_single_gram = single_gram;
            }

            assert(NULL != last_single_gram);
            /* the insertion is the actual work of this branch and must
               not live inside assert() */
            ok = last_single_gram->insert_array_item(token2, array_item);
            assert(ok);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    /* write back the single gram that is still pending */
    if ( last_token && last_single_gram ) {
        bigram->store(last_token, last_single_gram);
        delete last_single_gram;
        /* safe guard */
        last_token = null_token;
        last_single_gram = NULL;
    }

    taglib_pop_state();
    return true;
}
Exemplo n.º 10
0
int main(int argc, char * argv[]){
    FILE * input = stdin;
    const char * bigram_filename = "bigram.db";

    PhraseLargeTable phrases;

    MemoryChunk * chunk = new MemoryChunk;
    bool retval = chunk->load("phrase_index.bin");
    if (!retval) {
        fprintf(stderr, "open phrase_index.bin failed!\n");
        exit(ENOENT);
    }
    phrases.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    Bigram bigram;
    retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
    if (!retval) {
        fprintf(stderr, "open %s failed!\n", bigram_filename);
        exit(ENOENT);
    }

    taglib_init();

    values = g_ptr_array_new();
    required = g_hash_table_new(g_str_hash, g_str_equal);

    //enter "\data" line
    assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
    ssize_t result = my_getline(input);
    if ( result == -1 ) {
        fprintf(stderr, "empty file input.\n");
        exit(ENODATA);
    }

    //read "\data" line
    if ( !taglib_read(linebuf, line_type, values, required) ) {
        fprintf(stderr, "error: interpolation model expected.\n");
        exit(ENODATA);
    }

    assert(line_type == BEGIN_LINE);
    char * value = NULL;
    assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
    if ( !( strcmp("interpolation", value) == 0 ) ) {
        fprintf(stderr, "error: interpolation model expected.\n");
        exit(ENODATA);
    }

    result = my_getline(input);
    if ( result != -1 )
        parse_body(input, &phrases, &phrase_index, &bigram);

    taglib_fini();

    if (!save_phrase_index(&phrase_index))
        exit(ENOENT);

    return 0;
}
Exemplo n.º 11
0
/**
 * Parse the "\item" lines of a "\2-gram" section of an interpolation model.
 *
 * Each item line carries two phrase strings, converted to tokens with
 * taglib_string_to_token(), and a count tag.  The count is inserted under
 * token2 into the single gram belonging to token1 and added to that single
 * gram's total frequency.  The single gram of the most recent token1 is
 * kept between iterations; when token1 changes (and once more at the end)
 * it is written back with bigram->store() and deleted.  The loop ends on
 * an END_LINE, GRAM_1_LINE or GRAM_2_LINE line, or when my_getline()
 * reports -1.
 *
 * @param input         stream that my_getline() advances.
 * @param phrases       table used to map phrase strings to tokens.
 * @param phrase_index  not used by this function.
 * @param bigram        store the single grams are loaded from / saved to.
 * @return always true; unexpected lines trip assert().
 */
bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
                  FacadePhraseIndex * phrase_index,
                  Bigram * bigram){
    taglib_push_state();

    /* The calls are made outside assert(): with NDEBUG defined the argument
       of assert() is never evaluated, which would silently skip them. */
    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", "");
    assert(ok);
    (void) ok; /* keeps NDEBUG builds free of unused-variable warnings */

    phrase_token_t last_token = 0;
    SingleGram * last_single_gram = NULL;
    do {
        ok = taglib_read(linebuf, line_type, values, required);
        assert(ok);

        switch (line_type) {
        case GRAM_2_ITEM_LINE:{
            /* handle \item in \2-gram */
            /* two tokens */
            const char * string = (const char *) g_ptr_array_index(values, 0);
            phrase_token_t token1 = taglib_string_to_token(phrases, string);
            string = (const char *) g_ptr_array_index(values, 1);
            phrase_token_t token2 = taglib_string_to_token(phrases, string);

            /* tag: count -- the lookup fills `value` and must run even in
               NDEBUG builds, otherwise atol() would receive NULL */
            gpointer value = NULL;
            ok = g_hash_table_lookup_extended(required, "count", NULL, &value);
            assert(ok);
            glong count = atol((const char *)value);

            if ( last_token != token1 ) {
                /* token1 changed: write back the previous single gram */
                if ( last_token && last_single_gram ) {
                    bigram->store(last_token, last_single_gram);
                    delete last_single_gram;
                    /* safe guard */
                    last_token = 0;
                    last_single_gram = NULL;
                }
                SingleGram * single_gram = NULL;
                bigram->load(token1, single_gram);

                /* create the new single gram */
                if ( single_gram == NULL )
                    single_gram = new SingleGram;
                last_token = token1;
                last_single_gram = single_gram;
            }

            /* save the freq; these three calls are the actual work of this
               branch and must not live inside assert() */
            guint32 total_freq = 0;
            ok = last_single_gram->get_total_freq(total_freq);
            assert(ok);
            ok = last_single_gram->insert_freq(token2, count);
            assert(ok);
            total_freq += count;
            ok = last_single_gram->set_total_freq(total_freq);
            assert(ok);
            break;
        }
        case END_LINE:
        case GRAM_1_LINE:
        case GRAM_2_LINE:
            /* a marker line ends this section; leave it for the caller */
            goto end;
        default:
            assert(false);
        }
    } while (my_getline(input) != -1);

 end:
    /* write back the single gram that is still pending */
    if ( last_token && last_single_gram ) {
        bigram->store(last_token, last_single_gram);
        delete last_single_gram;
        /* safe guard */
        last_token = 0;
        last_single_gram = NULL;
    }

    taglib_pop_state();
    return true;
}