bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, KMixtureModelBigram * bigram){ taglib_push_state(); assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: assert(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: goto end; case GRAM_1_LINE: my_getline(input); parse_unigram(input, phrase_table, phrase_index, bigram); goto retry; case GRAM_2_LINE: my_getline(input); parse_bigram(input, phrase_table, phrase_index, bigram); goto retry; default: assert(false); } } while (my_getline(input) != -1) ; end: taglib_pop_state(); return true; }
bool parse_unigram(FILE * input, PhraseLargeTable * phrases, FacadePhraseIndex * phrase_index){ taglib_push_state(); assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", "")); do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_1_ITEM_LINE:{ /* handle \item in \1-gram */ const char * string = (const char *) g_ptr_array_index(values, 0); phrase_token_t token = taglib_string_to_token(phrases, string); gpointer value = NULL; assert(g_hash_table_lookup_extended(required, "count", NULL, &value)); glong count = atol((const char *)value); phrase_index->add_unigram_frequency(token, count); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; }
bool parse_body(FILE * input, FILE * output){ taglib_push_state(); assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: assert(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: fprintf(output, "\\end\n"); goto end; case GRAM_1_LINE: fprintf(output, "\\1-gram\n"); my_getline(input); parse_unigram(input, output); goto retry; case GRAM_2_LINE: fprintf(output, "\\2-gram\n"); my_getline(input); parse_bigram(input, output); goto retry; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; }
bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, KMixtureModelBigram * bigram){ taglib_push_state(); assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "")); do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_1_ITEM_LINE:{ /* handle \item in \1-gram */ TAGLIB_GET_TOKEN(token, 0); TAGLIB_GET_PHRASE_STRING(word, 1); assert(taglib_validate_token_with_string (phrase_index, token, word)); TAGLIB_GET_TAGVALUE(glong, count, atol); TAGLIB_GET_TAGVALUE(glong, freq, atol); KMixtureModelArrayHeader array_header; memset(&array_header, 0, sizeof(KMixtureModelArrayHeader)); array_header.m_WC = count; array_header.m_freq = freq; bigram->set_array_header(token, array_header); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; }
bool parse_unigram(FILE * input, FILE * output){ taglib_push_state(); assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count")); do { assert(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case GRAM_1_ITEM_LINE: { /* handle \item in \1-gram */ TAGLIB_GET_TOKEN(token, 0); TAGLIB_GET_PHRASE_STRING(word, 1); /* remove the "<start>" in the uni-gram of interpolation model */ if ( sentence_start == token ) break; TAGLIB_GET_TAGVALUE(glong, freq, atol); /* ignore zero unigram freq item */ if ( 0 != freq ) fprintf(output, "\\item %d %s count %ld\n", token, word, freq); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; }
bool parse_bigram(FILE * input, FILE * output){ taglib_push_state(); assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "T:N_n_0:n_1:Mr")); do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two strings */ TAGLIB_GET_TOKEN(token1, 0); TAGLIB_GET_PHRASE_STRING(word1, 1); TAGLIB_GET_TOKEN(token2, 2); TAGLIB_GET_PHRASE_STRING(word2, 3); TAGLIB_GET_TAGVALUE(glong, count, atol); fprintf(output, "\\item %d %s %d %s count %ld\n", token1, word1, token2, word2, count); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; }
bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, KMixtureModelBigram * bigram){ taglib_push_state(); assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count:T:N_n_0:n_1:Mr", "")); phrase_token_t last_token = null_token; KMixtureModelSingleGram * last_single_gram = NULL; do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two tokens */ TAGLIB_GET_TOKEN(token1, 0); TAGLIB_GET_PHRASE_STRING(word1, 1); assert(taglib_validate_token_with_string (phrase_index, token1, word1)); TAGLIB_GET_TOKEN(token2, 2); TAGLIB_GET_PHRASE_STRING(word2, 3); assert(taglib_validate_token_with_string (phrase_index, token2, word2)); TAGLIB_GET_TAGVALUE(glong, count, atol); TAGLIB_GET_TAGVALUE(glong, T, atol); assert(count == T); TAGLIB_GET_TAGVALUE(glong, N_n_0, atol); TAGLIB_GET_TAGVALUE(glong, n_1, atol); TAGLIB_GET_TAGVALUE(glong, Mr, atol); KMixtureModelArrayItem array_item; memset(&array_item, 0, sizeof(KMixtureModelArrayItem)); array_item.m_WC = count; array_item.m_N_n_0 = N_n_0; array_item.m_n_1 = n_1; array_item.m_Mr = Mr; if ( last_token != token1 ) { if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; /* safe guard */ last_token = null_token; last_single_gram = NULL; } KMixtureModelSingleGram * single_gram = NULL; bigram->load(token1, single_gram); /* create the new single gram */ if ( single_gram == NULL ) single_gram = new KMixtureModelSingleGram; last_token = token1; last_single_gram = single_gram; } assert(NULL != last_single_gram); assert(last_single_gram->insert_array_item(token2, array_item)); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; /* safe guard */ last_token = null_token; last_single_gram = NULL; } taglib_pop_state(); return true; }
bool parse_bigram(FILE * input, PhraseLargeTable * phrases, FacadePhraseIndex * phrase_index, Bigram * bigram){ taglib_push_state(); assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", "")); phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL; do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two tokens */ const char * string = (const char *) g_ptr_array_index(values, 0); phrase_token_t token1 = taglib_string_to_token(phrases, string); string = (const char *) g_ptr_array_index(values, 1); phrase_token_t token2 = taglib_string_to_token(phrases, string); gpointer value = NULL; /* tag: count */ assert(g_hash_table_lookup_extended(required, "count", NULL, &value)); glong count = atol((const char *)value); if ( last_token != token1 ) { if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; //safe guard last_token = 0; last_single_gram = NULL; } SingleGram * single_gram = NULL; bigram->load(token1, single_gram); //create the new single gram if ( single_gram == NULL ) single_gram = new SingleGram; last_token = token1; last_single_gram = single_gram; } //save the freq guint32 total_freq = 0; assert(last_single_gram->get_total_freq(total_freq)); assert(last_single_gram->insert_freq(token2, count)); total_freq += count; assert(last_single_gram->set_total_freq(total_freq)); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; //safe guard last_token = 0; last_single_gram = NULL; } taglib_pop_state(); return true; }