int main(int argc, char * argv[]){
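    /* Validate the unigram and bigram parts of the k mixture model file
     * given as the single command line argument (opened read-only). */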

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- validate k mixture model");
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    if (2 != argc) {
        fprintf(stderr, "wrong arguments.\n");
        exit(EINVAL);
    }

    const char * k_mixture_model_filename = argv[1];

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);

    if (!validate_unigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    if (!validate_bigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    return 0;
}
int main(int argc, char * argv[]){
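    /* Import a textual k mixture model read from stdin into a binary
     * KMixtureModelBigram database.  Note: k_mixture_model_filename, values
     * and required are presumably file-scope globals defined alongside the
     * GOption entries in the full source. */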
    FILE * input = stdin;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- import k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    taglib_init();

    /* prepare to read n-gram model */
    values = g_ptr_array_new();
    required = g_hash_table_new(g_str_hash, g_str_equal);

    ssize_t result = my_getline(input);
    if ( result == -1 ) {
        fprintf(stderr, "empty file input.\n");
        exit(ENODATA);
    }

    if (!parse_headline(&bigram))
        exit(ENODATA);

    result = my_getline(input);
    if ( result != -1 )
        parse_body(input, &phrase_table, &phrase_index, &bigram);

    taglib_fini();

    return 0;
}
int main(int argc, char * argv[]){
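    /* Validate the unigram and bigram parts of a k mixture model file;
     * this variant parses its arguments by hand instead of using GOption. */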
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else {
            k_mixture_model_filename = argv[i];
        }
        ++i;
    }

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);

    if (!validate_unigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    if (!validate_bigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    return 0;
}
int main(int argc, char * argv[]){
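    /* Estimate the interpolation lambda by deleted interpolation: for every
     * token that appears in the held-out (deleted) bigram, compute a lambda
     * against the training bigram and report the average.  Note that
     * bigram_filename and deleted_bigram_filename are presumably file-scope
     * globals defined alongside the GOption entries. */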
    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- estimate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    /* TODO: magic header signature check here. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(bigram_filename, ATTACH_READONLY);

    KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY);

    GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    deleted_bigram.get_all_items(deleted_items);

    parameter_t lambda_sum = 0;
    int lambda_count = 0;

    for( size_t i = 0; i < deleted_items->len; ++i ){
        phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
        KMixtureModelSingleGram * single_gram = NULL;
        bigram.load(*token, single_gram, true);

        KMixtureModelSingleGram * deleted_single_gram = NULL;
        deleted_bigram.load(*token, deleted_single_gram);

        KMixtureModelArrayHeader array_header;
        if (single_gram)
            assert(single_gram->get_array_header(array_header));
        KMixtureModelArrayHeader deleted_array_header;
        assert(deleted_single_gram->get_array_header(deleted_array_header));

        if ( 0 != deleted_array_header.m_WC ) {
            parameter_t lambda = compute_interpolation(deleted_single_gram, &bigram, single_gram);

            printf("token:%d lambda:%f\n", *token, lambda);

            lambda_sum += lambda;
            lambda_count ++;
        }

        if (single_gram)
            delete single_gram;
        delete deleted_single_gram;
    }

    printf("average lambda:%f\n", (lambda_sum/lambda_count));
    g_array_free(deleted_items, TRUE);
    return 0;
}
Example #5
void JelinekMercerFeature::read_counts(const std::string filename) {
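  /* Read the count file: number of corpora, vocabulary size and word list,
   * then, per corpus, its name and _compare value followed by each word's
   * unigram count and the bigram counts of its observed continuations. */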
  std::ifstream infile;
  infile.open(filename.c_str());

  int type;
  int count;
  int contexts;
  int next;
  std::string word;

  infile >> _corpora;

  infile >> _vocab;
  _types.resize(_vocab);
  _corpus_names.resize(_corpora);
  for (int ii=0; ii < _vocab; ++ii) {
    infile >> word;
    _types[ii] = word;
    if (ii % 25000 == 0)
      std::cout << "Read vocab " << word << " (" << ii << "/" << _vocab << ")" << std::endl;
  }

  /*
   * Size the counts appropriately
   */
  assert(_corpora > 0);
  this->_unigram.resize(_corpora);
  this->_bigram.resize(_corpora);
  this->_normalizer.resize(_corpora);
  this->_compare.resize(_corpora);

  for (int cc=0; cc < _corpora; ++cc) {
    infile >> _corpus_names[cc];
    infile >> _compare[cc];
    if (cc % 1000 == 0)
      std::cout << "Read corpus for " << _corpus_names[cc] << " (" << cc << "/" << _corpora << ")" << std::endl;
    _unigram[cc].resize(_vocab);
    _normalizer[cc] = 0;

    for (int vv=0; vv < _vocab; ++vv) {
      infile >> type;
      infile >> count;
      infile >> contexts;
      assert(type == vv);

      // Set unigram counts
      _normalizer[cc] += (float)count + kSMOOTH;
      _unigram[cc][type] = count;

      for (int bb=0; bb<contexts; ++bb) {
        infile >> next;
        infile >> count;
        _bigram[cc][bigram(type, next)] = count;
      }
    }
  }
  std::cout << "Done reading corpus" << std::endl;
}
Example #6
double
CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid, uint32_t wid)
{
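    // Score wid against the last word of the context (DCWID when the
    // context is empty) by looking up the corresponding bigram.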
    TBigram bigram(DCWID, DCWID);
    if (its_wid != ite_wid)
        bigram.first = *(ite_wid - 1);
    bigram.second = wid;
    return pr(bigram);
}
Example #7
void
CBigramHistory::forget(uint32_t *its_wid, uint32_t *ite_wid)
{
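    // Erase from m_bifreq the bigram formed by each word in [its_wid, ite_wid)
    // and its successor (DCWID for the last word in the range).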
    for (; its_wid < ite_wid; ++its_wid) {
        TBigram bigram(*its_wid, DCWID);

        if (its_wid + 1 != ite_wid)
            bigram.second = *(its_wid + 1);

        TBigramPool::iterator it = m_bifreq.find(bigram);
        if (it != m_bifreq.end())
            m_bifreq.erase(it);
    }
}
int main(int argc, char * argv[]){
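    /* Dump a k mixture model to stdout in its textual form: the magic
     * header, the per-token array headers and the array items. */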
    int i = 1;
    const char * k_mixture_model_filename = NULL;
    FILE * output = stdout;

    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    FacadePhraseIndex phrase_index;

    //gb_char binary file
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);

    //gbk_char binary file
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);

    print_k_mixture_model_magic_header(output, &bigram);
    print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
    print_k_mixture_model_array_items(output, &bigram, &phrase_index);

    end_data(output);

    return 0;
}
Example #9
bool
CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
{
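    // Rebuild the context memory from a saved buffer, replaying the stored
    // word ids to restore the unigram and bigram frequency tables.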
    clear();

    sz /= sizeof(uint32_t);
    uint32_t *pw = (uint32_t*)buf_ptr;

    if (pw && sz > 0) {
#ifndef WORDS_BIGENDIAN
        std::transform(pw, pw + sz, pw, swap32);
#endif
        TBigram bigram(DCWID, DCWID);
        for (size_t i = 0; i < sz; ++i) {
            bigram.first = bigram.second;
            bigram.second = *pw++;
            m_memory.push_back(bigram.second);
            incUniFreq(bigram.second);
            incBiFreq(bigram);
        }
    }
    return true;
}
Example #10
bool
CBigramHistory::memorize(uint32_t* its_wid, uint32_t* ite_wid)
{
    TBigram bigram(DCWID, DCWID);

    // First, we insert a DC word id before the context history
    // to separate it from the previous stream.
    if (m_memory.size() == contxt_memory_size) {
        TBigram hb;
        hb.first = m_memory.front();
        m_memory.pop_front();
        hb.second = m_memory.front();

        decUniFreq(hb.first);
        decBiFreq(hb);
    }
    m_memory.push_back(DCWID);

    // Now memorize the new stream, forgetting the oldest entries when full.
    for (; its_wid != ite_wid; ++its_wid) {
        if (m_memory.size() == contxt_memory_size) {
            TBigram hb;
            hb.first = m_memory.front();
            m_memory.pop_front();
            hb.second = m_memory.front();

            decUniFreq(hb.first);
            decBiFreq(hb);
        }
        bigram.first = bigram.second;
        bigram.second = *its_wid;
        m_memory.push_back(*its_wid);
        incUniFreq(bigram.second);
        incBiFreq(bigram);
    }
    return true;
}
int main(int argc, char * argv[]){
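    /* Generate or update a k mixture model from one or more text documents.
     * Note: g_train_pi_gram, g_maximum_occurs and g_maximum_increase_rates
     * are presumably file-scope training parameters set by these options. */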
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            break;
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train on the document and convert it into the k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free the resources held by hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
int main(int argc, char * argv[]){
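    /* Prune a k mixture model in place: drop low-weight bigram items per
     * token, subtract their counts from the affected unigram headers, and
     * finally remove tokens whose counts reach zero.  Note that g_prune_k
     * and g_prune_poss are presumably file-scope pruning parameters. */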
    int i = 1;
    const char * bigram_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("-k", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_prune_k = atoi(argv[i]);
        } else if ( strcmp("--CDF", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_prune_poss = atof(argv[i]);
        } else {
            bigram_filename = argv[i];
        }
        ++i;
    }

    /* TODO: magic header signature check here. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(bigram_filename, ATTACH_READWRITE);

    KMixtureModelMagicHeader magic_header;
    bigram.get_magic_header(magic_header);
    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram.get_all_items(items);

    /* print prune progress */
    size_t progress = 0; size_t onestep = items->len / 20;
    for ( size_t i = 0; i < items->len; ++i ){
        if ( progress >= onestep ) {
            progress = 0; fprintf(stderr, "*");
        }
        progress ++;

        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        KMixtureModelSingleGram * single_gram = NULL;
        bigram.load(*token, single_gram);

        FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));

        prune_k_mixture_model(&magic_header, single_gram, removed_array);
        bigram.store(*token, single_gram);

        delete single_gram;

        /* post processing for unigram reduce */
        for (size_t m = 0; m < removed_array->len; ++m ){
            KMixtureModelArrayItemWithToken * item =
                &g_array_index(removed_array,
                              KMixtureModelArrayItemWithToken, m);
            KMixtureModelArrayHeader array_header;
            assert(bigram.get_array_header(item->m_token, array_header));
            array_header.m_freq -= item->m_item.m_WC;
            assert(array_header.m_freq >= 0);
            assert(bigram.set_array_header(item->m_token, array_header));
        }

        g_array_free(removed_array, TRUE);
        removed_array = NULL;
    }

    fprintf(stderr, "\n");

    bigram.set_magic_header(magic_header);

    /* post processing clean up zero items */
    KMixtureModelArrayHeader array_header;
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        assert(bigram.get_array_header(*token, array_header));
        if ( 0 == array_header.m_WC && 0 == array_header.m_freq )
            assert(bigram.remove(*token));
    }

    g_array_free(items, TRUE);

    return 0;
}
Example #13
int JelinekMercerFeature::bigram_count(int corpus, int first, int second) {
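  // Return the stored count for the bigram (first, second) in the given
  // corpus, or 0 if the pair was never observed.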
  bigram key = bigram(first, second);
  if (_bigram[corpus].find(key) == _bigram[corpus].end()) return 0;
  else return _bigram[corpus][key];
}
Example #14
int main(int argc, char * argv[]){
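    /* Generate or update a k mixture model from one or more text documents,
     * using the system phrase table and phrase index.  Note that
     * g_k_mixture_model_filename is presumably a file-scope global defined
     * alongside the GOption entries. */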
    int i = 1;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train on the document and convert it into the k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free the resources held by hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
int main(int argc, char * argv[]) {
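    /* Exercise FlexibleSingleGram and FlexibleBigram: fill and query a
     * single gram, store/load it through an on-disk bigram at
     * /tmp/training.db, and check the magic header, array headers and
     * item removal round-trips. */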
    FlexibleSingleGram<guint32, guint32> single_gram;
    typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t;

    const guint32 total_freq = 16;
    assert(single_gram.set_array_header(total_freq));

    phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 };
    guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};

    guint32 freq;

    for ( size_t i = 0; i < 6; ++i ){
        if ( single_gram.get_array_item(tokens[i], freq) )
            assert(single_gram.set_array_item(tokens[i], freqs[i]));
        else
            assert(single_gram.insert_array_item(tokens[i], freqs[i]));
    }

    single_gram.get_array_item(3, freq);
    assert(freq == 32);

    printf("--------------------------------------------------------\n");
    PhraseIndexRange range;
    FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(array_item_t));
    range.m_range_begin = 0; range.m_range_end = 8;
    single_gram.search(&range, array);
    for ( size_t i = 0; i < array->len; ++i ){
        array_item_t * item = &g_array_index(array, array_item_t, i);
        printf("item:%d:%d\n", item->m_token, item->m_item);
    }

    assert(single_gram.get_array_header(freq));
    assert(freq == total_freq);

    FlexibleBigram<guint32, guint32, guint32> bigram("TEST");
    assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE));
    bigram.store(1, &single_gram);
    assert(single_gram.insert_array_item(5, 8));
    assert(single_gram.remove_array_item(1, freq));
    assert(single_gram.set_array_header(32));
    assert(single_gram.get_array_header(freq));
    printf("new array header:%d\n", freq);
    bigram.store(2, &single_gram);

    for (int m = 1; m <= 2; ++m ){
        printf("--------------------------------------------------------\n");
        FlexibleSingleGram<guint32, guint32> * train_gram;
        bigram.load(m, train_gram);
        g_array_set_size(array, 0);
        range.m_range_begin = 0; range.m_range_end = 8;
        train_gram->search(&range, array);
        for ( size_t i = 0; i < array->len; ++i ){
            array_item_t * item = &g_array_index(array, array_item_t, i);
            printf("item:%d:%d\n", item->m_token, item->m_item);
        }
        delete train_gram;
    }

    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram.get_all_items(items);
    printf("-----------------------items----------------------------\n");
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        printf("item:%d\n", *token);
    }

    printf("-----------------------magic header---------------------\n");
    bigram.set_magic_header(total_freq);
    bigram.get_magic_header(freq);
    assert(total_freq == freq);
    printf("magic header:%d\n", freq);

    printf("-----------------------array header---------------------\n");
    for ( size_t i = 1; i <= 2; ++i){
        bigram.get_array_header(i, freq);
        printf("single gram: %d, freq:%d\n", i, freq);
    }

    bigram.set_array_header(1, 1);

    printf("-----------------------array header---------------------\n");
    for ( size_t i = 1; i <= 2; ++i){
        bigram.get_array_header(i, freq);
        printf("single gram: %d, freq:%d\n", i, freq);
    }

    for (int m = 1; m <= 2; ++m ){
        printf("--------------------------------------------------------\n");
        FlexibleSingleGram<guint32, guint32> * train_gram;
        bigram.load(m, train_gram);
        g_array_set_size(array, 0);
        range.m_range_begin = 0; range.m_range_end = 8;
        train_gram->search(&range, array);
        for ( size_t i = 0; i < array->len; ++i ){
            array_item_t * item = &g_array_index(array, array_item_t, i);
            printf("item:%d:%d\n", item->m_token, item->m_item);
        }
        delete train_gram;
    }

    assert(bigram.remove(1));

    bigram.get_all_items(items);
    printf("-----------------------items----------------------------\n");
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        printf("item:%d\n", *token);
    }

    g_array_free(items, TRUE);
    g_array_free(array, TRUE);
    return 0;
}