/**
 * Manual smoke test for Vocabulary::contains().
 *
 * Builds a Vocabulary from a stop-word CSV file and prints the result of
 * contains() for a handful of tokens ("the", ".", ")", ","), one per line.
 * Nothing is asserted; the output is meant to be inspected by eye.
 *
 * NOTE(review): the path below is an absolute, developer-specific location;
 * the test only makes sense on a machine where that file exists.
 */
void test_simple_vocabulary(void) {
    // Automatic storage instead of new/delete: the object is destroyed on
    // every exit path, including when contains() or the stream throws.
    Vocabulary voc("/home/sasha/work/data/test/stopwords.csv");

    cout << voc.contains("the") << endl;
    cout << voc.contains(".") << endl;
    cout << voc.contains(")") << endl;
    cout << voc.contains(",") << endl;
}
void do_trans_wc() { Vocabulary voc; WordForm wordform; voc.load(settings.voc_file_name); wordform.load(settings.wf_file_name); voc.dump_words(wordform); bool by_alth = false; wordreader_t reader(stdin); while (!reader.eof()) { string cap_orig_word = reader.get_word(); if (cap_orig_word.empty()) continue; string orig_word = tolower(cap_orig_word), word = wordform.get_base( orig_word); if (word.empty()) { word = orig_word; } else { if (word != orig_word) word_forms[word].insert(orig_word); } if (settings.work_mode == MODE_TRANS) { if (found_words.count(word) == 0) { if (voc.contains(word)) { found_words.insert(word); if (!by_alth) { printf("%s\t%s\n", word.c_str(), voc.get_trans(word).c_str()); } } else { not_found_words.insert(cap_orig_word); } } } else if (settings.work_mode == MODE_WC) { ++alph_words[word]; } ++total_words; if (settings.verbose) { if (total_words % 1000 == 0) { fprintf(stderr, "read %d words\n", total_words); } } } if (settings.verbose) { fprintf(stderr, "total: read %d words\n", total_words); } if (settings.work_mode == MODE_TRANS) { if (by_alth) { for (set<string>::iterator w = found_words.begin(); w != found_words.end(); ++w) { printf("%s\t%s\n", w->c_str(), voc.get_trans(*w).c_str()); } } if (settings.full_out) { print_notfound(); print_wordform(); } } else if (settings.work_mode == MODE_WC) { multimap<int, string_t> ord_words; string_t word; int occur; map<string_t, int>::iterator last; for (map<string_t, int>::iterator w = alph_words.begin(); w != alph_words.end(); ++w) { ord_words.insert(make_pair(w->second, w->first)); } for (multimap<int, string_t>::reverse_iterator w = ord_words.rbegin(); w != ord_words.rend(); ++w) { occur = w->first; printf("%s\n", w->second.c_str()); } if (settings.full_out) { freopen((settings.in_file_name + ".occur.txt").c_str(), "w+t", stdout); for (multimap<int, string_t>::reverse_iterator w = ord_words.rbegin(); w != ord_words.rend(); ++w) { word = w->second; occur = w->first; printf("%s\t%d\t%.3lf%%\n", 
word.c_str(), occur, occur * 100.0 / total_words); } freopen((settings.in_file_name + ".alphabet.txt").c_str(), "w+t", stdout); for (map<string_t, int>::iterator w = alph_words.begin(); w != alph_words.end(); ++w) { word = w->first; occur = w->second; printf("%s\t%d\t%.3lf%%\n", word.c_str(), occur, occur * 100.0 / total_words); } print_wordform(); } } }