/**
 * Smoke test for Vocabulary::contains().
 *
 * Constructs a Vocabulary from the stop-word CSV path below and prints the
 * result of contains() for the word "the" and for three punctuation tokens,
 * one result per line on cout.
 *
 * The vocabulary is kept in automatic storage so it is destroyed on every
 * exit path; the earlier new/delete pair leaked the object whenever
 * contains() or the stream insertion threw.
 *
 * NOTE(review): the path is an absolute, machine-specific location; the test
 * presumably only works where that file exists.
 */
void test_simple_vocabulary(void) {
	Vocabulary voc("/home/sasha/work/data/test/stopwords.csv");
	cout << voc.contains("the") << endl;
	cout << voc.contains(".") << endl;
	cout << voc.contains(")") << endl;
	cout << voc.contains(",") << endl;
}
// Example #2
// 0
void do_trans_wc()
{
	Vocabulary voc;
	WordForm wordform;

	voc.load(settings.voc_file_name);
	wordform.load(settings.wf_file_name);
	voc.dump_words(wordform);

	bool by_alth = false;
	wordreader_t reader(stdin);
	while (!reader.eof()) {
		string cap_orig_word = reader.get_word();
		if (cap_orig_word.empty())
			continue;
		string orig_word = tolower(cap_orig_word), word = wordform.get_base(
				orig_word);
		if (word.empty()) {
			word = orig_word;
		} else {
			if (word != orig_word)
				word_forms[word].insert(orig_word);
		}

		if (settings.work_mode == MODE_TRANS) {
			if (found_words.count(word) == 0) {
				if (voc.contains(word)) {
					found_words.insert(word);
					if (!by_alth) {
						printf("%s\t%s\n", word.c_str(),
								voc.get_trans(word).c_str());
					}
				} else {
					not_found_words.insert(cap_orig_word);
				}
			}
		} else if (settings.work_mode == MODE_WC) {
			++alph_words[word];
		}
		++total_words;
		if (settings.verbose) {
			if (total_words % 1000 == 0) {
				fprintf(stderr, "read %d words\n", total_words);
			}
		}
	}
	if (settings.verbose) {
		fprintf(stderr, "total: read %d words\n", total_words);
	}

	if (settings.work_mode == MODE_TRANS) {
		if (by_alth) {
			for (set<string>::iterator w = found_words.begin(); w
					!= found_words.end(); ++w) {
				printf("%s\t%s\n", w->c_str(), voc.get_trans(*w).c_str());
			}
		}

		if (settings.full_out) {
			print_notfound();
			print_wordform();
		}
	} else if (settings.work_mode == MODE_WC) {
		multimap<int, string_t> ord_words;
		string_t word;
		int occur;
		map<string_t, int>::iterator last;
		for (map<string_t, int>::iterator w = alph_words.begin(); w
				!= alph_words.end(); ++w) {
			ord_words.insert(make_pair(w->second, w->first));
		}

		for (multimap<int, string_t>::reverse_iterator w = ord_words.rbegin(); w
				!= ord_words.rend(); ++w) {
			occur = w->first;
			printf("%s\n", w->second.c_str());
		}

		if (settings.full_out) {
			freopen((settings.in_file_name + ".occur.txt").c_str(), "w+t",
					stdout);
			for (multimap<int, string_t>::reverse_iterator w =
					ord_words.rbegin(); w != ord_words.rend(); ++w) {
				word = w->second;
				occur = w->first;
				printf("%s\t%d\t%.3lf%%\n", word.c_str(), occur,
						occur * 100.0 / total_words);
			}

			freopen((settings.in_file_name + ".alphabet.txt").c_str(), "w+t",
					stdout);
			for (map<string_t, int>::iterator w = alph_words.begin(); w
					!= alph_words.end(); ++w) {
				word = w->first;
				occur = w->second;
				printf("%s\t%d\t%.3lf%%\n", word.c_str(), occur,
						occur * 100.0 / total_words);
			}

			print_wordform();
		}
	}
}