//Reads the corpus file, the output folder, the minimum and the maximum number of clusters and runs the EM algorithm. int main(int argc, char **argv) { const char* info = "printinfo"; if(strcmp(argv[2],info) == 0){ Corpus *c = new Corpus(argv[1]); cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl; cout << "Total Terms = " << c->terms << endl; cout << "Total Articles = " << c->size() << endl; double avg = (double)c->terms/(double)c->size(); cout << "avg = " << avg << endl; std::tr1::unordered_map<string,int>::iterator it; string outfile = "Vocabulary.txt"; ofstream out; out.open(outfile.c_str()); for(it=c->id2word.begin(); it != c->id2word.end(); it++){ if(c->df[it->second] > 3){ out << it->first << endl; } } out.close(); return 0; } long pi = 3.141592653589793; if(argc < 6) cout << "Usage: ./em Cropus_File Output_Folder min_number_of_clusters max_number_of_clusters max_em_iterations" << endl; int key=15; long double likelihood=0.0,L=0; Corpus *c = new Corpus(argv[1]); int minC = atoi(argv[3]); int maxC = atoi(argv[4]); int MaxIter = atoi(argv[5]); long double likelihoods[maxC+1]; cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl; //OMPED Iterations in order to accelerate the process #pragma omp parallel for for(unsigned j=minC; j <= maxC; j++){ EM *em = new EM(j,c,MaxIter,string(argv[2])); likelihoods[j] = em->run(); em->~EM(); } string outfile = string(argv[2])+"/likelihoods.txt"; ofstream out; out.open(outfile.c_str()); for(unsigned i = minC; i <= maxC; i++){ double d = (i*(c->vocsize-1))+(i-1); long double penalty = (d/2.0)*log2(c->terms); long double dr = ((d/2.0)*(2*pi)); long double bic = -likelihoods[i] + penalty; cout << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl; out << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl; } out.close(); return 0; }
// Re-index every phoneme id in the corpus through the alphabet's
// compacted id mapping (both the label and the input sequences).
void remap(PhonemeAlphabet& alph, Corpus& corp) {
    for(unsigned idx = 0; idx < corp.size(); idx++) {
        for(auto& phoneme : corp.label(idx))
            phoneme.id = alph.new_id(phoneme.id);
        for(auto& phoneme : corp.input(idx))
            phoneme.id = alph.new_id(phoneme.id);
    }
}
static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) { if(r.i == source.size() && r.j == target.size()) return; if(r.score < params.printThreshold) return; if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1)) return; if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0)) return; const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1); const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1); if(params.printIds) std::cout << r.i << " " << r.j << "\t"; if(params.printBeads) std::cout << r.bead << "\t"; if(params.printScores) std::cout << r.score << "\t"; std::cout << s1 << "\t" << s2 << std::endl; }
bool init_tool(int argc, const char** argv, Options* opts) { *opts = Options::parse_options(argc, argv); if(!Options::has_required(*opts)) return false; COLOR_ENABLED = !opts->has_opt("no-color"); FORCE_SCALE = opts->has_opt("force-scale"); SMOOTH = opts->has_opt("smooth"); SCALE_ENERGY = opts->has_opt("energy"); PRINT_SCALE = opts->has_opt("print-scale"); REPORT_PROGRESS = opts->has_opt("progress"); VLOG = std::ofstream(opts->get_opt<std::string>("vlog", "vlog.log")); crf.label_alphabet = &alphabet_synth; baseline_crf.label_alphabet = &alphabet_synth; build_data(*opts); pre_process(alphabet_synth, corpus_synth); pre_process(alphabet_test, corpus_test); alphabet_synth.optimize(); remap(alphabet_synth, corpus_synth); alphabet_test.optimize(); remap(alphabet_test, corpus_test); auto testSize = opts->get_opt<unsigned>("test-corpus-size", 10); for(auto i = testSize; i < corpus_test.size(); i++) corpus_eval.add(corpus_test.input(i), corpus_test.label(i)); corpus_test.set_max_size(testSize); INFO("Synth sequences = " << corpus_synth.size()); INFO("Test sequences = " << corpus_test.size()); INFO("Eval sequences = " << corpus_eval.size()); return true; }