コード例 #1
0
ファイル: main.cpp プロジェクト: f1r3w1nd/EM
//Reads the corpus file, the output folder, the minimum and the maximum number of clusters and runs the EM algorithm.
int main(int argc, char **argv) {

  const char* info = "printinfo";
  if(strcmp(argv[2],info) == 0){
     Corpus *c = new Corpus(argv[1]);
     cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl;
     cout << "Total Terms = " << c->terms << endl;
     cout << "Total Articles = " << c->size() << endl;
     double avg = (double)c->terms/(double)c->size();
     cout << "avg = " << avg << endl;
     std::tr1::unordered_map<string,int>::iterator it;
     string outfile = "Vocabulary.txt";
     ofstream out;
     out.open(outfile.c_str());
     for(it=c->id2word.begin(); it != c->id2word.end(); it++){
       if(c->df[it->second] > 3){
	out << it->first << endl;
       }
    }
    out.close();
    return 0;
  }
  long pi = 3.141592653589793;
  if(argc < 6)
    cout << "Usage: ./em Cropus_File Output_Folder min_number_of_clusters max_number_of_clusters max_em_iterations" << endl;
  int key=15;
  long double likelihood=0.0,L=0;
  
  Corpus *c = new Corpus(argv[1]);
  int minC = atoi(argv[3]);
  int maxC = atoi(argv[4]);
  int MaxIter = atoi(argv[5]);
  long double likelihoods[maxC+1];
  cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl;
//OMPED Iterations in order to accelerate the process
#pragma omp parallel for
  for(unsigned j=minC; j <= maxC; j++){
    EM *em = new EM(j,c,MaxIter,string(argv[2]));
    likelihoods[j] = em->run();
    em->~EM();
  }
  
  string outfile = string(argv[2])+"/likelihoods.txt";
  ofstream out;
  out.open(outfile.c_str());
  for(unsigned i = minC; i <= maxC; i++){
    double d = (i*(c->vocsize-1))+(i-1);
    long double penalty = (d/2.0)*log2(c->terms);
    long double dr = ((d/2.0)*(2*pi));
    long double bic = -likelihoods[i] + penalty;
    cout << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl;
    out << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl;
  }
  out.close();
  return 0;
}
コード例 #2
0
ファイル: tool.cpp プロジェクト: ivanzamanov/mini-crf
// Replaces the id of every element in every label and input sequence of
// `corp` with the value `alph.new_id` returns for the current id.
// Sequences are visited in index order; for each index the labels are
// rewritten before the inputs.
void remap(PhonemeAlphabet& alph, Corpus& corp) {
  unsigned index = 0;
  while(index < corp.size()) {
    for(auto& phoneme : corp.label(index)) {
      phoneme.id = alph.new_id(phoneme.id);
    }

    for(auto& phoneme : corp.input(index)) {
      phoneme.id = alph.new_id(phoneme.id);
    }

    ++index;
  }
}
コード例 #3
0
ファイル: Printer.hpp プロジェクト: emjotde/bleu-champ
 static void Print(const Rung& r, const Corpus& source, const Corpus& target,
                   const PrintParams& params) {
   if(r.i == source.size() && r.j == target.size())
     return;
   
   if(r.score < params.printThreshold)
     return;
   if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1))
     return;
   if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0))
     return;
 
   const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1);
   const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1);
   
   if(params.printIds)    std::cout << r.i << " " << r.j << "\t";
   if(params.printBeads)  std::cout << r.bead << "\t";
   if(params.printScores) std::cout << r.score <<  "\t";
   
   std::cout << s1 << "\t" << s2 << std::endl;
 }
コード例 #4
0
ファイル: tool.cpp プロジェクト: ivanzamanov/mini-crf
  bool init_tool(int argc, const char** argv, Options* opts) {
    *opts = Options::parse_options(argc, argv);
    if(!Options::has_required(*opts))
      return false;
    COLOR_ENABLED = !opts->has_opt("no-color");
    FORCE_SCALE = opts->has_opt("force-scale");
    SMOOTH = opts->has_opt("smooth");
    SCALE_ENERGY = opts->has_opt("energy");
    PRINT_SCALE = opts->has_opt("print-scale");
    REPORT_PROGRESS = opts->has_opt("progress");

    VLOG = std::ofstream(opts->get_opt<std::string>("vlog", "vlog.log"));

    crf.label_alphabet = &alphabet_synth;
    baseline_crf.label_alphabet = &alphabet_synth;
    build_data(*opts);

    pre_process(alphabet_synth, corpus_synth);
    pre_process(alphabet_test, corpus_test);
  
    alphabet_synth.optimize();
    remap(alphabet_synth, corpus_synth);

    alphabet_test.optimize();
    remap(alphabet_test, corpus_test);

    auto testSize = opts->get_opt<unsigned>("test-corpus-size", 10);
    for(auto i = testSize; i < corpus_test.size(); i++)
      corpus_eval.add(corpus_test.input(i), corpus_test.label(i));
    corpus_test.set_max_size(testSize);

    INFO("Synth sequences = " << corpus_synth.size());
    INFO("Test sequences = " << corpus_test.size());
    INFO("Eval sequences = " << corpus_eval.size());
    return true;
  }