コード例 #1
0
ファイル: sparseLDA.cpp プロジェクト: rforge/rtm
  /**
   * (Re)builds the per-word storage layout for K topics.
   *
   * For every word ii in [0, corpus.getV()) the word gets
   * min(corpus.getWordCount(ii), K) consecutive slots in data_:
   * lengths_[ii] holds the slot count and indices_[ii] the offset of the
   * first slot. All slots are zeroed. Finally M_ is set to ceil(log2(K)) and
   * mask_ to a value with the low M_ bits set.
   *
   * An empty vocabulary (getV() == 0) is handled: the tables are left empty.
   *
   * NOTE(review): K == 0 makes log2(K) = -inf; callers presumably always pass
   * K >= 1 -- confirm.
   *
   * @param K      number of topics
   * @param corpus source of the vocabulary size and per-word counts
   */
  void load(unsigned int K, const Corpus& corpus) {
		K_ = K;
		V_ = corpus.getV();

		indices_.resize(V_);
		lengths_.resize(V_);

		// Running prefix sum: each word starts where the previous one ended.
		// Folding word 0 into the loop avoids touching element 0 of an empty
		// table when the vocabulary has no words.
		int total = 0;
		for (unsigned int ii = 0; ii < V_; ++ii) {
			lengths_[ii] = min(corpus.getWordCount(ii), K);
			indices_[ii] = (ii == 0) ? 0 : indices_[ii - 1] + lengths_[ii - 1];
			total += lengths_[ii];
		}

		data_.resize(total);
		for (int ii = 0; ii < total; ++ii) {
			data_[ii] = 0;
		}

		// Set up M_ and mask_
		M_ = ceil(log2(K));
		mask_ = (1L << (M_)) - 1;
	}
コード例 #2
0
ファイル: lda_app.cpp プロジェクト: lijiankou/bigdata
void App() {
  long t1;
  (void) time(&t1);
  seedMT(t1);
  float em_converged = 1e-4;
  int em_max_iter = 20;
  int em_estimate_alpha = 1; //1 indicate estimate alpha and 0 use given value
  int var_max_iter = 30;
  double var_converged = 1e-6;
  double initial_alpha = 0.1;
  int n_topic = 30;
  LDA lda;
  lda.Init(em_converged, em_max_iter, em_estimate_alpha, var_max_iter,
                         var_converged, initial_alpha, n_topic);
  Corpus cor;
  //Str data = "../../data/ap.dat";
  Str data = "lda_data";
  cor.LoadData(data);
  Corpus train;
  Corpus test;
  double p = 0.8;
  SplitData(cor, p, &train, &test);
  Str type = "seeded";
  LdaModel m;
  lda.RunEM(type, train, test, &m);

  LOG(INFO) << m.alpha;
  VVReal gamma;
  VVVReal phi;
  lda.Infer(test, m, &gamma, &phi);
  WriteStrToFile(Join(gamma, " ", "\n"), "gamma");
  WriteStrToFile(Join(phi, " ", "\n", "\n\n"), "phi");
}
コード例 #3
0
/**
 * Computes the hit rate of one attribute against another.
 *
 * Walks every token of every sentence and counts the tokens for which the
 * value stored under atributo_padrao equals the value stored under
 * atributo_teste.
 *
 * @param corpus           corpus to evaluate
 * @param atributo_padrao  index of the reference attribute
 * @param atributo_teste   index of the attribute being evaluated
 * @return a one-element vector holding hits / total tokens; 0 when the corpus
 *         contains no tokens (instead of the NaN that 0/0 would produce)
 */
vector<float> AvaliadorAcuracia::calcularDesempenho( Corpus &corpus, int atributo_padrao, int atributo_teste )
{
    int acertos = 0, totalTokens = 0;
    const int qtdSentencas = corpus.pegarQtdSentencas();

    // Plain int counters: the `register` specifier is no longer valid in C++17.
    for ( int i = 0; i < qtdSentencas; ++i )
    {
        const int qtdTokens = corpus.pegarQtdTokens( i );

        for ( int j = 0; j < qtdTokens; ++j )
        {
            if ( corpus.pegarValor(i,j,atributo_padrao) == corpus.pegarValor(i,j,atributo_teste) )
                ++acertos;

            ++totalTokens;
        }
    }

    vector<float> vectorAcuracia;
    vectorAcuracia.push_back( totalTokens > 0 ? (float)acertos / totalTokens : 0.0f );

    return vectorAcuracia;
}
コード例 #4
0
ファイル: Corpus.cpp プロジェクト: estnltk/pfe
/// Builds an OrderedCover for a whole corpus: each (name, document) entry of
/// `corpus` is mapped to the result of fullOrderedDocCoverFromDoc() on the
/// document, keyed by the same name.
OrderedCover fullOrderedCoverFromCorpus(Corpus const& corpus) {
    std::map<std::string, OrderedDocCover> coversByDoc;
    for (auto const& entry : corpus) {
        // Entries are inserted with an "end" hint, in corpus iteration order.
        coversByDoc.insert(
            coversByDoc.end(),
            {entry.first, fullOrderedDocCoverFromDoc(entry.second)});
    }
    return OrderedCover(coversByDoc);
}
コード例 #5
0
ファイル: Corpus.cpp プロジェクト: estnltk/pfe
void writeCorpusToStream(std::ostream& os, Corpus const& corpus) {
    for (auto i=corpus.begin() ; i!=corpus.end() ; ++i) {
        if (i!=corpus.begin()) {
            os << std::endl;
        }
        os << i->first << std::endl;
        writeDocToStream(os, i->second, i->first);
    }
}
コード例 #6
0
ファイル: main.cpp プロジェクト: f1r3w1nd/EM
//Reads the corpus file, the output folder, the minimum and the maximum number of clusters and runs the EM algorithm.
int main(int argc, char **argv) {

  const char* info = "printinfo";
  if(strcmp(argv[2],info) == 0){
     Corpus *c = new Corpus(argv[1]);
     cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl;
     cout << "Total Terms = " << c->terms << endl;
     cout << "Total Articles = " << c->size() << endl;
     double avg = (double)c->terms/(double)c->size();
     cout << "avg = " << avg << endl;
     std::tr1::unordered_map<string,int>::iterator it;
     string outfile = "Vocabulary.txt";
     ofstream out;
     out.open(outfile.c_str());
     for(it=c->id2word.begin(); it != c->id2word.end(); it++){
       if(c->df[it->second] > 3){
	out << it->first << endl;
       }
    }
    out.close();
    return 0;
  }
  long pi = 3.141592653589793;
  if(argc < 6)
    cout << "Usage: ./em Cropus_File Output_Folder min_number_of_clusters max_number_of_clusters max_em_iterations" << endl;
  int key=15;
  long double likelihood=0.0,L=0;
  
  Corpus *c = new Corpus(argv[1]);
  int minC = atoi(argv[3]);
  int maxC = atoi(argv[4]);
  int MaxIter = atoi(argv[5]);
  long double likelihoods[maxC+1];
  cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl;
//OMPED Iterations in order to accelerate the process
#pragma omp parallel for
  for(unsigned j=minC; j <= maxC; j++){
    EM *em = new EM(j,c,MaxIter,string(argv[2]));
    likelihoods[j] = em->run();
    em->~EM();
  }
  
  string outfile = string(argv[2])+"/likelihoods.txt";
  ofstream out;
  out.open(outfile.c_str());
  for(unsigned i = minC; i <= maxC; i++){
    double d = (i*(c->vocsize-1))+(i-1);
    long double penalty = (d/2.0)*log2(c->terms);
    long double dr = ((d/2.0)*(2*pi));
    long double bic = -likelihoods[i] + penalty;
    cout << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl;
    out << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl;
  }
  out.close();
  return 0;
}
コード例 #7
0
ファイル: read_corpus.cpp プロジェクト: arunchaganty/ctm-cvb
int main()
{
    Corpus corpus = Corpus::construct( TEST_DATA );

    corpus.write( TMP_DATA );

    // Assert files are equal
    assert( file_equal( TEST_DATA, TMP_DATA ) );

    return 0;
}
コード例 #8
0
ファイル: tool.cpp プロジェクト: ivanzamanov/mini-crf
// Rewrites the id of every label element and every input element of every
// sequence in `corp` to the value alph.new_id() returns for the current id.
// Labels of a sequence are processed before its inputs.
void remap(PhonemeAlphabet& alph, Corpus& corp) {
  for(unsigned seq = 0; seq < corp.size(); ++seq) {
    for(auto& phoneme : corp.label(seq))
      phoneme.id = alph.new_id(phoneme.id);

    for(auto& phoneme : corp.input(seq))
      phoneme.id = alph.new_id(phoneme.id);
  }
}
コード例 #9
0
ファイル: Corpus.cpp プロジェクト: estnltk/pfe
/// Returns a new Corpus holding, for each id in `docIds`, the document that
/// `corpus` stores under that id.
///
/// @throws std::runtime_error (ERR_DOCUMENT_NOT_FOUND) as soon as an id is
///         not present in `corpus`.
Corpus
corpusSample(Corpus const& corpus, std::vector<std::string> const& docIds)
throw(std::runtime_error) {
    Corpus picked;
    for (auto const& docId : docIds) {
        auto const found = corpus.find(docId);
        if (found == corpus.end()) {
            throw std::runtime_error(ERR_DOCUMENT_NOT_FOUND);
        }
        picked[docId] = found->second;
    }
    return picked;
}
コード例 #10
0
ファイル: tool.cpp プロジェクト: ivanzamanov/mini-crf
namespace tool {
  Corpus corpus_synth, corpus_test, corpus_eval;

  CRF crf;
  BaselineCRF baseline_crf;
  PhonemeAlphabet alphabet_synth, alphabet_test;

  StringLabelProvider labels_synth;
  StringLabelProvider labels_test;
  StringLabelProvider labels_all;

  std::ofstream VLOG;

  bool init_tool(int argc, const char** argv, Options* opts) {
    *opts = Options::parse_options(argc, argv);
    if(!Options::has_required(*opts))
      return false;
    COLOR_ENABLED = !opts->has_opt("no-color");
    FORCE_SCALE = opts->has_opt("force-scale");
    SMOOTH = opts->has_opt("smooth");
    SCALE_ENERGY = opts->has_opt("energy");
    PRINT_SCALE = opts->has_opt("print-scale");
    REPORT_PROGRESS = opts->has_opt("progress");

    VLOG = std::ofstream(opts->get_opt<std::string>("vlog", "vlog.log"));

    crf.label_alphabet = &alphabet_synth;
    baseline_crf.label_alphabet = &alphabet_synth;
    build_data(*opts);

    pre_process(alphabet_synth, corpus_synth);
    pre_process(alphabet_test, corpus_test);
  
    alphabet_synth.optimize();
    remap(alphabet_synth, corpus_synth);

    alphabet_test.optimize();
    remap(alphabet_test, corpus_test);

    auto testSize = opts->get_opt<unsigned>("test-corpus-size", 10);
    for(auto i = testSize; i < corpus_test.size(); i++)
      corpus_eval.add(corpus_test.input(i), corpus_test.label(i));
    corpus_test.set_max_size(testSize);

    INFO("Synth sequences = " << corpus_synth.size());
    INFO("Test sequences = " << corpus_test.size());
    INFO("Eval sequences = " << corpus_eval.size());
    return true;
  }
}
コード例 #11
0
ファイル: Corpus.cpp プロジェクト: estnltk/pfe
/// Reads documents from `is` with readDocFromStream() until the stream
/// reports eof or failure, storing each one in the returned Corpus under the
/// name readDocFromStream() produced for it.
///
/// The document returned by the read that hit eof/failure is stored as well,
/// but only if its name is not already present in the corpus.
Corpus readCorpusFromStream(std::istream& is) {
    Corpus corpus;
    std::string docName;

    Document doc = readDocFromStream(is, docName);
    while (!(is.eof() || is.fail())) {
        corpus[docName] = doc;
        doc = readDocFromStream(is, docName);
    }

    // Keep the final document unless an earlier one already uses its name.
    const bool alreadyStored = corpus.find(docName) != corpus.end();
    if (!alreadyStored) {
        corpus[docName] = doc;
    }
    return corpus;
}
コード例 #12
0
/**
 * Builds the confusion matrix of one attribute against another.
 *
 * For every token, the value under atributo_padrao (the truth) and the value
 * under atributo_teste (the answer) are looked up in `classes`; the cell
 * [truth index * number of classes + answer index] of the flattened matrix is
 * then incremented. When a class name occurs more than once in `classes`, the
 * last occurrence is used.
 *
 * The result is also stored in `ultimaMatriz`.
 *
 * @param corpus           corpus to evaluate
 * @param atributo_padrao  index of the reference attribute
 * @param atributo_teste   index of the attribute being evaluated
 * @return the flattened matrix, classes.size() * classes.size() entries
 * @throws const char* when a truth or answer value is not found in `classes`
 */
vector<float> AvaliadorMatrizConfusao::calcularDesempenho( Corpus &corpus, int atributo_padrao, int atributo_teste )
{
    const int row = corpus.pegarQtdSentencas();
    const int numeroClasses = classes.size();
    const int tam = numeroClasses*numeroClasses;
    int column, posVerdadeiro, posResposta;
    string resposta, verdade;

    // Sized and zero-filled in one step.
    vector<float> vectorMatriz(tam, 0.0f);

    // Plain int counters: the `register` specifier is no longer valid in C++17.
    for ( int i = 0; i < row; ++i )
    {
        column = corpus.pegarQtdTokens( i );

        for ( int j = 0; j < column; ++j )
        {
            verdade = corpus(i,j,atributo_padrao);
            resposta = corpus(i,j,atributo_teste);

            // Locate both values in the class list; -1 means "not found".
            posVerdadeiro = posResposta = -1;
            for ( int c = 0; c < numeroClasses; ++c ){
                if (classes[c]==verdade)
                    posVerdadeiro = c;
                if (classes[c]==resposta)
                    posResposta = c;
            }
            if (posVerdadeiro == -1 || posResposta == -1 ){
                cout << "Classe não encontrada, uma exceção será gerada.";
                throw "Classe não encontrada pela matriz de confusão";
            }

            vectorMatriz[posVerdadeiro*numeroClasses+posResposta]++;
        }
    }

    ultimaMatriz = vectorMatriz;
    return vectorMatriz;
}
コード例 #13
0
// For each of the `numatributos` entries of `atributo`, asks the corpus to
// create an attribute named "New" + <entry>, passing "0" as the second
// argument (presumably the initial value -- TODO confirm against Corpus).
void ProcessadorAttDisc::criarAtributos(Corpus &objCorpus){

    int idx = 0;
    while (idx < numatributos)
    {
        objCorpus.criarAtributo("New"+atributo[idx],"0");
        ++idx;
    }

}
コード例 #14
0
ファイル: main.cpp プロジェクト: aykutfirat/MedSTC-Mac
// Entry point: trains a MedSTC model and then evaluates it.
//
// Positional arguments as read below:
//   argv[1] number of topics      argv[2] initial C
//   argv[3] lambda                argv[4] rho
//   argv[5] settings file
//   argv[6] number of folds (optional; set to 1 when omitted)
//   argv[7] delta ell       (optional)
//
// NOTE(review): the usage text printed below does not describe these
// arguments; it is kept verbatim -- confirm the intended wording.
int main(int argc, char* argv[])
{
	seedMT( time(NULL) );

	// argv[1]..argv[5] are read unconditionally, so at least six entries
	// are required; with fewer, fall through to the usage text.
	if ( argc > 5 )
	{
		Corpus* c = new Corpus();
		Params param;

		param.read_settings( argv[5] ); //"settings.txt");
		param.NTOPICS    = atoi(argv[1]);
		param.INITIAL_C  = atof(argv[2]);
		param.LAMBDA     = atof(argv[3]);
		param.RHO        = atof(argv[4]);
		param.NFOLDS	 = 1;
		if ( argc > 6 ) param.NFOLDS     = atoi(argv[6]);
		if ( argc > 7 ) param.DELTA_ELL  = atof(argv[7]);

		c->read_data(param.train_filename, param.NLABELS);

		// Output directory name is derived from the run parameters.
		char dir[512];
		snprintf(dir, sizeof(dir), "s%d_c%d_f%d_s%d", param.NTOPICS, (int)param.INITIAL_C,
			param.NFOLDS, param.SUPERVISED);
		mkdir(dir,0755);

		MedSTC model;
		model.train("random", dir, c, &param);

		// testing.
		Corpus *tstC = new Corpus();
		tstC->read_data(param.test_filename, param.NLABELS);
		MedSTC evlModel;
		double dAcc = evlModel.sparse_coding(dir, tstC, &param);
		printf("Accuracy: %.3f\n", dAcc);

		delete tstC;
		delete c;
	} else {
		printf("usage : MedSTC est [initial alpha] [k] [labels] [random/seeded/*] [directory]\n");
		printf("        MedSTC cv [foldnum] [foldix] [initial alpha] [k] [labels] [settings] [data] [random/seeded/*] [directory]\n");
		printf("        MedSTC inf [settings] [model] [data] [name]\n");
	}

	return 0;
}
コード例 #15
0
ファイル: estimate.cpp プロジェクト: lijiankou/mllib-1
void MGRTMApp() {
  ml::Converged converged;
  converged.em_converged_ = 1e-4;
  converged.em_max_iter_ = 100;
  converged.var_converged_ = 1e-4;
  converged.var_max_iter_ = 10;
  int rho = 3;
           
  VarMGRTM var;
  var.Init(converged,rho);
  var.Load(FLAGS_net_path, FLAGS_cor_path, FLAGS_neg_times);
                    
  Str path(FLAGS_cor_path);
  Corpus cor;
  cor.LoadData(path);
                           
  MGRTM m;
  m.Init(2, FLAGS_local_topic, FLAGS_global_topic, cor.TermNum(), 1, 0.01, 0.01);
  var.RunEM(&m);
}
コード例 #16
0
ファイル: tbl-evaluate.cpp プロジェクト: danieldk/simpletbl
// Flattens a corpus into a single sequence: the words of every sentence,
// appended in sentence order.
vector<TaggedWord> corpusToTaggedWords(Corpus<TaggedWord> const &corpus)
{
	vector<TaggedWord> flat;

	vector<Sentence<TaggedWord> > sentences = corpus.sentences();
	vector<Sentence<TaggedWord> >::const_iterator sentence = sentences.begin();
	while (sentence != sentences.end())
	{
		// Append this sentence's words to the end of the running sequence.
		flat.insert(flat.end(),
			sentence->words().begin(), sentence->words().end());
		++sentence;
	}

	return flat;
}
コード例 #17
0
ファイル: Printer.hpp プロジェクト: emjotde/bleu-champ
 static void Print(const Rung& r, const Corpus& source, const Corpus& target,
                   const PrintParams& params) {
   if(r.i == source.size() && r.j == target.size())
     return;
   
   if(r.score < params.printThreshold)
     return;
   if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1))
     return;
   if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0))
     return;
 
   const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1);
   const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1);
   
   if(params.printIds)    std::cout << r.i << " " << r.j << "\t";
   if(params.printBeads)  std::cout << r.bead << "\t";
   if(params.printScores) std::cout << r.score <<  "\t";
   
   std::cout << s1 << "\t" << s2 << std::endl;
 }
コード例 #18
0
ファイル: estimate.cpp プロジェクト: lijiankou/mllib-1
void LdaApp() {
  long t1;
  (void) time(&t1);
  seedMT(t1);

  float em_converged = 1e-4;
  int em_max_iter = FLAGS_em_iterate;
  int em_estimate_alpha = 1; //1 indicate estimate alpha and 0 use given value
  int var_max_iter = FLAGS_var_iterate;
  double var_converged = 1e-6;
  double initial_alpha = FLAGS_alpha;
  int topic = FLAGS_topic_num;

  Corpus train;
  Corpus test;
  train.LoadData(FLAGS_cor_train);
  test.LoadData(FLAGS_cor_test);
  LOG(INFO) << train.Len()<< " " << test.Len();

  LdaModel m;
  LDA lda;
  lda.Init(em_converged, em_max_iter, em_estimate_alpha, var_max_iter,
                         var_converged, initial_alpha, topic);
  Str type = "seeded";
  lda.RunEM(type, train, test, &m);

  VVReal gamma;
  VVVReal phi;
  lda.Infer(test, m, &gamma, &phi);
  WriteStrToFile(Join(gamma, " ", "\n"), "./model/gamma");
  WriteStrToFile(Join(m.log_prob_w, topic, train.num_terms), "./model/beta");
  WriteStrToFile(Join(phi, " ", "\n", "\n\n"), "./model/phi");
}
コード例 #19
0
// Classifies `corpusProva` by weighted vote of the stored classifiers.
//
// Each classifier t is run on the corpus in turn (overwriting `atributo`);
// the value it assigned to every example then receives -log(betas[t]) votes.
// A value that matches none of the first valores.size()-1 entries of
// `valores` is counted under the last entry. Finally every example gets the
// value with the most votes written into `atributo`; on ties the highest
// index among the non-last entries wins, and the last entry is chosen only
// when it strictly beats all the others.
//
// Always returns true.
bool ClassificadorAdaboostM1::executarClassificacao( Corpus &corpusProva, int atributo ) {
    // votos[example][value index], examples numbered in corpus order.
    vector<vector<double> > votos(corpusProva.pegarQtdTotalExemplos(), vector<double>(valores.size(), 0.0));

    for (unsigned int t = 0; t < classificadores.size(); t++) {
        (classificadores[t])->executarClassificacao(corpusProva, atributo);

        int exemplo = 0;
        for (int i = 0; i < corpusProva.pegarQtdSentencas(); i++) {
            for (int j = 0; j < corpusProva.pegarQtdExemplos(i); j++) {
                // Default to the last slot; switch to the first matching one.
                int indice = valores.size() - 1;
                for (unsigned int a = 0; a < valores.size() - 1; a++) {
                    if (valores[a] == corpusProva(i, j, atributo)) {
                        indice = a;
                        break;
                    }
                }
                votos[exemplo][indice] -= log(betas[t]);
                exemplo++;
            }
        }
    }

    int exemplo = 0;
    for (int i = 0; i < corpusProva.pegarQtdSentencas(); i++) {
        for (int j = 0; j < corpusProva.pegarQtdExemplos(i); j++) {
            const vector<double> &linha = votos[exemplo];

            // Start from the last slot and let any slot that is at least as
            // good replace it.
            int melhor = valores.size() - 1;
            double melhorVoto = linha[valores.size()-1];
            for (unsigned int a = 0; a < valores.size() - 1; a++) {
                if (melhorVoto <= linha[a]) {
                    melhorVoto = linha[a];
                    melhor = a;
                }
            }
            corpusProva(i, j, atributo, valores[melhor]);
            exemplo++;
        }
    }
    return true;
}
コード例 #20
0
ファイル: lda_var_em.cpp プロジェクト: rpdodo/mllib
// Computes the likelihood term for document `d` of `cor` from the model `m`
// and the document's variational parameters `gamma` (one entry per topic)
// and `phi` (indexed [word position][topic]).
//
// Entries of phi that are not greater than zero contribute nothing, which
// also keeps log(phi) away from zero.
double LDA::Likelihood(const Corpus &cor, int d, const LdaModel &m, VRealC &gamma,VVRealC &phi) const {
  const int &topics = m.num_topics;
  const double alpha_total = double_array_sum(m.alpha,m.num_topics);
  const double gamma_total = std::accumulate(gamma.begin(), gamma.end(), 0.0);
  const double digamma_total = DiGamma(gamma_total);

  // expect[k] = DiGamma(gamma[k]) - DiGamma(sum of gamma)
  VReal expect(topics);
  for (int k = 0; k < topics; ++k) {
    expect.at(k) = DiGamma(gamma.at(k)) - digamma_total;
  }

  double bound = lgamma(alpha_total) - lgamma(gamma_total);

  for (int k = 0; k < topics; ++k) {
    // Per-topic term.
    bound += ((m.alpha[k] - gamma.at(k)) * expect[k] + lgamma(gamma.at(k)) - lgamma(m.alpha[k]));

    // Per-word terms for this topic.
    for (size_t n = 0; n < cor.ULen(d); ++n) {
      if (!(phi[n][k] > 0)) {
        continue;
      }
      bound += cor.Count(d, n) * phi[n][k] * (expect[k] - log(phi[n][k]) + m.log_prob_w[k][cor.Word(d, n)]);
    }
  }
  return bound;
}
コード例 #21
0
// Scores a sampled edge tensor against the corpus.
//
// For every (a, d) that corpus.isObscured() reports as obscured and every
// r != a, sample[a][d][r] is compared with corpus.getEdge(a, d, r) and
// tallied as a true/false positive/negative. The counts, precision and
// recall are printed to cout; the return value is
// 2 * precision * recall / (precision + recall).
double fscore(vector<vector<vector<bool> > > sample, Corpus &corpus) {
	int tp = 0;
	int fp = 0;
	int tn = 0;
	int fn = 0;

	for (int a = 0; a < sample.size(); a++) {
		for (int d = 0; d < sample[a].size(); d++) {
			// Only obscured entries take part in the evaluation.
			if (!corpus.isObscured(a, d)) {
				continue;
			}

			for (int r = 0; r < sample[a][d].size(); r++) {
				if (a == r) {
					continue;
				}

				int trueEdge = corpus.getEdge(a, d, r);
				const bool predicted = sample[a][d][r];
				if (predicted && trueEdge) {
					tp++;
				} else if (predicted) {
					fp++;
				} else if (trueEdge) {
					fn++;
				} else {
					tn++;
				}
			}
		}
	}

	double precision = (1.0 * tp) / (tp + fp);
	double recall = (1.0 * tp) / (tp + fn);

	cout << "tp: " << tp << ", fp: " << fp << ", tn: " << tn << ", fn: " << fn << "\n";
	cout << "Precision: " << precision << "; Recall: " << recall << "\n";

	return 2 * precision * recall / (precision + recall);
}
コード例 #22
0
// Adds a 0/1 copy of every configured attribute to the corpus.
//
// The positions of the source attributes are resolved first, then
// criarAtributos() creates the "New<name>" attributes. For every example the
// symbol of each source attribute is read, and the matching new attribute is
// set to the index of "1" when that symbol is "VERDADEIRO" and to the index
// of "0" otherwise.
//
// The i-th new attribute is addressed as (position of "New"+atributo[0]) + i.
// NOTE(review): this assumes criarAtributos() yields consecutive positions
// -- confirm against Corpus::criarAtributo.
//
// Always returns true.
bool ProcessadorAttDisc::processarCorpus(Corpus &objCorpus)
{
    // Positions of the source attributes, released before returning.
    int *indices = new int[numatributos];
    for(int i=0; i<numatributos; i++)
    {
        indices[i] = objCorpus.pegarPosAtributo(atributo[i]);
    }

    criarAtributos(objCorpus);

    const int idCol = objCorpus.pegarPosAtributo("New"+atributo[0]);

    const int qtdConjExemplos = objCorpus.pegarQtdConjExemplos();
    for (int c=0; c<qtdConjExemplos; c++)
    {
        const int totlinhas = objCorpus.pegarQtdExemplos(c);

        for (int linha=0; linha < totlinhas; linha++)
        {
            for(int i=0; i<numatributos; i++)
            {
                int vatual = objCorpus.pegarValor(c,linha,indices[i]);
                string valor_atual = objCorpus.pegarSimbolo(vatual);

                // Same text the stream formatting of the 0/1 flag produced.
                const string flag(valor_atual == "VERDADEIRO" ? "1" : "0");
                objCorpus.ajustarValor(c, linha, idCol+i, objCorpus.pegarIndice(flag));
            }
        }
    }

    delete[] indices;
    return true;
}
コード例 #23
0
ファイル: lda_var_em.cpp プロジェクト: rpdodo/mllib
// Initialises the variational parameters for document `d`:
//   ga[k]      = alpha[k] + (document total / number of topics)
//   digamma[k] = DiGamma(ga[k])
//   phi[n][k]  = 1 / number of topics, for each of the cor.ULen(d) rows.
void LDA::InitVarParamter(const Corpus &cor, int d, const LdaModel &m, VReal* digamma,VReal* ga, VVReal* phi) const {
  const int topics = m.num_topics;

  ga->resize(topics);
  digamma->resize(topics);
  phi->resize(cor.ULen(d));

  // Every topic gets the same share of the document total.
  const double share = cor.docs[d].total / ((double) topics);
  for (int k = 0; k < topics; ++k) {
    (*ga)[k] = m.alpha[k] + share;
    (*digamma)[k] = DiGamma((*ga)[k]);
  }

  // Every row of phi starts out uniform over the topics.
  const double uniform = 1.0 / topics;
  for (VReal::size_type n = 0; n < phi->size(); ++n) {
    phi->at(n).assign(topics, uniform);
  }
}
コード例 #24
0
ファイル: digitize.cpp プロジェクト: wixor/wi
/* Splits [p, end) on single spaces and processes every non-empty word:
 * a word the corpus knows is turned into rawpost(term id, doc_id, term_pos)
 * and written to stdout as raw bytes, with term_pos incremented afterwards;
 * an unknown word (lookup returns -1) only produces a warning on stderr and
 * leaves term_pos untouched. */
static void run_line(const char *p, const char *end, const Corpus &corp)
{
    while(p < end)
    {
        /* skip leading spaces, then scan to the end of the word */
        while(p < end && *p == ' ') p++;
        const char *word = p;
        while(p < end && *p != ' ') p++;

        /* nothing but spaces were left */
        if(p == word)
            continue;

        int term_id = corp.lookup(word, p-word);
        if(term_id != -1)
        {
            uint64_t x = rawpost(term_id, doc_id, term_pos++);
            fwrite_unlocked(&x, sizeof(x), 1, stdout);
        }
        else
            fprintf(stderr, "WARNING: term not found: '%.*s'\n", (int)(p-word), word);
    }
}
コード例 #25
0
// Evaluates the objective on a fixed validation sample and decides whether
// training should continue.
//
// All bookkeeping lives in function-local statics, so it is set up on the
// first call only: the sample is drawn from the corpus passed to that first
// call, and both thresholds are derived from the _learning_rate in effect at
// that moment.
//
// Once more than MIN_ITERATION calls have been made:
//   - a NaN improvement rate terminates the process with exit(-1);
//   - a rate below HARD_THRESHOLD terminates it with exit(0);
//   - a rate below SOFT_THRESHOLD halves _learning_rate, and so does every
//     later call that does not terminate.
void dtw_model::validate(Corpus& corpus) {
  static const size_t MINIATURE_SIZE = 10000;
  static vector<tsample> samples = corpus.getSamples(MINIATURE_SIZE);
  static double objective = 0;
  static bool aboutToStop = false;
  static const double SOFT_THRESHOLD = 2e-6 * _learning_rate;
  static const double HARD_THRESHOLD = SOFT_THRESHOLD * 0.1;
  static size_t MIN_ITERATION = 128;
  static size_t itr = 0;

  double obj = calcObjective(samples);
  double diff = obj - objective;

  // Absolute value taken by hand: an unqualified abs() can resolve to the
  // integer overload and truncate the ratio.
  double improveRate = diff / objective;
  if (improveRate < 0)
    improveRate = -improveRate;

  printf("objective = %.7f \t prev-objective = %.7f \n", obj, objective);
  // size_t is cast to match the %lu conversion.
  printf("improvement rate on dev-set of size %lu = %.6e ", static_cast<unsigned long>(samples.size()), improveRate);
  // Spaces around the colour macros: since C++11 an identifier glued to a
  // string literal is parsed as a literal suffix.
  printf(", still " GREEN "%.0f" COLOREND " times of threshold \n", improveRate / SOFT_THRESHOLD);

  if (itr > MIN_ITERATION) {
    // NaN is the only value that differs from itself.
    if (improveRate != improveRate)
      exit(-1);

    if (improveRate < HARD_THRESHOLD) {
      printf("\nObjective function on dev-set is no longer decreasing...\n");
      printf("Training process " GREEN "DONE" COLOREND "\n");
      // doPause();
      exit(0);
    }
    else if (aboutToStop || improveRate < SOFT_THRESHOLD) {
      aboutToStop = true;
      _learning_rate /= 2;
    }
  }

  objective = obj;
  ++itr;
}
コード例 #26
0
ファイル: main.cpp プロジェクト: aacharya/DSLDA2
/*
* main
*/
// Entry point with three sub-commands selected by argv[1]:
//   estinf  train (optionally after inner cross-validation of C), then
//           evaluate on the test file and print the accuracy;
//   est     train only, into a directory whose name starts with argv[7];
//   inf     evaluate the model stored in argv[3] and print the accuracy.
// Any other first argument does nothing. Without arguments, or with fewer
// arguments than the chosen sub-command reads, the usage text is printed.
int main(int argc, char* argv[])
{
	seedMT( time(NULL) );
	// seedMT(4357U);

	// Number of argv entries (program name included) each sub-command reads.
	int needed = 0;
	if (argc > 1)
	{
		if ( strcmp(argv[1], "estinf") == 0 ) needed = 8;
		else if ( strcmp(argv[1], "est") == 0 ) needed = 9;
		else if ( strcmp(argv[1], "inf") == 0 ) needed = 4;
	}

	if (argc <= 1 || argc < needed)
	{
		printf("usage : MEDsLDAc estinf [k] [labels] [fold] [initial C] [l] [random/seeded/*]\n");
		printf("        MEDsLDAc est [k] [labels] [fold] [initial C] [l] [dir root] [random/seeded/*]\n");
		printf("        MEDsLDAc inf [labels] [model]\n");
		// No arguments at all is a plain help request; too few arguments
		// for a sub-command is an error.
		return (argc <= 1) ? 0 : 1;
	}

	Corpus* c = new Corpus();
	Params param;
	param.INNER_CV = true;
	if ( strcmp(argv[1], "estinf") == 0 ) {
		param.read_settings("settings.txt");
		param.NTOPICS = atoi(argv[2]);
		param.NLABELS = atoi(argv[3]);
		param.NFOLDS = atoi(argv[4]);
		param.INITIAL_C = atof(argv[5]);
		param.DELTA_ELL = atof(argv[6]);

		printf("K: %d, C: %.3f, Alpha: %d, svm: %d\n", param.NTOPICS,
			param.INITIAL_C, param.ESTIMATE_ALPHA, param.SVM_ALGTYPE);

		c->read_data(param.train_filename, param.NLABELS);
		char dir[512];
		snprintf(dir, sizeof(dir), "20ng%d_c%d_f%d", param.NTOPICS, (int)param.INITIAL_C, param.NFOLDS);
		make_directory(dir);

		if ( param.INNER_CV ) {
			c->shuffle();

			char modelDir[512];
			snprintf(modelDir, sizeof(modelDir), "%s/innercv", dir);
			make_directory(modelDir);

			param.INITIAL_C = innerCV(modelDir, c, &param);
			printf("\n\nBest C: %f\n", param.INITIAL_C);
		}
		MedLDA model;
		model.run_em(argv[7], dir, c, &param);

		// testing.
		Corpus *tstC = new Corpus();
		tstC->read_data(param.test_filename, param.NLABELS);
		MedLDA evlModel;
		double dAcc = evlModel.infer(dir, tstC, &param);
		printf("Accuracy: %.3f\n", dAcc);
		delete tstC;
	}
	if ( strcmp(argv[1], "est") == 0 ) {
		param.read_settings("settings.txt");
		param.NTOPICS = atoi(argv[2]);
		param.NLABELS = atoi(argv[3]);
		param.NFOLDS = atoi(argv[4]);
		param.INITIAL_C = atof(argv[5]);
		param.DELTA_ELL = atof(argv[6]);

		c->read_data(param.train_filename, param.NLABELS);
		char dir[512];
		// INITIAL_C is floating point; it must be converted before being
		// handed to %d, as the estinf branch already does.
		snprintf(dir, sizeof(dir), "%s%d_c%d_f%d", argv[7], param.NTOPICS, (int)param.INITIAL_C, param.NFOLDS);
		make_directory(dir);

		if ( param.INNER_CV ) {
			c->shuffle();

			char modelDir[512];
			snprintf(modelDir, sizeof(modelDir), "%s/innercv", dir);
			make_directory(modelDir);

			param.INITIAL_C = innerCV(modelDir, c, &param);
			printf("\n\nBest C: %f\n", param.INITIAL_C);
		}
		MedLDA model;
		model.run_em(argv[8], dir, c, &param);
	}
	if (strcmp(argv[1], "inf")==0)
	{
		param.read_settings("settings.txt");
		param.NLABELS = atoi(argv[2]);
		c->read_data(param.test_filename, param.NLABELS);
		MedLDA model;
		double dAcc = model.infer(argv[3], c, &param);
		printf("Accuracy: %.3f\n", dAcc);
	}

	delete c;
	return(0);
}