void load(unsigned int K, const Corpus& corpus) { // K_(K), V_(corpus.getV()) { K_ = K; V_ = corpus.getV(); indices_.resize(V_); lengths_.resize(V_); indices_[0] = 0; lengths_[0] = min(corpus.getWordCount(0), K); int total = lengths_[0]; for (unsigned int ii = 1; ii < V_; ++ii) { lengths_[ii] = min(corpus.getWordCount(ii), K); indices_[ii] = indices_[ii - 1] + lengths_[ii - 1]; total += lengths_[ii]; } data_.resize(total); for (unsigned int ii = 0; ii < total; ++ii) { data_[ii] = 0; } // Set up M_ and mask_ M_ = ceil(log2(K)); mask_ = (1L << (M_)) - 1; }
void App() { long t1; (void) time(&t1); seedMT(t1); float em_converged = 1e-4; int em_max_iter = 20; int em_estimate_alpha = 1; //1 indicate estimate alpha and 0 use given value int var_max_iter = 30; double var_converged = 1e-6; double initial_alpha = 0.1; int n_topic = 30; LDA lda; lda.Init(em_converged, em_max_iter, em_estimate_alpha, var_max_iter, var_converged, initial_alpha, n_topic); Corpus cor; //Str data = "../../data/ap.dat"; Str data = "lda_data"; cor.LoadData(data); Corpus train; Corpus test; double p = 0.8; SplitData(cor, p, &train, &test); Str type = "seeded"; LdaModel m; lda.RunEM(type, train, test, &m); LOG(INFO) << m.alpha; VVReal gamma; VVVReal phi; lda.Infer(test, m, &gamma, &phi); WriteStrToFile(Join(gamma, " ", "\n"), "gamma"); WriteStrToFile(Join(phi, " ", "\n", "\n\n"), "phi"); }
/**
 * Computes the hit rate for one attribute.
 *
 * Every token of every sentence is visited; a token counts as a hit when its
 * value for `atributo_padrao` equals its value for `atributo_teste`.
 *
 * @param corpus          corpus to evaluate
 * @param atributo_padrao index of the reference attribute
 * @param atributo_teste  index of the attribute being evaluated
 * @return a one-element vector holding hits / total tokens, or 0 when the
 *         corpus has no tokens (the original produced NaN from 0/0)
 */
vector<float> AvaliadorAcuracia::calcularDesempenho( Corpus &corpus, int atributo_padrao, int atributo_teste )
{
    int acertos = 0, totalTokens = 0;

    // `register` was dropped from the loop counters: it is ill-formed in C++17
    // and had no effect before.
    const int row = corpus.pegarQtdSentencas();
    for ( int i = 0; i < row; ++i )
    {
        const int column = corpus.pegarQtdTokens( i );
        for ( int j = 0; j < column; ++j )
        {
            if ( corpus.pegarValor(i,j,atributo_padrao) == corpus.pegarValor(i,j,atributo_teste) )
                ++acertos;
            ++totalTokens;
        }
    }

    vector<float> vectorAcuracia;
    vectorAcuracia.push_back( totalTokens > 0 ? (float)acertos / totalTokens : 0.0f );
    return vectorAcuracia;
}
/// Builds an OrderedCover holding, for every entry of `corpus`, the result of
/// fullOrderedDocCoverFromDoc() on that entry's document, keyed by the entry's
/// key.
OrderedCover fullOrderedCoverFromCorpus(Corpus const& corpus)
{
    std::map<std::string, OrderedDocCover> covers;
    for (auto const& entry : corpus) {
        // Insert with end() as the position hint, as the corpus is walked in order.
        covers.insert(covers.end(),
                      {entry.first, fullOrderedDocCoverFromDoc(entry.second)});
    }
    return OrderedCover(covers);
}
void writeCorpusToStream(std::ostream& os, Corpus const& corpus) { for (auto i=corpus.begin() ; i!=corpus.end() ; ++i) { if (i!=corpus.begin()) { os << std::endl; } os << i->first << std::endl; writeDocToStream(os, i->second, i->first); } }
//Reads the corpus file, the output folder, the minimum and the maximum number of clusters and runs the EM algorithm. int main(int argc, char **argv) { const char* info = "printinfo"; if(strcmp(argv[2],info) == 0){ Corpus *c = new Corpus(argv[1]); cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl; cout << "Total Terms = " << c->terms << endl; cout << "Total Articles = " << c->size() << endl; double avg = (double)c->terms/(double)c->size(); cout << "avg = " << avg << endl; std::tr1::unordered_map<string,int>::iterator it; string outfile = "Vocabulary.txt"; ofstream out; out.open(outfile.c_str()); for(it=c->id2word.begin(); it != c->id2word.end(); it++){ if(c->df[it->second] > 3){ out << it->first << endl; } } out.close(); return 0; } long pi = 3.141592653589793; if(argc < 6) cout << "Usage: ./em Cropus_File Output_Folder min_number_of_clusters max_number_of_clusters max_em_iterations" << endl; int key=15; long double likelihood=0.0,L=0; Corpus *c = new Corpus(argv[1]); int minC = atoi(argv[3]); int maxC = atoi(argv[4]); int MaxIter = atoi(argv[5]); long double likelihoods[maxC+1]; cout << "Corpus Loaded - Unique Terms = " << c->vocsize << endl; //OMPED Iterations in order to accelerate the process #pragma omp parallel for for(unsigned j=minC; j <= maxC; j++){ EM *em = new EM(j,c,MaxIter,string(argv[2])); likelihoods[j] = em->run(); em->~EM(); } string outfile = string(argv[2])+"/likelihoods.txt"; ofstream out; out.open(outfile.c_str()); for(unsigned i = minC; i <= maxC; i++){ double d = (i*(c->vocsize-1))+(i-1); long double penalty = (d/2.0)*log2(c->terms); long double dr = ((d/2.0)*(2*pi)); long double bic = -likelihoods[i] + penalty; cout << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl; out << i << " " << -likelihoods[i] << " " << penalty << " " << bic << endl; } out.close(); return 0; }
int main() { Corpus corpus = Corpus::construct( TEST_DATA ); corpus.write( TMP_DATA ); // Assert files are equal assert( file_equal( TEST_DATA, TMP_DATA ) ); return 0; }
/// Rewrites the `id` of every label element and every input element of each
/// sequence in `corp` to the value alph.new_id() returns for the current id.
/// Labels of a sequence are processed before its inputs.
void remap(PhonemeAlphabet& alph, Corpus& corp) {
  for(unsigned seq = 0; seq < corp.size(); ++seq) {
    for(auto& item : corp.label(seq))
      item.id = alph.new_id(item.id);

    for(auto& item : corp.input(seq))
      item.id = alph.new_id(item.id);
  }
}
/// Returns a corpus containing the documents of `corpus` stored under each of
/// the keys in `docIds`.
/// @throws std::runtime_error (message ERR_DOCUMENT_NOT_FOUND) as soon as a
///         requested key is not present in `corpus`.
Corpus corpusSample(Corpus const& corpus, std::vector<std::string> const& docIds) throw(std::runtime_error)
{
    Corpus picked;
    for (std::string const& docId : docIds) {
        auto const found = corpus.find(docId);
        if (found == corpus.end()) {
            throw std::runtime_error(ERR_DOCUMENT_NOT_FOUND);
        }
        picked[docId] = found->second;
    }
    return picked;
}
namespace tool { Corpus corpus_synth, corpus_test, corpus_eval; CRF crf; BaselineCRF baseline_crf; PhonemeAlphabet alphabet_synth, alphabet_test; StringLabelProvider labels_synth; StringLabelProvider labels_test; StringLabelProvider labels_all; std::ofstream VLOG; bool init_tool(int argc, const char** argv, Options* opts) { *opts = Options::parse_options(argc, argv); if(!Options::has_required(*opts)) return false; COLOR_ENABLED = !opts->has_opt("no-color"); FORCE_SCALE = opts->has_opt("force-scale"); SMOOTH = opts->has_opt("smooth"); SCALE_ENERGY = opts->has_opt("energy"); PRINT_SCALE = opts->has_opt("print-scale"); REPORT_PROGRESS = opts->has_opt("progress"); VLOG = std::ofstream(opts->get_opt<std::string>("vlog", "vlog.log")); crf.label_alphabet = &alphabet_synth; baseline_crf.label_alphabet = &alphabet_synth; build_data(*opts); pre_process(alphabet_synth, corpus_synth); pre_process(alphabet_test, corpus_test); alphabet_synth.optimize(); remap(alphabet_synth, corpus_synth); alphabet_test.optimize(); remap(alphabet_test, corpus_test); auto testSize = opts->get_opt<unsigned>("test-corpus-size", 10); for(auto i = testSize; i < corpus_test.size(); i++) corpus_eval.add(corpus_test.input(i), corpus_test.label(i)); corpus_test.set_max_size(testSize); INFO("Synth sequences = " << corpus_synth.size()); INFO("Test sequences = " << corpus_test.size()); INFO("Eval sequences = " << corpus_eval.size()); return true; } }
/// Reads documents from `is` with readDocFromStream() until the stream reports
/// eof or failure, storing each one under the name that call produced.
/// The document returned by the final read (the one after which the stream was
/// at eof or failed) is stored too, unless its name is already present.
Corpus readCorpusFromStream(std::istream& is)
{
    Corpus corpus;
    std::string docName;
    Document doc;

    for (;;) {
        doc = readDocFromStream(is, docName);
        if (is.eof() || is.fail()) {
            break;
        }
        corpus[docName] = doc;
    }

    // Keep the last document read if its name has not been seen yet.
    if (corpus.find(docName) == corpus.end()) {
        corpus[docName] = doc;
    }
    return corpus;
}
/**
 * Builds the confusion matrix between two attributes.
 *
 * The result is a flattened numeroClasses x numeroClasses matrix (row-major):
 * element [t * numeroClasses + r] counts the tokens whose `atributo_padrao`
 * value is classes[t] and whose `atributo_teste` value is classes[r].
 * The matrix is also stored in `ultimaMatriz`.
 *
 * @param corpus          corpus to evaluate
 * @param atributo_padrao index of the reference attribute
 * @param atributo_teste  index of the attribute being evaluated
 * @return the flattened confusion matrix
 * @throws const char* when a token value is not found in `classes`
 */
vector<float> AvaliadorMatrizConfusao::calcularDesempenho( Corpus &corpus, int atributo_padrao, int atributo_teste )
{
    const int numeroClasses = classes.size();
    const int tam = numeroClasses*numeroClasses;

    // Construct already zero-filled instead of resize() plus a manual loop.
    vector<float> vectorMatriz( tam, 0.0f );

    string resposta, verdade;
    int posVerdadeiro, posResposta;

    // `register` was dropped from the loop counters: it is ill-formed in C++17
    // and had no effect before.
    const int row = corpus.pegarQtdSentencas();
    for ( int i = 0; i < row; ++i )
    {
        const int column = corpus.pegarQtdTokens( i );
        for ( int j = 0; j < column; ++j )
        {
            verdade = corpus(i,j,atributo_padrao);
            resposta = corpus(i,j,atributo_teste);

            // No early exit: if a class name were listed twice, the last
            // matching position wins, as in the original.
            posVerdadeiro = posResposta = -1;
            for ( int c = 0; c < numeroClasses; ++c )
            {
                if (classes[c]==verdade)
                    posVerdadeiro = c;
                if (classes[c]==resposta)
                    posResposta = c;
            }

            if (posVerdadeiro == -1 || posResposta == -1 )
            {
                cout << "Classe não encontrada, uma exceção será gerada.";
                throw "Classe não encontrada pela matriz de confusão";
            }
            vectorMatriz[posVerdadeiro*numeroClasses+posResposta]++;
        }
    }

    ultimaMatriz = vectorMatriz;
    return vectorMatriz;
}
/// For each of the first `numatributos` entries of `atributo`, asks the corpus
/// to create an attribute named "New" + entry, passing "0" as the second
/// argument of criarAtributo().
void ProcessadorAttDisc::criarAtributos(Corpus &objCorpus)
{
    int idx = 0;
    while (idx < numatributos)
    {
        objCorpus.criarAtributo("New"+atributo[idx],"0");
        ++idx;
    }
}
int main(int argc, char* argv[]) { seedMT( time(NULL) ); if ( argc > 1 ) { Corpus* c = new Corpus(); Params param; param.read_settings( argv[5] ); //"settings.txt"); param.NTOPICS = atoi(argv[1]); param.INITIAL_C = atof(argv[2]); param.LAMBDA = atof(argv[3]); param.RHO = atof(argv[4]); param.NFOLDS = 1; if ( argc > 6 ) param.NFOLDS = atoi(argv[6]); if ( argc > 7 ) param.DELTA_ELL = atof(argv[7]); c->read_data(param.train_filename, param.NLABELS); char dir[512]; sprintf(dir, "s%d_c%d_f%d_s%d", param.NTOPICS, (int)param.INITIAL_C, param.NFOLDS, param.SUPERVISED); mkdir(dir,0755); MedSTC model; model.train("random", dir, c, ¶m); // testing. Corpus *tstC = new Corpus(); tstC->read_data(param.test_filename, param.NLABELS); MedSTC evlModel; double dAcc = evlModel.sparse_coding(dir, tstC, ¶m); printf("Accuracy: %.3f\n", dAcc); delete tstC; delete c; } else { printf("usage : MedSTC est [initial alpha] [k] [labels] [random/seeded/*] [directory]\n"); printf(" MedSTC cv [foldnum] [foldix] [initial alpha] [k] [labels] [settings] [data] [random/seeded/*] [directory]\n"); printf(" MedSTC inf [settings] [model] [data] [name]\n"); } return 0; }
void MGRTMApp() { ml::Converged converged; converged.em_converged_ = 1e-4; converged.em_max_iter_ = 100; converged.var_converged_ = 1e-4; converged.var_max_iter_ = 10; int rho = 3; VarMGRTM var; var.Init(converged,rho); var.Load(FLAGS_net_path, FLAGS_cor_path, FLAGS_neg_times); Str path(FLAGS_cor_path); Corpus cor; cor.LoadData(path); MGRTM m; m.Init(2, FLAGS_local_topic, FLAGS_global_topic, cor.TermNum(), 1, 0.01, 0.01); var.RunEM(&m); }
/// Flattens a corpus into one sequence: the words of every sentence are
/// appended in sentence order.
vector<TaggedWord> corpusToTaggedWords(Corpus<TaggedWord> const &corpus)
{
  typedef vector<Sentence<TaggedWord> > SentenceList;

  SentenceList sentences = corpus.sentences();

  vector<TaggedWord> flat;
  for (SentenceList::const_iterator sent = sentences.begin();
       sent != sentences.end(); ++sent)
    flat.insert(flat.end(), sent->words().begin(), sent->words().end());

  return flat;
}
static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) { if(r.i == source.size() && r.j == target.size()) return; if(r.score < params.printThreshold) return; if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1)) return; if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0)) return; const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1); const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1); if(params.printIds) std::cout << r.i << " " << r.j << "\t"; if(params.printBeads) std::cout << r.bead << "\t"; if(params.printScores) std::cout << r.score << "\t"; std::cout << s1 << "\t" << s2 << std::endl; }
void LdaApp() { long t1; (void) time(&t1); seedMT(t1); float em_converged = 1e-4; int em_max_iter = FLAGS_em_iterate; int em_estimate_alpha = 1; //1 indicate estimate alpha and 0 use given value int var_max_iter = FLAGS_var_iterate; double var_converged = 1e-6; double initial_alpha = FLAGS_alpha; int topic = FLAGS_topic_num; Corpus train; Corpus test; train.LoadData(FLAGS_cor_train); test.LoadData(FLAGS_cor_test); LOG(INFO) << train.Len()<< " " << test.Len(); LdaModel m; LDA lda; lda.Init(em_converged, em_max_iter, em_estimate_alpha, var_max_iter, var_converged, initial_alpha, topic); Str type = "seeded"; lda.RunEM(type, train, test, &m); VVReal gamma; VVVReal phi; lda.Infer(test, m, &gamma, &phi); WriteStrToFile(Join(gamma, " ", "\n"), "./model/gamma"); WriteStrToFile(Join(m.log_prob_w, topic, train.num_terms), "./model/beta"); WriteStrToFile(Join(phi, " ", "\n", "\n\n"), "./model/phi"); }
/**
 * Classifies `corpusProva` by weighted vote of the stored classifiers.
 *
 * Each classifier t writes its prediction into attribute `atributo`; the
 * predicted value's entry in `valores` receives a vote of weight
 * -log(betas[t]). A prediction that matches none of the first
 * valores.size() - 1 entries is credited to the last entry. Finally every
 * example's attribute is set to the value with the largest accumulated weight.
 *
 * @param corpusProva corpus to classify; attribute `atributo` is overwritten
 * @param atributo    index of the attribute holding the predictions
 * @return true on completion; false, without touching the corpus, when
 *         `valores` is empty (the original underflowed valores.size() - 1 and
 *         indexed out of range in that case)
 */
bool ClassificadorAdaboostM1::executarClassificacao( Corpus &corpusProva, int atributo )
{
    if (valores.empty())
        return false;

    // Index of the catch-all entry that also serves as the default winner.
    const unsigned int ultimo = valores.size() - 1;

    vector<vector<double> > exemplos(corpusProva.pegarQtdTotalExemplos(), vector<double>(valores.size(), 0.0));

    for (unsigned int t = 0; t < classificadores.size(); t++)
    {
        (classificadores[t])->executarClassificacao(corpusProva, atributo);

        // The vote weight depends only on the classifier, not on the example.
        const double peso = -log(betas[t]);

        int k = 0;
        for (int i = 0; i < corpusProva.pegarQtdSentencas(); i++)
        {
            for (int j = 0; j < corpusProva.pegarQtdExemplos(i); j++)
            {
                unsigned int indice = ultimo;
                for (unsigned int a = 0; a < ultimo; a++)
                {
                    if (valores[a] == corpusProva(i, j, atributo))
                    {
                        indice = a;
                        break;
                    }
                }
                exemplos[k][indice] += peso;
                k++;
            }
        }
    }

    int k = 0;
    for (int i = 0; i < corpusProva.pegarQtdSentencas(); i++)
    {
        for (int j = 0; j < corpusProva.pegarQtdExemplos(i); j++)
        {
            // Start from the last entry; `<=` lets any earlier entry with an
            // equal weight replace it, and later ties replace earlier ones.
            double maxBeta = exemplos[k][ultimo];
            unsigned int maxAtr = ultimo;
            for (unsigned int a = 0; a < ultimo; a++)
            {
                if (maxBeta <= exemplos[k][a])
                {
                    maxBeta = exemplos[k][a];
                    maxAtr = a;
                }
            }
            corpusProva(i, j, atributo, valores[maxAtr]);
            k++;
        }
    }
    return true;
}
double LDA::Likelihood(const Corpus &cor, int d, const LdaModel &m, VRealC &gamma,VVRealC &phi) const { double alpha_sum = double_array_sum(m.alpha,m.num_topics); double gamma_sum = std::accumulate(gamma.begin(), gamma.end(), 0.0); double digsum = DiGamma(gamma_sum); const int &num = m.num_topics; VReal expect(num); for (int k = 0; k < num; k++) { expect.at(k) = DiGamma(gamma.at(k)) - digsum; } double l = lgamma(alpha_sum) - lgamma(gamma_sum); for (int k = 0; k < num; k++) { l += ((m.alpha[k] - gamma.at(k)) * expect[k] + lgamma(gamma.at(k)) - lgamma(m.alpha[k])); for (size_t n = 0; n < cor.ULen(d); n++) { if (phi[n][k] > 0) { l += cor.Count(d, n) * phi[n][k] * (expect[k] - log(phi[n][k]) + m.log_prob_w[k][cor.Word(d, n)]); } } } return l; }
/**
 * Computes the F-score of predicted edges against the corpus.
 *
 * For every (a, d) pair the corpus reports as obscured, each predicted edge
 * sample[a][d][r] with r != a is compared with corpus.getEdge(a, d, r) and
 * tallied as a true/false positive/negative. The tallies, precision and
 * recall are printed to stdout.
 *
 * @param sample predicted edges indexed [a][d][r] (taken by value to keep the
 *               existing signature)
 * @param corpus source of the obscured flags and the true edges
 * @return 2PR / (P + R); precision, recall and the score are reported as 0
 *         when their denominator is 0 (the original produced NaN from 0/0)
 */
double fscore(vector<vector<vector<bool> > > sample, Corpus &corpus)
{
    int tp = 0, fp = 0, tn = 0, fn = 0;

    const int numA = static_cast<int>(sample.size());
    for (int a = 0; a < numA; a++) {
        const int numD = static_cast<int>(sample[a].size());
        for (int d = 0; d < numD; d++) {
            if (!corpus.isObscured(a, d)) {
                continue;
            }
            const int numR = static_cast<int>(sample[a][d].size());
            for (int r = 0; r < numR; r++) {
                if (a == r) {
                    continue;
                }
                int trueEdge = corpus.getEdge(a, d, r);
                if (sample[a][d][r]) {
                    if (trueEdge) { tp++; } else { fp++; }
                } else {
                    if (trueEdge) { fn++; } else { tn++; }
                }
            }
        }
    }

    // Guard each ratio: with no predicted or no actual positives the
    // denominator is zero.
    double precision = (tp + fp) > 0 ? (1.0 * tp) / (tp + fp) : 0.0;
    double recall = (tp + fn) > 0 ? (1.0 * tp) / (tp + fn) : 0.0;

    cout << "tp: " << tp << ", fp: " << fp << ", tn: " << tn << ", fn: " << fn << "\n";
    cout << "Precision: " << precision << "; Recall: " << recall << "\n";

    if (precision + recall <= 0.0) {
        return 0.0;
    }
    return 2 * precision * recall / (precision + recall);
}
/**
 * Adds a "New"-prefixed attribute for each configured attribute and fills it
 * per example: the symbol "1" when the source attribute's symbol is
 * "VERDADEIRO", otherwise "0".
 *
 * @param objCorpus corpus to extend in place
 * @return always true
 */
bool ProcessadorAttDisc::processarCorpus(Corpus &objCorpus)
{
    // Resolve the positions of the source attributes before the new
    // attributes are created.
    int *indices = new int[numatributos];
    for (int i = 0; i < numatributos; i++)
    {
        indices[i] = objCorpus.pegarPosAtributo(atributo[i]);
    }

    criarAtributos(objCorpus);

    // NOTE(review): idCol + i below assumes the new attributes occupy
    // consecutive positions starting at "New"+atributo[0] -- confirm against
    // Corpus::criarAtributo.
    const int idCol = objCorpus.pegarPosAtributo("New"+atributo[0]);

    const int qtdConjExemplos = objCorpus.pegarQtdConjExemplos();
    for (int c = 0; c < qtdConjExemplos; c++)
    {
        const int totlinhas = objCorpus.pegarQtdExemplos(c);
        for (int linha = 0; linha < totlinhas; linha++)
        {
            for (int i = 0; i < numatributos; i++)
            {
                int vatual = objCorpus.pegarValor(c,linha,indices[i]);
                string valor_atual = objCorpus.pegarSimbolo(vatual);

                // Same text the original produced by streaming the int 0/1
                // through a stringstream.
                const char* bandeira = (valor_atual == "VERDADEIRO") ? "1" : "0";
                objCorpus.ajustarValor(c, linha, idCol+i, objCorpus.pegarIndice(string(bandeira)));
            }
        }
    }

    // The original never released this array.
    delete[] indices;
    return true;
}
void LDA::InitVarParamter(const Corpus &cor, int d, const LdaModel &m, VReal* digamma,VReal* ga, VVReal* phi) const { ga->resize(m.num_topics); digamma->resize(m.num_topics); phi->resize(cor.ULen(d)); for (int k = 0; k < m.num_topics; k++) { (*ga)[k] = m.alpha[k] + (cor.docs[d].total / ((double) m.num_topics)); (*digamma)[k] = DiGamma((*ga)[k]); } for (VReal::size_type n = 0; n < phi->size(); n++) { phi->at(n).resize(m.num_topics); for (int k = 0; k < m.num_topics; k++) { (*phi)[n][k] = 1.0 / m.num_topics; } } }
/// Splits [p, end) on the space character and handles each non-empty token:
/// a token found by corp.lookup() is written to stdout as the 8-byte value
/// rawpost(term_id, doc_id, term_pos++); an unknown token (lookup returns -1)
/// only produces a warning on stderr and does not advance term_pos.
static void run_line(const char *p, const char *end, const Corpus &corp)
{
    for (;;) {
        // Skip the spaces in front of the next token.
        while (p < end && *p == ' ')
            ++p;
        if (p >= end)
            break;

        // Here *p is not a space, so the token has at least one character.
        const char *tok = p;
        while (p < end && *p != ' ')
            ++p;

        int term_id = corp.lookup(tok, p - tok);
        if (term_id == -1) {
            fprintf(stderr, "WARNING: term not found: '%.*s'\n", (int)(p - tok), tok);
            continue;
        }

        uint64_t x = rawpost(term_id, doc_id, term_pos++);
        fwrite_unlocked(&x, sizeof(x), 1, stdout);
    }
}
/**
 * Evaluates the objective on a fixed development sample and decides whether
 * training should slow down or stop.
 *
 * State is kept in function-local statics, so the sample (MINIATURE_SIZE
 * items from the corpus passed on the first call), the thresholds (derived
 * from _learning_rate as it was on the first call), the previous objective
 * and the call counter persist across calls.
 *
 * After more than MIN_ITERATION calls:
 *   - a NaN improvement rate terminates the process with exit(-1);
 *   - a rate below HARD_THRESHOLD terminates it with exit(0);
 *   - once the rate has dropped below SOFT_THRESHOLD, _learning_rate is
 *     halved on this and every later call.
 */
void dtw_model::validate(Corpus& corpus) {
  static const size_t MINIATURE_SIZE = 10000;
  static vector<tsample> samples = corpus.getSamples(MINIATURE_SIZE);
  static double objective = 0;
  static bool aboutToStop = false;
  static const double SOFT_THRESHOLD = 2e-6 * _learning_rate;
  static const double HARD_THRESHOLD = SOFT_THRESHOLD * 0.1;
  static size_t MIN_ITERATION = 128;
  static size_t itr = 0;

  double obj = calcObjective(samples);
  double diff = obj - objective;

  // Absolute value taken by hand: an unqualified abs() on a double can resolve
  // to the integer overload depending on which headers are in scope, which
  // would truncate the rate. NaN is preserved for the check below.
  const double ratio = diff / objective;
  double improveRate = (ratio > 0) ? ratio : (0.0 - ratio);

  printf("objective = %.7f \t prev-objective = %.7f \n", obj, objective);
  // size_t is not necessarily unsigned long; convert to match %lu.
  printf("improvement rate on dev-set of size %lu = %.6e ",
         (unsigned long) samples.size(), improveRate);
  // Spaces around the colour macros are required: since C++11 a string
  // literal directly followed by an identifier is a user-defined-literal
  // suffix rather than a macro use. The printed text is unchanged.
  printf(", still " GREEN "%.0f" COLOREND " times of threshold \n",
         improveRate / SOFT_THRESHOLD);

  if (itr > MIN_ITERATION) {
    // NaN is the only value that compares unequal to itself.
    if (improveRate != improveRate)
      exit(-1);

    if (improveRate < HARD_THRESHOLD) {
      printf("\nObjective function on dev-set is no longer decreasing...\n");
      printf("Training process " GREEN "DONE" COLOREND "\n");
      // doPause();
      exit(0);
    }
    else if (aboutToStop || improveRate < SOFT_THRESHOLD) {
      aboutToStop = true;
      _learning_rate /= 2;
    }
  }

  objective = obj;
  ++itr;
}
/* * main */ int main(int argc, char* argv[]) { seedMT( time(NULL) ); // seedMT(4357U); if (argc > 1) { Corpus* c = new Corpus(); Params param; param.INNER_CV = true; if ( strcmp(argv[1], "estinf") == 0 ) { param.read_settings("settings.txt"); param.NTOPICS = atoi(argv[2]); param.NLABELS = atoi(argv[3]); param.NFOLDS = atoi(argv[4]); param.INITIAL_C = atof(argv[5]); param.DELTA_ELL = atof(argv[6]); printf("K: %d, C: %.3f, Alpha: %d, svm: %d\n", param.NTOPICS, param.INITIAL_C, param.ESTIMATE_ALPHA, param.SVM_ALGTYPE); c->read_data(param.train_filename, param.NLABELS); char dir[512]; sprintf(dir, "20ng%d_c%d_f%d", param.NTOPICS, (int)param.INITIAL_C, param.NFOLDS); make_directory(dir); if ( param.INNER_CV ) { c->shuffle(); char modelDir[512]; sprintf(modelDir, "%s/innercv", dir); make_directory(modelDir); param.INITIAL_C = innerCV(modelDir, c, ¶m); printf("\n\nBest C: %f\n", param.INITIAL_C); } MedLDA model; model.run_em(argv[7], dir, c, ¶m); // testing. Corpus *tstC = new Corpus(); tstC->read_data(param.test_filename, param.NLABELS); MedLDA evlModel; double dAcc = evlModel.infer(dir, tstC, ¶m); printf("Accuracy: %.3f\n", dAcc); delete tstC; } if ( strcmp(argv[1], "est") == 0 ) { param.read_settings("settings.txt"); param.NTOPICS = atoi(argv[2]); param.NLABELS = atoi(argv[3]); param.NFOLDS = atoi(argv[4]); param.INITIAL_C = atof(argv[5]); param.DELTA_ELL = atof(argv[6]); c->read_data(param.train_filename, param.NLABELS); char dir[512]; sprintf(dir, "%s%d_c%d_f%d", argv[7], param.NTOPICS, param.INITIAL_C, param.NFOLDS); make_directory(dir); if ( param.INNER_CV ) { c->shuffle(); char modelDir[512]; sprintf(modelDir, "%s/innercv", dir); make_directory(modelDir); param.INITIAL_C = innerCV(modelDir, c, ¶m); printf("\n\nBest C: %f\n", param.INITIAL_C); } MedLDA model; model.run_em(argv[8], dir, c, ¶m); } if (strcmp(argv[1], "inf")==0) { param.read_settings("settings.txt"); param.NLABELS = atoi(argv[2]); c->read_data(param.test_filename, param.NLABELS); MedLDA model; double 
dAcc = model.infer(argv[3], c, ¶m); printf("Accuracy: %.3f\n", dAcc); } delete c; } else { printf("usage : MEDsLDAc estinf [k] [labels] [fold] [initial C] [l] [random/seeded/*]\n"); printf(" MEDsLDAc est [k] [labels] [fold] [initial C] [l] [dir root] [random/seeded/*]\n"); printf(" MEDsLDAc inf [labels] [model]\n"); } return(0); }