void expectation_prop::train(size_t numTopics) {
    this->numTopics = numTopics;
    setup_parameters();

    double likelihood;
    double old_likelihood = 0.0;
    int iteration = 0;
    bool converged = false;

    while (!converged && iteration < MAX_ITERATION) {
        iteration++;
        likelihood = 0.0;

        // E-step: accumulate the per-document likelihood bound
        for (int d = 0; d < numDocs; ++d) {
            likelihood += doc_e_step(d);
        }

        // M-step
        m_step();

        // Relative change in the bound. On the first iteration
        // old_likelihood is 0.0, so conv evaluates to inf and the loop
        // simply continues.
        double conv = fabs((old_likelihood - likelihood) / old_likelihood);
        old_likelihood = likelihood;
        if (conv < CONV_THRESHHOLD) {
            converged = true;
        }
        first = false;

        std::cout << "Iteration " << iteration
                  << ": with likelihood: " << likelihood << std::endl;
    }
}
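// A minimal standalone sketch of the relative-change convergence test used
// above, with an explicit guard for the first pass instead of relying on a
// division by zero producing inf. The helper name relative_change is
// illustrative, not part of the original class.
#include <cmath>
#include <limits>

static double relative_change(double old_ll, double new_ll) {
    // first pass: no previous bound to compare against, report "not converged"
    if (old_ll == 0.0) return std::numeric_limits<double>::infinity();
    return std::fabs((old_ll - new_ll) / old_ll);
}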
void var_bayes::train(size_t num_topics) {
    numTopics = num_topics;
    logProbW = std::vector<std::vector<double>>(numTopics,
                                                std::vector<double>(numTerms, 0));
    alpha = setup_alpha();

    // Initialize sufficient statistics and take an initial m-step.
    suff_stats ss;
    randomSSInit(ss);
    mle(ss, false);

    // Variational parameters: gamma is per document over topics; phi is
    // per document, per unique term, over topics.
    gamma = std::vector<std::vector<double>>(numDocs,
                                             std::vector<double>(numTopics, 0));
    phi = std::vector<std::vector<std::vector<double>>>(numDocs);
    for (int d = 0; d < corpus.numDocs; d++) {
        phi[d] = std::vector<std::vector<double>>(
            corpus.docs[d].uniqueCount, std::vector<double>(numTopics));
    }

    int iteration = 0;
    double likelihood = 0;
    double old_likelihood = 0;
    double converged = 1;
    bool update_alpha;

    // Iterate until the relative change in the bound drops below the
    // threshold, running at least MIN_ITER and at most MAX_ITER iterations.
    while (((converged > CONV_THRESHHOLD) or (iteration <= MIN_ITER))
           and (iteration <= MAX_ITER)) {
        iteration++;
        likelihood = 0;
        clock_t start = clock();

        zeroSSInit(ss);

        // E-step over every document
        for (int d = 0; d < numDocs; d++) {
            document doc = corpus.docs[d];
            std::vector<double>& var_gamma = gamma[d];
            std::vector<std::vector<double>>& doc_phi = phi[d];
            likelihood += doc_e_step(doc, ss, var_gamma, doc_phi);
        }

        // M-step; re-estimate alpha only every UPDATE_INTERVAL iterations
        update_alpha = ((iteration % UPDATE_INTERVAL == 0) and EST_ALPHA);
        mle(ss, update_alpha);

        converged = fabs((old_likelihood - likelihood) / old_likelihood);
        old_likelihood = likelihood;

        std::cout << "Iteration " << iteration << ": with likelihood: "
                  << likelihood << " in "
                  << double(clock() - start) / CLOCKS_PER_SEC
                  << " seconds. (" << converged << ")";
        if (update_alpha) {
            std::cout << " (alpha update)";
        }
        std::cout << std::endl;
    }

    std::cout << "Converged in " << iteration
              << " iterations with likelihood of " << likelihood << std::endl;
    this->likelihood = likelihood;
}
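// doc_e_step is defined elsewhere; the quantity such variational e-steps
// repeatedly need is the standard Dirichlet expectation
// E[log theta_k] = digamma(gamma_k) - digamma(sum_j gamma_j)
// (Blei, Ng, Jordan 2003). A minimal sketch, assuming the digamma() helper
// that the evaluation code further below already uses:
#include <vector>

double digamma(double x);  // provided elsewhere in this codebase

static double dirichlet_expectation(const std::vector<double>& var_gamma, size_t k) {
    double gamma_sum = 0.0;
    for (double g : var_gamma) gamma_sum += g;  // sum over all topics
    return digamma(var_gamma[k]) - digamma(gamma_sum);
}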
void run_quiet_em(char* start, corpus* corpus) {
    int d = 0, n = 0;
    lda_model* model = NULL;
    double **var_gamma = NULL, **phi = NULL;
    // last_gamma is a double[num_docs][num_topics]

    // allocate variational parameters
    var_gamma = (double**)malloc(sizeof(double*) * (corpus->num_docs));
    memset(var_gamma, 0, sizeof(double*) * (corpus->num_docs));
    for (d = 0; d < corpus->num_docs; ++d) {
        var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(var_gamma[d], 0, sizeof(double) * NTOPICS);
    }

    int max_length = max_corpus_length(corpus);
    phi = (double**)malloc(sizeof(double*) * max_length);
    memset(phi, 0, sizeof(double*) * max_length);
    for (n = 0; n < max_length; ++n) {
        phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(phi[n], 0, sizeof(double) * NTOPICS);
    }

    // initialize model
    lda_suffstats* ss = NULL;
    if (strncmp(start, "seeded", 6) == 0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        if (VERBOSE) {
            corpus_initialize_ss(ss, model, corpus);
        } else {
            quiet_corpus_initialize_ss(ss, model, corpus);
        }
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "fixed", 5) == 0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        corpus_initialize_fixed_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "random", 6) == 0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization
    int i = 0;
    double likelihood = 0.0, likelihood_old = 0, converged = 1;
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2))
           && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE) printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n", d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; if the bound decreased, allow the
        // variational inference more iterations and keep going
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood
        last_model = model;
        last_gamma = var_gamma;
        last_phi = phi;
    }

    // output the final model
    last_model = model;
    last_gamma = var_gamma;
    last_phi = phi;
    free_lda_suffstats(model, ss);

    // output the word assignments (for visualization)
    /*
    char filename[100];
    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++) {
        if ((d % 100) == 0) printf("final e step document %d\n", d);
        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    */
}
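// The malloc+memset pairs above can be collapsed into calloc, which
// allocates and zero-fills in one call (all-zero bytes are 0.0 for
// IEEE-754 doubles, which this code already assumes). alloc_matrix is a
// hypothetical helper, not part of the original sources:
#include <stdlib.h>

static double** alloc_matrix(int rows, int cols) {
    double** m = (double**)calloc(rows, sizeof(double*));
    for (int r = 0; r < rows; ++r)
        m[r] = (double*)calloc(cols, sizeof(double));
    return m;
}

// usage: var_gamma = alloc_matrix(corpus->num_docs, NTOPICS);
//        phi       = alloc_matrix(max_length, NTOPICS);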
void run_em(char* start, char* directory, corpus* corpus) {
    int d, n;
    lda_model* model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters
    var_gamma = malloc(sizeof(double*) * (corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*) * max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model
    char filename[100];
    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded") == 0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random") == 0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    sprintf(filename, "%s/000", directory);
    save_lda_model(model, filename);

    // run expectation maximization
    int i = 0;
    double likelihood, likelihood_old = 0, converged = 1;
    sprintf(filename, "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2))
           && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE) printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n", d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step
        printf("m-step\n");
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood
        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0) {
            sprintf(filename, "%s/%03d", directory, i);
            save_lda_model(model, filename);
            sprintf(filename, "%s/%03d.gamma", directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model
    sprintf(filename, "%s/final", directory);
    save_lda_model(model, filename);
    sprintf(filename, "%s/final.gamma", directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)
    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    short error = 0;
    double tl = 0.0;
    for (d = 0; d < corpus->num_docs; d++) {
        if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n", d);
        error = 0;
        tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi, &error);
        if (error) {
            continue;
        }
        likelihood += tl;
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    fclose(likelihood_file);
}
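// lda_mle is defined elsewhere; its topic-word part is a plain
// normalization of the accumulated e-step counts, carried in log space. A
// minimal sketch under the assumption that lda_suffstats carries
// class_word[k][w] and class_total[k] fields as in lda-c:
#include <math.h>

static void mle_log_prob_w(lda_model* model, lda_suffstats* ss) {
    for (int k = 0; k < model->num_topics; k++)
        for (int w = 0; w < model->num_terms; w++)
            model->log_prob_w[k][w] =
                (ss->class_word[k][w] > 0)
                    ? log(ss->class_word[k][w]) - log(ss->class_total[k])
                    : -100.0;  // hard floor for words never assigned to topic k
}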
void test_likelihood(t_setting* setting, const t_corpus* corpus,
                     const std::vector<t_cat> tree_structure) {
    FILE* fileptr_lowerbound_result;
    FILE* fileptr_lowerbound_summary;
    FILE* fileptr_document_completion_result;
    FILE* fileptr_document_completion_summary;
    char filename[MAX_BUF];

    // load the trained model and its variational parameters
    sprintf(filename, "%s_tilda", setting->model_path);
    t_tilda_model* trained_tilda_model = load_tilda_model(filename);
    sprintf(filename, "%s_var", setting->model_path);
    t_tilda_var_model* trained_var_model = load_var_model(filename, corpus);

    double** rho = NULL;
    double* old_rho = NULL;
    double* nu = NULL;
    double* dirichlet_prior = NULL;
    double* expected_theta = NULL;
    double** expected_beta = NULL;

    const int& K = trained_tilda_model->num_topics;
    setting->num_topics = K;
    oneoverk = 1 / (double)K;

    double document_completion_sum_ll = 0.0;
    int document_completion_sum_num_words = 0;
    double lowerbound_sum_likelihood = 0;
    int lowerbound_sum_num_words = 0;

    nu = zero_init_double_array(K);
    rho = zero_init_double_matrix(corpus->max_length, K);
    old_rho = zero_init_double_array(K);
    dirichlet_prior = zero_init_double_array(K);
    expected_theta = zero_init_double_array(K);
    expected_beta = zero_init_double_matrix(K, corpus->num_terms);
    digamma_nu = zero_init_double_matrix(corpus->num_docs, K);
    digamma_nu_sum = zero_init_double_array(corpus->num_docs);
    digamma_lambda = zero_init_double_matrix(K, corpus->num_terms);
    digamma_lambda_sum = zero_init_double_array(K);
    compute_lambda_statistics(trained_var_model, expected_beta);

    sprintf(filename, "%s_lowerbound_result", setting->output_path);
    fileptr_lowerbound_result = fopen(filename, "w");
    sprintf(filename, "%s_document_completion_result", setting->output_path);
    fileptr_document_completion_result = fopen(filename, "w");

    for (size_t i = 0; i < tree_structure.size(); ++i) {
        const double& alpha_t = trained_tilda_model->alpha[i];
        const double* kappa_t = trained_var_model->kappa[i];
        const double& tau_t = trained_var_model->tau[i];
        for (int j = 0; j < K; ++j) {
            dirichlet_prior[j] = alpha_t * kappa_t[j];
        }

        for (size_t d = 0; d < tree_structure[i].docids.size(); ++d) {
            const int& docid = tree_structure[i].docids[d];

            // evaluation using variational bound
            double this_doc_lowerbound =
                doc_e_step(&(corpus->docs[docid]), dirichlet_prior, nu,
                           digamma_lambda, digamma_lambda_sum, setting, docid,
                           rho, old_rho);
            assert(!std::isnan(this_doc_lowerbound));
            this_doc_lowerbound += lgamma(alpha_t);
            this_doc_lowerbound -= (K - alpha_t) * digamma(tau_t);
            this_doc_lowerbound -= alpha_t * (K - 1) / tau_t;
            for (int j = 0; j < K; ++j) {
                this_doc_lowerbound -=
                    lgamma(alpha_t * kappa_t[j]) +
                    (1 - alpha_t * kappa_t[j]) *
                        (log(kappa_t[j]) - digamma(tau_t * kappa_t[j]));
            }
            for (int j = 0; j < K; ++j) {
                this_doc_lowerbound +=
                    dirichlet_prior[j] * (digamma_nu[docid][j] - digamma_nu_sum[docid]);
            }
            assert(!std::isnan(this_doc_lowerbound));
            fprintf(fileptr_lowerbound_result,
                    "docid %d\tlower_bound %5.5f\tnum_words %d\n",
                    docid, this_doc_lowerbound, corpus->docs[docid].total);
            lowerbound_sum_likelihood += this_doc_lowerbound;
            lowerbound_sum_num_words += corpus->docs[docid].total;

            // evaluation using document completion: infer on one half of
            // the document, score the held-out half
            t_document* inference_doc = NULL;
            t_document* test_doc = NULL;
            split_document(inference_doc, test_doc, &(corpus->docs[docid]));
            double half_doc_lowerbound =
                doc_e_step(inference_doc, dirichlet_prior, nu, digamma_lambda,
                           digamma_lambda_sum, setting, docid, rho, old_rho);
            assert(!std::isnan(half_doc_lowerbound));
            half_doc_lowerbound += lgamma(alpha_t);
            half_doc_lowerbound -= (K - alpha_t) * digamma(tau_t);
            half_doc_lowerbound -= alpha_t * (K - 1) / tau_t;
            for (int j = 0; j < K; ++j) {
                half_doc_lowerbound -=
                    lgamma(alpha_t * kappa_t[j]) +
                    (1 - alpha_t * kappa_t[j]) *
                        (log(kappa_t[j]) - digamma(tau_t * kappa_t[j]));
            }
            for (int j = 0; j < K; ++j) {
                half_doc_lowerbound +=
                    dirichlet_prior[j] * (digamma_nu[docid][j] - digamma_nu_sum[docid]);
            }
            assert(!std::isnan(half_doc_lowerbound));

            double document_completion_log_likelihood = 0.0;
            double nu_sum = 0.0;
            for (int j = 0; j < K; ++j) {
                nu_sum += nu[j];
            }
            for (int j = 0; j < K; ++j) {
                expected_theta[j] = nu[j] / nu_sum;
            }
            for (int n = 0; n < test_doc->length; n++) {
                double this_word_likelihood = 0.0;
                for (int j = 0; j < K; ++j) {
                    this_word_likelihood +=
                        expected_theta[j] * expected_beta[j][test_doc->words[n]];
                }
                document_completion_log_likelihood +=
                    log(this_word_likelihood + 1e-100) * test_doc->counts[n];
            }
            fprintf(fileptr_document_completion_result,
                    "docid %d\thalf_lower_bound %5.5f\tscore %5.5f\ttest_num_words %d\n",
                    docid, half_doc_lowerbound,
                    document_completion_log_likelihood, test_doc->total);
            document_completion_sum_ll += document_completion_log_likelihood;
            document_completion_sum_num_words += test_doc->total;
            free_document(inference_doc);
            free_document(test_doc);
        }
    }
    fclose(fileptr_lowerbound_result);
    fclose(fileptr_document_completion_result);

    // summarize the variational-bound evaluation
    double perplexity =
        exp(-lowerbound_sum_likelihood / (double)lowerbound_sum_num_words);
    sprintf(filename, "%s_lowerbound_summary", setting->output_path);
    fileptr_lowerbound_summary = fopen(filename, "w");
    fprintf(fileptr_lowerbound_summary, "sum_lowerbound %5.5f\n",
            lowerbound_sum_likelihood);
    fprintf(fileptr_lowerbound_summary, "sum_num_words %d\n",
            lowerbound_sum_num_words);
    fprintf(fileptr_lowerbound_summary, "perplexity %5.5f\n", perplexity);
    fclose(fileptr_lowerbound_summary);

    // summarize the document-completion evaluation
    double per_word_ll =
        document_completion_sum_ll / (double)document_completion_sum_num_words;
    sprintf(filename, "%s_document_completion_summary", setting->output_path);
    fileptr_document_completion_summary = fopen(filename, "w");
    fprintf(fileptr_document_completion_summary, "sum_num_words %d\n",
            document_completion_sum_num_words);
    fprintf(fileptr_document_completion_summary, "per_word_ll %5.5f\n", per_word_ll);
    fprintf(fileptr_document_completion_summary, "perplexity %5.5f\n",
            exp(-per_word_ll));
    fclose(fileptr_document_completion_summary);

    free_double_matrix(digamma_lambda);
    free(digamma_lambda_sum);
    free_double_matrix(digamma_nu);
    free(digamma_nu_sum);
    free_double_matrix(expected_beta);
    free(expected_theta);
    free(dirichlet_prior);
    free(nu);
    free_double_matrix(rho);
    free(old_rho);
    free_var_model(trained_var_model);
    free_tilda_model(trained_tilda_model);
}
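// Both summary files above reduce to the same formula: perplexity is the
// exponentiated negative per-word log likelihood, exp(-sum_ll / num_words).
// A minimal standalone sketch (the helper name is illustrative):
#include <math.h>

static double perplexity_from_ll(double sum_log_likelihood, int num_words) {
    return exp(-sum_log_likelihood / (double)num_words);
}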