Example #1
0
/// Train the model with Expectation Propagation.
/// Alternates per-document E-steps with an M-step until the relative change
/// in likelihood drops below CONV_THRESHHOLD or MAX_ITERATION is reached.
/// @param numTopics number of latent topics to fit (stored on the instance).
void expectation_prop::train(size_t numTopics) {
    this->numTopics = numTopics;

    setup_parameters();
    double likelihood = 0.0;  // was declared uninitialized
    double old_likelihood = 0.0;
    int iteration = 0;
    bool converged = false;
    while(!converged && iteration < MAX_ITERATION){
        iteration++;
        likelihood = 0.0;
        // E-step: accumulate each document's likelihood contribution.
        for(int d=0; d<numDocs; ++d){
            likelihood += doc_e_step(d);
        }
        m_step();

        // Relative change in likelihood. On the first iteration
        // old_likelihood is 0, which previously divided by zero
        // (inf/NaN under IEEE-754); treat that case as "not converged".
        double conv = (old_likelihood != 0.0)
                          ? fabs((old_likelihood - likelihood)/old_likelihood)
                          : 1.0;
        old_likelihood = likelihood;
        if(conv < CONV_THRESHHOLD){
            converged = true;
        }
        // presumably a first-pass flag consumed by the E/M routines —
        // TODO(review): confirm against their definitions
        first = false;

        std::cout << "Iteration " << iteration << ": with likelihood: " << likelihood <<std::endl;
    }
}
Example #2
0
/// Train the model with variational-Bayes EM.
/// Initializes sufficient statistics randomly, then alternates a per-document
/// E-step (updating gamma/phi) with an M-step (mle) until the relative change
/// in likelihood drops below CONV_THRESHHOLD.
/// @param num_topics number of latent topics to fit.
void var_bayes::train(size_t num_topics) {
    numTopics = num_topics;  // redundant size_t(...) cast removed
    logProbW = std::vector<std::vector<double>>(numTopics, std::vector<double>(numTerms, 0));

    alpha = setup_alpha();

    // Seed the sufficient statistics randomly and take one M-step
    // (without an alpha update) to obtain an initial model.
    suff_stats ss;
    randomSSInit(ss);
    mle(ss, false);

    // Per-document variational parameters:
    //   gamma[d][k]    — topic-proportion parameter,
    //   phi[d][n][k]   — topic assignment for each unique term n of doc d.
    gamma = std::vector<std::vector<double>>(numDocs, std::vector<double>(numTopics, 0));
    phi = std::vector<std::vector<std::vector<double>>>(numDocs);
    for(int d=0; d<corpus.numDocs; d++){
        phi[d] = std::vector<std::vector<double>>(corpus.docs[d].uniqueCount, std::vector<double>((numTopics)));
    }

    int iteration = 0;
    double likelihood  = 0;
    double old_likelihood = 0;
    double converged = 1;
    bool update_alpha = false;  // was uninitialized; assigned each iteration below

    // NOTE(review): the "(iteration <= MIN_ITER)" clause is redundant whenever
    // MIN_ITER <= MAX_ITER; the intent was probably
    //   ((converged > CONV_THRESHHOLD) or (iteration <= MIN_ITER)) and (iteration <= MAX_ITER)
    // i.e. always run at least MIN_ITER iterations. Behavior preserved here.
    while ( ( (converged>CONV_THRESHHOLD) ) and ( (iteration <= MIN_ITER) or (iteration <= MAX_ITER) ) ) {
        iteration++;
        likelihood = 0;
        clock_t start = clock();
        zeroSSInit(ss);

        // E-step over all documents.
        for(int d=0; d<numDocs; d++){
            // Bind by reference: previously each document was deep-copied
            // every iteration. doc_e_step is assumed to only read the
            // document — TODO(review): confirm against its definition.
            document& doc = corpus.docs[d];
            std::vector<double>& var_gamma = gamma[d];
            std::vector<std::vector<double>>& doc_phi = phi[d];
            likelihood += doc_e_step(doc, ss, var_gamma, doc_phi);
        }
        // Update alpha only every UPDATE_INTERVAL iterations, and only when
        // alpha estimation is enabled.
        update_alpha = ((iteration % UPDATE_INTERVAL == 0) and (EST_ALPHA));
        mle(ss, update_alpha);

        // Relative change in likelihood; guard the first iteration where
        // old_likelihood is 0 (division by zero gives inf/NaN under IEEE-754).
        converged = (old_likelihood != 0.0)
                        ? fabs((old_likelihood - likelihood)/old_likelihood)
                        : 1.0;
        old_likelihood = likelihood;

        std::cout << "Iteration " << iteration << ": with likelihood: " << likelihood
                  << " in " << double(clock() - start)/CLOCKS_PER_SEC << " seconds. (" << converged << ")";
        if(update_alpha){
            std::cout << " (alpha update)";
        }
        std::cout << std::endl;
    }
    std::cout << "Converged in " << iteration << " iterations with likelihood of " << likelihood << std::endl;
    this->likelihood = likelihood;
}
Example #3
0
/*
 * Run EM for LDA without file output ("quiet" variant).
 * Allocates the variational parameters, initializes the model according to
 * `start` ("seeded", "fixed", "random", or a path to a saved model), then
 * alternates E- and M-steps until the relative likelihood change drops below
 * EM_CONVERGED (after at least 3 iterations) or EM_MAX_ITER is exceeded.
 * Results are published through the last_model/last_gamma/last_phi globals.
 */
void run_quiet_em(char* start, corpus* corpus) {
	int d = 0, n = 0;
	lda_model *model = NULL;
	double **var_gamma = NULL, **phi = NULL;
	// last_gamma is a double[num_docs][num_topics]

	// allocate variational parameters

	var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
	// BUG FIX: the byte count passed to memset was just num_docs, which
	// zeroed only the first num_docs BYTES of the pointer array; use the
	// full allocation size. (Fill value must be an int: 0, not 0.0.)
	memset(var_gamma, 0, sizeof(double*)*(corpus->num_docs));

	for (d = 0; d < corpus->num_docs; ++d) {
		var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
		// all-bits-zero is +0.0 for IEEE-754 doubles
		memset(var_gamma[d], 0, sizeof(double)*NTOPICS);
	}

	int max_length = max_corpus_length(corpus);

	phi = (double**)malloc(sizeof(double*)*max_length);
	// BUG FIX: same undersized memset as above (was max_length bytes).
	memset(phi, 0, sizeof(double*)*max_length);
	for (n = 0; n < max_length; ++n) {
		phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
		memset(phi[n], 0, sizeof(double)*NTOPICS);
	}

	// initialize model according to the requested start mode

	lda_suffstats* ss = NULL;
	if (strncmp(start, "seeded",6)==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		model->alpha = INITIAL_ALPHA;
		ss = new_lda_suffstats(model);
		if (VERBOSE) {
			corpus_initialize_ss(ss, model, corpus);
		} else {
			quiet_corpus_initialize_ss(ss, model, corpus);
		}
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
	} else if (strncmp(start, "fixed",5)==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		model->alpha = INITIAL_ALPHA;
		ss = new_lda_suffstats(model);
		corpus_initialize_fixed_ss(ss, model, corpus);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
	} else if (strncmp(start, "random",6)==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		model->alpha = INITIAL_ALPHA;
		ss = new_lda_suffstats(model);
		random_initialize_ss(ss, model);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
	} else {
		// any other value of `start` is treated as a path to a saved model
		model = load_lda_model(start);
		ss = new_lda_suffstats(model);
	}

	// save the model in the last_model global
	last_model = model;
	model_loaded = TRUE;

	// run expectation maximization

	int i = 0;
	double likelihood = 0.0, likelihood_old = 0, converged = 1;

	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
		i++;
		if (VERBOSE) printf("**** em iteration %d ****\n", i);
		likelihood = 0;
		zero_initialize_ss(ss, model);

		// e-step: per-document variational inference

		for (d = 0; d < corpus->num_docs; d++) {
			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
		}

		// m-step: re-estimate model parameters from sufficient statistics
		if (VERBOSE) {
			lda_mle(model, ss, ESTIMATE_ALPHA);
		} else {
			quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
		}

		// check for convergence; a negative change means the bound got
		// worse, so double the per-document iteration budget and continue
		converged = (likelihood_old - likelihood) / (likelihood_old);
		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
		likelihood_old = likelihood;

		// store model and likelihood

		last_model = model;
		last_gamma = var_gamma;
		last_phi = phi;
	}

	// output the final model

	last_model = model;
	last_gamma = var_gamma;
	last_phi = phi;

	free_lda_suffstats(model,ss);

	// output the word assignments (for visualization)
	/*
	char filename[100];
	sprintf(filename, "%s/word-assignments.dat", directory);
	FILE* w_asgn_file = fopen(filename, "w");
	for (d = 0; d < corpus->num_docs; d++) {
		if ((d % 100) == 0)
			printf("final e step document %d\n",d);
		likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
		write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
	}
	fclose(w_asgn_file);
	*/
}
Example #4
0
/*
 * Run EM for LDA with on-disk output.
 * Initializes the model according to `start` ("seeded", "random", or a path
 * to a saved model), runs EM until the relative likelihood change drops
 * below EM_CONVERGED (after at least 3 iterations) or EM_MAX_ITER is hit,
 * checkpointing the model every LAG iterations, then writes the final model,
 * gamma, per-iteration likelihoods, and word assignments under `directory`.
 */
void run_em(char* start, char* directory, corpus* corpus) {
	int d, n;
	lda_model *model = NULL;
	double **var_gamma, **phi;

	// allocate variational parameters

	var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
	for (d = 0; d < corpus->num_docs; d++)
		var_gamma[d] = malloc(sizeof(double) * NTOPICS);

	int max_length = max_corpus_length(corpus);
	phi = malloc(sizeof(double*)*max_length);
	for (n = 0; n < max_length; n++)
		phi[n] = malloc(sizeof(double) * NTOPICS);

	// initialize model

	char filename[100];

	lda_suffstats* ss = NULL;
	if (strcmp(start, "seeded")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		corpus_initialize_ss(ss, model, corpus);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else if (strcmp(start, "random")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		random_initialize_ss(ss, model);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else {
		// any other value of `start` is treated as a path to a saved model
		model = load_lda_model(start);
		ss = new_lda_suffstats(model);
	}

	// snprintf instead of sprintf throughout: `directory` is caller-supplied
	// and could overflow the fixed 100-byte buffer
	snprintf(filename, sizeof(filename), "%s/000", directory);
	save_lda_model(model, filename);

	// run expectation maximization

	int i = 0;
	// BUG FIX: likelihood was uninitialized; it is read with += after the
	// loop, which never assigns it if the loop body does not execute
	double likelihood = 0, likelihood_old = 0, converged = 1;
	snprintf(filename, sizeof(filename), "%s/likelihood.dat", directory);
	FILE* likelihood_file = fopen(filename, "w");

	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
		i++;
		if (VERBOSE)
			printf("**** em iteration %d ****\n", i);
		likelihood = 0;
		zero_initialize_ss(ss, model);

		// e-step
		// NOTE(review): these two phase prints are not gated on VERBOSE,
		// unlike every other diagnostic here — looks like leftover debug
		printf("e-step\n");

		for (d = 0; d < corpus->num_docs; d++) {
			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
		}
		printf("m-step\n");

		// m-step
		if (VERBOSE) {
			lda_mle(model, ss, ESTIMATE_ALPHA);
		} else {
			quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
		}

		// check for convergence; a negative change means the bound got
		// worse, so double the per-document iteration budget and continue
		converged = (likelihood_old - likelihood) / (likelihood_old);
		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
		likelihood_old = likelihood;

		// output model and likelihood

		fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
		fflush(likelihood_file);
		if ((i % LAG) == 0)
		{
			snprintf(filename, sizeof(filename), "%s/%03d", directory, i);
			save_lda_model(model, filename);
			snprintf(filename, sizeof(filename), "%s/%03d.gamma", directory, i);
			save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
		}
	}

	// output the final model

	snprintf(filename, sizeof(filename), "%s/final", directory);
	save_lda_model(model, filename);
	snprintf(filename, sizeof(filename), "%s/final.gamma", directory);
	save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

	// output the word assignments (for visualization)

	snprintf(filename, sizeof(filename), "%s/word-assignments.dat", directory);
	FILE* w_asgn_file = fopen(filename, "w");
	short error = 0;
	double tl = 0.0;
	for (d = 0; d < corpus->num_docs; d++)
	{
		if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
		error = 0;
		tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
		// skip documents whose inference failed rather than polluting the sum
		if( error ) { continue; }
		likelihood += tl;
		write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
	}
	fclose(w_asgn_file);
	fclose(likelihood_file);
}
Example #5
0
/// Evaluate a trained TILDA model on a corpus in two ways:
///   1. the variational lower bound on each full document, and
///   2. document completion: infer theta on one half of each document and
///      score the held-out half under expected theta * expected beta.
/// Per-document results and summary files (including perplexity) are written
/// under setting->output_path.
/// @param setting        run configuration; num_topics is overwritten here.
/// @param corpus         the evaluation corpus.
/// @param tree_structure category tree; each node holds the docids it owns.
///                       NOTE(review): passed by value — a const& would avoid
///                       a full copy without affecting callers.
void test_likelihood(t_setting* setting, const t_corpus* corpus, const std::vector<t_cat> tree_structure)
{
	FILE* 	fileptr_lowerbound_result;
	FILE* 	fileptr_lowerbound_summary;
	FILE* 	fileptr_document_completion_result;
	FILE* 	fileptr_document_completion_summary;
	char 	filename[MAX_BUF];

	// snprintf instead of sprintf: model/output paths are external input
	// and could overflow the MAX_BUF buffer
	snprintf(filename, sizeof(filename), "%s_tilda", setting->model_path);
	t_tilda_model* trained_tilda_model = load_tilda_model(filename);

	snprintf(filename, sizeof(filename), "%s_var", setting->model_path);
	t_tilda_var_model* trained_var_model = load_var_model(filename, corpus);

	double**	rho = NULL;
	double* 	old_rho = NULL;
	double* 	nu = NULL;
	double*		dirichlet_prior = NULL;
	double*		expected_theta = NULL;
	double**	expected_beta = NULL;

	const int& K = trained_tilda_model->num_topics;
	setting->num_topics = K;

	oneoverk = 1 / (double) K;

	double 	document_completion_sum_ll = 0.0;
	int		document_completion_sum_num_words = 0;

	double	lowerbound_sum_likelihood = 0;
	int  	lowerbound_sum_num_words = 0;

	// working buffers for variational inference (globals digamma_* are
	// filled by doc_e_step / compute_lambda_statistics)
	nu = zero_init_double_array(K);
	rho = zero_init_double_matrix(corpus->max_length, K);
	old_rho = zero_init_double_array(K);
	dirichlet_prior = zero_init_double_array(K);
	expected_theta = zero_init_double_array(K);
	expected_beta = zero_init_double_matrix(K, corpus->num_terms);

	digamma_nu = zero_init_double_matrix(corpus->num_docs, K);
	digamma_nu_sum = zero_init_double_array(corpus->num_docs);

	digamma_lambda = zero_init_double_matrix(K, corpus->num_terms);
	digamma_lambda_sum = zero_init_double_array(K);

	compute_lambda_statistics(trained_var_model, expected_beta);

	// NOTE(review): fopen results are not checked; a bad output_path would
	// crash on the first fprintf
	snprintf(filename, sizeof(filename), "%s_lowerbound_result", setting->output_path);
	fileptr_lowerbound_result = fopen(filename, "w");

	snprintf(filename, sizeof(filename), "%s_document_completion_result", setting->output_path);
	fileptr_document_completion_result = fopen(filename, "w");

	// size_t indices: these loops previously compared int against
	// std::vector::size() (signed/unsigned mismatch)
	for (size_t i = 0; i < tree_structure.size(); ++i) {
		const double&	alpha_t = trained_tilda_model->alpha[i];
		const double*	kappa_t = trained_var_model->kappa[i];
		const double&	tau_t = trained_var_model->tau[i];

		// per-category Dirichlet prior: alpha_t scaled by kappa_t
		for (int j = 0; j < K; ++j) {
			dirichlet_prior[j] = alpha_t * kappa_t[j];
		}

		for (size_t d = 0; d < tree_structure[i].docids.size(); ++d) {
			const int& docid = tree_structure[i].docids[d];

			// Prior correction terms that doc_e_step's bound omits.
			// These were previously duplicated verbatim for the full
			// document and the inference half; factored into one lambda.
			auto add_prior_terms = [&](double lb) -> double {
				lb += lgamma(alpha_t);
				lb -= (K - alpha_t) * digamma(tau_t);
				lb -= alpha_t * (K - 1) / tau_t;
				for (int j = 0; j < K; ++j) {
					lb -= lgamma(alpha_t * kappa_t[j]) +
							(1 - alpha_t * kappa_t[j]) * (log(kappa_t[j]) - digamma(tau_t * kappa_t[j]));
				}
				for (int j = 0; j < K; ++j) {
					lb += dirichlet_prior[j] * (digamma_nu[docid][j] - digamma_nu_sum[docid]);
				}
				return lb;
			};

			// evaluation using variational bound on the full document
			double this_doc_lowerbound = doc_e_step(&(corpus->docs[docid]), dirichlet_prior, nu,
													digamma_lambda, digamma_lambda_sum, setting,
													docid, rho, old_rho);

			assert(!std::isnan(this_doc_lowerbound));

			this_doc_lowerbound = add_prior_terms(this_doc_lowerbound);

			assert(!std::isnan(this_doc_lowerbound));

			fprintf(fileptr_lowerbound_result, "docid %d\tlower_bound %5.5f\tnum_words %d\n", docid, this_doc_lowerbound, corpus->docs[docid].total);

			lowerbound_sum_likelihood += this_doc_lowerbound;
			lowerbound_sum_num_words += corpus->docs[docid].total;

			// evaluation using document completion: infer on one half,
			// score the held-out half
			t_document*	inference_doc = NULL;
			t_document*	test_doc = NULL;
			split_document(inference_doc, test_doc, &(corpus->docs[docid]));
			double half_doc_lowerbound = doc_e_step(inference_doc, dirichlet_prior, nu,
													digamma_lambda, digamma_lambda_sum, setting,
													docid, rho, old_rho);

			assert(!std::isnan(half_doc_lowerbound));

			half_doc_lowerbound = add_prior_terms(half_doc_lowerbound);

			assert(!std::isnan(half_doc_lowerbound));

			// expected topic proportions: normalized nu from the half-doc fit
			double document_completion_log_likelihood = 0.0;
			double nu_sum = 0.0;
			for (int j = 0; j < K; ++j) {
				nu_sum += nu[j];
			}
			for (int j = 0; j < K; ++j) {
				expected_theta[j] = nu[j] / nu_sum;
			}

			// per-word predictive likelihood on the held-out half;
			// 1e-100 guards log(0)
			for (int n = 0; n < test_doc->length; n++) {
				double this_word_likelihood = 0.0;
				for (int j = 0; j < K; ++j) {
					this_word_likelihood += expected_theta[j] * expected_beta[j][test_doc->words[n]];
				}
				document_completion_log_likelihood += log(this_word_likelihood + 1e-100) * test_doc->counts[n];
			}

			fprintf(fileptr_document_completion_result, "docid %d\thalf_lower_bound %5.5f\tscore %5.5f\ttest_num_words %d\n",
					docid, half_doc_lowerbound, document_completion_log_likelihood, test_doc->total);

			document_completion_sum_ll += document_completion_log_likelihood;
			document_completion_sum_num_words += test_doc->total;

			free_document(inference_doc);
			free_document(test_doc);
		}
	}

	fclose(fileptr_lowerbound_result);
	fclose(fileptr_document_completion_result);

	// summary: lower-bound perplexity over the whole corpus
	double perplexity = exp(-lowerbound_sum_likelihood / (double) lowerbound_sum_num_words);
	snprintf(filename, sizeof(filename), "%s_lowerbound_summary", setting->output_path);
	fileptr_lowerbound_summary = fopen(filename, "w");
	fprintf(fileptr_lowerbound_summary, "sum_lowerbound %5.5f\n", lowerbound_sum_likelihood);
	fprintf(fileptr_lowerbound_summary, "sum_num_words %d\n", lowerbound_sum_num_words);
	fprintf(fileptr_lowerbound_summary, "perplexity %5.5f\n", perplexity);
	fclose(fileptr_lowerbound_summary);

	// summary: document-completion per-word log likelihood and perplexity
	double per_word_ll = document_completion_sum_ll / (double) document_completion_sum_num_words;
	snprintf(filename, sizeof(filename), "%s_document_completion_summary", setting->output_path);
	fileptr_document_completion_summary = fopen(filename, "w");
	fprintf(fileptr_document_completion_summary, "sum_num_words %d\n", document_completion_sum_num_words);
	fprintf(fileptr_document_completion_summary, "per_word_ll %5.5f\n", per_word_ll);
	fprintf(fileptr_document_completion_summary, "perplexity %5.5f\n", exp(-per_word_ll));
	fclose(fileptr_document_completion_summary);

	// release buffers in reverse order of acquisition
	free_double_matrix(digamma_lambda);
	free(digamma_lambda_sum);

	free_double_matrix(digamma_nu);
	free(digamma_nu_sum);

	free_double_matrix(expected_beta);
	free(expected_theta);
	free(dirichlet_prior);
	free(nu);
	free_double_matrix(rho);
	free(old_rho);
	free_var_model(trained_var_model);
	free_tilda_model(trained_tilda_model);

}