Ejemplo n.º 1
0
Archivo: gibbs.cpp Proyecto: DylanV/lda
void gibbs::train(size_t numTopics) {

    this->numTopics = numTopics;

    zero_init_counts();
    random_assign_topics();

    this->beta = 1/numTerms;

    double max_iter = MAX_ITER;

    for(int iter=0; iter<max_iter; ++iter){
        for(int d=0; d<numDocs; ++d){

            document curr_doc = corpus.docs[d];
            int doc_word_index = 0;

            for(auto const& word_count : curr_doc.wordCounts){

                int w = word_count.first;
                int count = word_count.second;

                for(int i=0; i<count; ++i){
                    // get current assignment
                    int z = topic_assignments[d][doc_word_index];
                    // decrement counts
                    n_dk[d][z] -= 1;
                    n_kw[z][w] -= 1;
                    n_k[z] -= 1;
                    n_d[d] -= 1;
                    // get conditional distribution
                    std::vector<double> pz = get_pz(d, w);
                    // update assignment
                    z = sample_multinomial(pz);
                    topic_assignments[d][doc_word_index] = z;
                    doc_word_index++;
                    // update counts
                    n_dk[d][z] += 1;
                    n_kw[z][w] += 1;
                    n_k[z] += 1;
                    n_d[d] += 1;
                }
            }
        }

        std::cout << "<";
        int count = 0;
        int length = 50;
        for(int x=0; x<(iter/max_iter)*length; x++){
            std::cout << "=";
            count++;
        }
        for(int x=0; x<length-count; x++){
            std::cout << "-";
        }
        std::cout << "> " << iter+1 << "/" << max_iter << '\r' << std::flush;
    }
    // update phi and theta
    estimate_parameters();
}
Ejemplo n.º 2
0
/**
 * Allocates the topic-assignment vectors and fills the initial assignment.
 *
 * Both initial_z_ and sampled_z_ are set to zero vectors with one entry per
 * word instance. Each entry of initial_z_ is then drawn with
 * sample_multinomial() from the column of init_beta_sample_ selected by that
 * instance's word id. sampled_z_ is left zero-filled.
 */
void TopicSearch::init_z_with_beta(){

	// One slot per word instance in each vector.
	this->sampled_z_ = zeros<uvec>(this->num_word_instances_);
	this->initial_z_ = zeros<uvec>(this->num_word_instances_);

	// Instances are visited in ascending order, one sampler call per instance.
	size_t instance = 0;
	while (instance < this->num_word_instances_){
		this->initial_z_(instance) = sample_multinomial(
				this->init_beta_sample_.col(this->word_ids_(instance)));
		++instance;
	}

}
Ejemplo n.º 3
0
/**
 * Per-document topic search that mixes two proposal kinds under a
 * temperature schedule.
 *
 * For every document and every iteration a proposal is built either by
 *   - a "random walk" (chosen when sample_uniform() <= random_walk_prob):
 *     start from the current assignment and, independently for each word
 *     with probability percent_random_walk / 100, redraw its topic with
 *     sample_multinomial() from the word's column of init_beta_sample_; or
 *   - a uniform draw: every word gets sample_uniform_int(num_topics_).
 * The proposal is accepted with probability
 *   min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
 * States are stored every `spacing` iterations once the burn-in (if valid)
 * has passed. The element-wise mode of the stored states (find_mode) and the
 * last stored state are then written to sampled_z_, beta_counts_,
 * beta_counts_last_, theta_counts_ and theta_counts_last_.
 *
 * Progress and summary lines are printed to cout unconditionally.
 *
 * @param iter_temperature    temperature per iteration; indexed with every
 *                            iter in [0, max_iterations_) and asserted > 0.
 * @param random_walk_prob    threshold compared with sample_uniform() to pick
 *                            the random-walk proposal.
 * @param percent_random_walk percentage (0-100) converted to the per-word
 *                            redraw probability of a random-walk proposal.
 */
void TopicSearch::run_hybrid_random_walk_simulated_annealing_uniform2(
		vec iter_temperature,
		double random_walk_prob,
		double percent_random_walk){

	Timer timer = Timer();
	timer.restart_time();

	init_z();

	// A burn-in is honoured only when it is positive and shorter than the run.
	const bool valid_burn_in_period =
			(this->burn_in_period_ > 0 && this->burn_in_period_ < this->max_iterations_);

	// Size the sample storage with the very predicate the search loop uses to
	// store a state. A closed form such as ceil(a / b) truncates when the
	// operands are integers and can come out smaller (out-of-bounds column
	// write) or larger (zero-filled columns fed to find_mode) than the number
	// of states actually stored.
	size_t accepted_Z_instances = 0;
	for (size_t iter = 0; iter < this->max_iterations_; iter++){
		if (((iter + 1) % this->spacing == 0)
				&& (!valid_burn_in_period || (this->burn_in_period_ < iter)))
			accepted_Z_instances++;
	}

	for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

		size_t num_words 				= this->document_lengths_[d];
		umat accepted_Z 				= zeros<umat>(num_words, accepted_Z_instances);
		vec accepted_Z_pp 				= zeros<vec>(accepted_Z_instances);
		uvec proposed_Z 				= zeros<uvec>(num_words);
		uvec current_Z 					= zeros<uvec>(num_words);
		size_t acceptance_count 		= 0;
		size_t count 					= 0; // number of states stored so far
		size_t num_random_walks			= 0; // iterations that used the random-walk proposal
		size_t num_random_walks2		= 0; // individual words redrawn inside random walks
		uvec sampled_z;
		uvec sampled_z2;
		vector <size_t> word_indices 	= this->document_word_indices_[d];
		long double ppZ;
		long double ppZ_prime;
		long double tpZ_prime;
		long double tpZ;
		long double p_ratio;
		long double q_ratio;
		double acceptance_probability;
		double multi_jump_prob 			= percent_random_walk / 100.0;

		for (size_t n = 0; n < num_words; n++)
			current_Z(n) 				= this->initial_z_(word_indices[n]);

		ppZ 							= calc_ln_partition_probality(word_indices, current_Z);

		for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

			if (this->sample_uniform() <= random_walk_prob){ // do random walk from the previous state

				num_random_walks++;
				proposed_Z 				= current_Z;

				for (size_t i = 0; i < num_words; i++){
					if (this->sample_uniform() <= multi_jump_prob){ // for each word with some probability multinomial jump
						proposed_Z(i) 	= sample_multinomial(this->init_beta_sample_.col(this->word_ids_(word_indices[i])));
						num_random_walks2++;
					}
				}

			}
			else { // do sample from a uniform

				for (size_t i = 0; i < num_words; i++)
					proposed_Z(i) 		= sample_uniform_int(this->num_topics_);

			}

			ppZ_prime 					= calc_ln_partition_probality(word_indices, proposed_Z);
			p_ratio 					= ppZ_prime - ppZ;

			tpZ_prime = calc_lnTP_hybrid_multi_randomwalk(num_words, word_indices, current_Z, random_walk_prob, multi_jump_prob);
			tpZ = calc_lnTP_hybrid_multi_randomwalk(num_words, word_indices, proposed_Z, random_walk_prob, multi_jump_prob);
			q_ratio = tpZ_prime - tpZ;

			assert(iter_temperature(iter) > 0);
			// MH acceptance probability. exp(x / T) is mathematically the same
			// as pow(exp(x), 1 / T) but does not underflow to 0 when x is very
			// negative while x / T is still moderate.
			acceptance_probability = min(1.0L, exp((p_ratio + q_ratio) / iter_temperature(iter)));

			if (this->sample_uniform() <= acceptance_probability){
				current_Z 				= proposed_Z;
				ppZ 					= ppZ_prime; // avoids recomputing it next iteration
				acceptance_count++;
			}

			if (((iter + 1) % this->spacing == 0)
					&& (!valid_burn_in_period || (this->burn_in_period_ < iter))) {
				accepted_Z.col(count) 	= current_Z;
				// Log-probability of the state that is kept, which differs from
				// ppZ_prime whenever the proposal was rejected.
				accepted_Z_pp(count) 	= ppZ;
				count++;
			}

		} // END TOPIC SEARCH


		// Saves the results to the class variable
		if (count > 0){
			sampled_z 					= find_mode(accepted_Z);
			sampled_z2 					= accepted_Z.col(count - 1);
		}
		else {
			// No state was stored (e.g. spacing exceeds the usable run length):
			// fall back to the final state instead of indexing column count - 1.
			sampled_z 					= current_Z;
			sampled_z2 					= current_Z;
		}

		for (size_t n = 0; n < num_words; n++){
			this->sampled_z_(word_indices[n]) = sampled_z(n);
			this->beta_counts_(this->sampled_z_(word_indices[n]), this->word_ids_(word_indices[n])) += 1;
			this->beta_counts_last_(sampled_z2(n), this->word_ids_(word_indices[n])) += 1;
		}


		// Calculates theta counts
		this->theta_counts_.col(d) 		= calc_topic_counts(sampled_z, this->num_topics_);
		this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

		// Printed regardless of verbose_.
		cout << "doc " << d + 1 << " accepted # " << acceptance_count << " random walks # " << num_random_walks << " actual random walks # " << num_random_walks2 << endl;

	} // END For each document

	// Summary, printed regardless of verbose_.
	cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
	cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
	cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
}
Ejemplo n.º 4
0
/**
 * Per-document topic search that mixes a random-walk proposal with a
 * topic-specific multinomial proposal under a temperature schedule.
 *
 * For every document and every iteration a proposal is built either by
 *   - a random walk (chosen when sample_uniform() <= random_walk_prob):
 *     ceil(percent_random_walk / 100 * num_words) times, pick a word with
 *     sample_uniform_int(num_words) and give it a topic different from its
 *     topic in the current state; or
 *   - redrawing every word's topic with sample_multinomial() from the column
 *     of the init_topical_Multinomial_probabilities() matrix selected by the
 *     word's current topic.
 * The proposal is accepted with probability
 *   min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
 * States are stored on iterations that are multiples of `spacing` once the
 * burn-in (if valid) has passed. The element-wise mode of the stored states
 * (find_mode) and the last stored state are then written to sampled_z_,
 * beta_counts_, beta_counts_last_, theta_counts_ and theta_counts_last_.
 *
 * Diagnostic output goes to cout when verbose_ >= 1.
 *
 * @param iter_temperature    temperature per iteration; indexed with every
 *                            iter in [0, max_iterations_).
 * @param random_walk_prob    threshold compared with sample_uniform() to pick
 *                            the random-walk proposal.
 * @param percent_random_walk percentage of a document's words changed by one
 *                            random-walk proposal.
 */
void TopicSearch::run_hybrid_random_walk_simulated_annealing(
		vec iter_temperature,
		double random_walk_prob,
		double percent_random_walk){

	Timer timer = Timer();
	timer.restart_time();

	init_z();
	mat multinomial_prob = init_topical_Multinomial_probabilities();

	if (this->verbose_ >= 1)
		cout << "Multinomial probabilities: " << endl << multinomial_prob;

	// A burn-in is honoured only when it is positive and shorter than the run.
	const bool valid_burn_in_period =
			(this->burn_in_period_ > 0 && this->burn_in_period_ < this->max_iterations_);

	// Size the sample storage with the very predicate the search loop uses to
	// store a state. ceil(max_iterations_ / spacing) truncates before ceil()
	// when the operands are integers, which under-allocates whenever spacing
	// does not divide max_iterations_ (iterations 0, s, 2s, ... are stored) and
	// leads to an out-of-bounds column write.
	size_t accepted_Z_instances = 0;
	for (size_t iter = 0; iter < this->max_iterations_; iter++){
		if ((iter % this->spacing == 0)
				&& (!valid_burn_in_period || (this->burn_in_period_ < iter)))
			accepted_Z_instances++;
	}

	for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

		size_t num_words = this->document_lengths_[d];
		umat accepted_Z = zeros<umat>(num_words, accepted_Z_instances);
		uvec proposed_Z = zeros<uvec>(num_words);
		uvec current_Z = zeros<uvec>(num_words);
		size_t acceptance_count = 0;
		size_t count = 0; // number of states stored so far
		size_t random_walk_count = ceil((percent_random_walk / 100) * num_words);
		size_t num_random_walks			= 0;
		uvec sampled_z;
		uvec sampled_z2;

		vector <size_t> word_indices = this->document_word_indices_[d];
		for (size_t n = 0; n < num_words; n++)
			current_Z(n) = this->initial_z_(word_indices[n]);
		long double ppZ = calc_ln_partition_probality(word_indices, current_Z);

		for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

			if (this->sample_uniform() <= random_walk_prob){ // do random walk from the previous state

				num_random_walks++;
				proposed_Z = current_Z;

				// With fewer than two topics there is no "different" topic to
				// move to, so the rejection loop below would never terminate;
				// the proposal then simply equals the current state.
				if (this->num_topics_ > 1){
					for (size_t s = 0; s < random_walk_count; s++){
						size_t idx = sample_uniform_int(num_words); // selects a word at random
						while(1){ // redraw until the topic differs from the current one
							size_t topic = sample_uniform_int(this->num_topics_);
							if (topic != current_Z(idx)){
								proposed_Z(idx) = topic;
								break;
							}
						}
					}
				}

			}
			else { // do sample from the topic specific Multinomial

				for (size_t i = 0; i < num_words; i++)
					proposed_Z(i) = sample_multinomial(multinomial_prob.col(current_Z(i)));

			}

			long double ppZ_prime = calc_ln_partition_probality(word_indices, proposed_Z);

			long double tpZ_prime = calc_lnTP_hybrid_randomwalk(
					num_words, proposed_Z, current_Z, multinomial_prob, random_walk_prob, random_walk_count);
			long double tpZ = calc_lnTP_hybrid_randomwalk(
					num_words, current_Z, proposed_Z, multinomial_prob, random_walk_prob, random_walk_count);
			long double p_ratio = ppZ_prime - ppZ;
			long double q_ratio = tpZ_prime - tpZ;
			// MH acceptance probability. exp(x / T) is mathematically the same
			// as pow(exp(x), 1 / T) but does not underflow to 0 when x is very
			// negative while x / T is still moderate.
			double acceptance_probability = min(1.0L, exp((p_ratio + q_ratio) / iter_temperature(iter)));

			if (this->sample_uniform() <= acceptance_probability){
				ppZ = ppZ_prime; // To avoid re-calculation
				current_Z = proposed_Z;
				acceptance_count++;
			}

			if ((iter % this->spacing == 0)
					&& (!valid_burn_in_period || (this->burn_in_period_ < iter))) {
				accepted_Z.col(count) = current_Z;
				count++;
			}

		} // END TOPIC SEARCH


		// Saves the results to the class variable
		if (count > 0){
			sampled_z 						= find_mode(accepted_Z);
			sampled_z2 						= accepted_Z.col(count - 1);
		}
		else {
			// No state was stored (e.g. max_iterations_ is 0 or the burn-in
			// swallows every stored iteration): fall back to the final state
			// instead of indexing column count - 1.
			sampled_z 						= current_Z;
			sampled_z2 						= current_Z;
		}

		for (size_t n = 0; n < num_words; n++){
			this->sampled_z_(word_indices[n]) = sampled_z(n);
			this->beta_counts_(this->sampled_z_(word_indices[n]), this->word_ids_(word_indices[n])) += 1;
			this->beta_counts_last_(sampled_z2(n), this->word_ids_(word_indices[n])) += 1;
		}


		// Calculates theta counts
		this->theta_counts_.col(d) 		= calc_topic_counts(sampled_z, this->num_topics_);
		this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

		if (this->verbose_ >= 1)
			cout << "doc " << d + 1 << " accepted # " << acceptance_count << " random walks # " << num_random_walks << endl;

	} // END For each document

	if (this->verbose_ >= 1){
		cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
		cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
		cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
	}

}