Exemplo n.º 1
0
/**
* Prepares the per-word-instance topic assignment vectors.
*
* sampled_z_ and initial_z_ are both (re)created as zero vectors with
* num_word_instances_ entries; every entry of initial_z_ is then
* overwritten with the value returned by sample_uniform_int(num_topics_).
* sampled_z_ is left zero-filled.
**/
void TopicSearch::init_z(){

	// One slot per word instance in each vector; both start out as zeros.
	this->sampled_z_ = zeros<uvec>(this->num_word_instances_);
	this->initial_z_ = zeros<uvec>(this->num_word_instances_);

	// Draw the starting topic of every word instance.
	size_t instance = 0;
	while (instance < this->num_word_instances_){
		this->initial_z_(instance) = sample_uniform_int(this->num_topics_);
		++instance;
	}

}
Exemplo n.º 2
0
/**
* Gives each of the N tokens a topic drawn with sample_uniform_int(T) and
* updates the count arrays accordingly.
*
* Arguments:
* 		N   - number of tokens to process
* 		T   - argument passed to sample_uniform_int for every draw
* 		w   - w[i] is used as the row index into Nwt for token i
* 		d   - not used by this function
* 		z   - output: z[i] receives the topic drawn for token i
* 		Nwt - Nwt[w[i]][t] is incremented for every token i assigned topic t
* 		Ndt - Ndt[t] is incremented for every token assigned topic t
* 		Nt  - Nt[t] is incremented for every token assigned topic t
*
* The count arrays are only incremented, never cleared, here.
**/
void randomassignment_for_doc(int N, int T, int *w, int *d, int *z, double **Nwt, double *Ndt, double *Nt)
{
	for (int pos = 0; pos < N; ++pos) {
		const int topic = sample_uniform_int(T);

		z[pos] = topic;

		// Register the assignment in all three count tables.
		Nwt[w[pos]][topic] += 1;
		Ndt[topic] += 1;
		Nt[topic] += 1;
	}
}
Exemplo n.º 3
0
/**
* Builds a shuffled vector of the indices 0, 1, ..., n-1.
*
* The vector is first filled with order(k) = k. It is then shuffled from the
* back: for the still-unshuffled prefix of length `remaining`, a position
* returned by sample_uniform_int(remaining) is swapped with the last
* position of that prefix, and the prefix shrinks by one.
*
* Arguments:
* 		n - the number of indices to permute
* Return:
* 		order - a vector of length n holding each of the values
* 				0 .. n-1 exactly once (empty when n is 0)
**/
arma::uvec randperm(unsigned int n) {
  arma::uvec order = arma::zeros<arma::uvec>(n);

  // Start from the identity arrangement.
  for (unsigned int idx = 0; idx < n; ++idx) order(idx) = idx;

  // Shrink the unshuffled prefix one element at a time.
  for (unsigned int remaining = n; remaining > 0; --remaining) {
    const unsigned int pick = sample_uniform_int(remaining); // position within the prefix
    const unsigned int last = remaining - 1;

    const unsigned int held = order(last);
    order(last) = order(pick);
    order(pick) = held;
  }

  return order;
}
Exemplo n.º 4
0
/**
* Builds a shuffled array of the indices 0, 1, ..., n-1.
*
* The array comes from ivec(n) and is first filled with order[k] = k. It is
* then shuffled from the back: for the still-unshuffled prefix of length
* `remaining`, a position returned by sample_uniform_int(remaining) is
* swapped with the last position of that prefix.
*
* Arguments:
* 		n - the number of indices to permute
* Return:
* 		the array obtained from ivec(n)
* 		NOTE(review): presumably the caller has to release it the way ivec
* 		buffers are released - confirm against ivec's definition
**/
int *randperm(int n)
{
	int *order = ivec(n);

	// Start from the identity arrangement.
	for (int idx = 0; idx < n; ++idx) order[idx] = idx;

	// Shrink the unshuffled prefix one element at a time.
	for (int remaining = n; remaining > 0; --remaining) {
		const int pick = sample_uniform_int(remaining); // position within the prefix
		const int last = remaining - 1;

		const int held = order[last];
		order[last] = order[pick];
		order[pick] = held;
	}

	return order;
}
Exemplo n.º 5
0
/**
* Runs, document by document, a Metropolis-Hastings topic search with
* simulated annealing that mixes a multinomial "random walk" proposal with
* a uniform proposal.
*
* For every document the chain starts from initial_z_ (filled by init_z()).
* In each iteration a proposal is built in one of two ways:
* 	- with probability random_walk_prob: copy the current state and, for
* 	  each word independently with probability percent_random_walk / 100,
* 	  redraw its topic with sample_multinomial() from the column of
* 	  init_beta_sample_ selected by that word's id;
* 	- otherwise: redraw every word's topic with sample_uniform_int().
* The proposal is accepted with probability
* 	min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
*
* The current state is recorded whenever (iter + 1) is a multiple of
* `spacing` and, if a valid burn-in period is configured, iter is past it.
* Per document, the mode of the recorded states (find_mode) is written to
* sampled_z_ and counted in beta_counts_ / theta_counts_; the last recorded
* state is counted in beta_counts_last_ / theta_counts_last_. The beta count
* matrices are incremented, not cleared, here. If no state was recorded at
* all, the final state of the chain is used for both.
*
* A per-document summary and the final timing / perplexity / log partition
* probability are always printed to cout.
*
* Arguments:
* 		iter_temperature    - temperature per iteration; indexed by the
* 				iteration number, so it needs at least max_iterations_
* 				entries, each strictly positive (asserted)
* 		random_walk_prob    - probability of choosing the random-walk proposal
* 		percent_random_walk - per-word redraw probability of the random-walk
* 				proposal, given as a percentage
**/
void TopicSearch::run_hybrid_random_walk_simulated_annealing_uniform2(
		vec iter_temperature,
		double random_walk_prob,
		double percent_random_walk){

    Timer timer = Timer();
	timer.restart_time();

	init_z();

	// A burn-in period is only honoured when it is positive and shorter than the run.
	const bool valid_burn_in_period =
			(this->burn_in_period_ > 0 && this->burn_in_period_ < this->max_iterations_);

	// Upper bound on the number of states recorded per document. The division
	// is carried out in floating point on purpose: dividing integer operands
	// directly truncates before ceil() sees the value, which can leave
	// accepted_Z with fewer columns than states that get recorded.
	const double sampling_window = valid_burn_in_period
			? static_cast<double>(this->max_iterations_ - this->burn_in_period_)
			: static_cast<double>(this->max_iterations_);
	size_t accepted_Z_instances = 0;
	if (sampling_window > 0)
		accepted_Z_instances = static_cast<size_t>(ceil(sampling_window / static_cast<double>(this->spacing)));



	for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

		size_t num_words 				= this->document_lengths_[d];
		umat accepted_Z 				= zeros<umat>(num_words, accepted_Z_instances);
		vec accepted_Z_pp 				= zeros<vec>(accepted_Z_instances);
		uvec proposed_Z 				= zeros<uvec>(num_words);
		uvec current_Z 					= zeros<uvec>(num_words);
		size_t acceptance_count 		= 0;
		size_t count 					= 0; // number of states recorded so far
		size_t num_random_walks			= 0; // iterations that used the random-walk proposal
		size_t num_random_walks2		= 0; // individual words redrawn by those proposals
		uvec sampled_z;
		uvec sampled_z2;
		vector <size_t> word_indices 	= this->document_word_indices_[d];
		long double ppZ;
		long double ppZ_prime;
		long double tpZ_prime;
		long double tpZ;
		long double p_ratio;
		long double q_ratio;
		double acceptance_probability;
		double multi_jump_prob 			= percent_random_walk / 100.0;

		for (size_t n = 0; n < num_words; n++)
			current_Z(n) 				= this->initial_z_(word_indices[n]);

		ppZ 							= calc_ln_partition_probality(word_indices, current_Z);

		for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

			if (this->sample_uniform() <= random_walk_prob){ // do random walk from the previous state

				num_random_walks++;
				proposed_Z 				= current_Z;

				for (size_t i = 0; i < num_words; i++){
					if (this->sample_uniform() <= multi_jump_prob){ /// for each word with some probability multinomial jump
						proposed_Z(i) 	= sample_multinomial(this->init_beta_sample_.col(this->word_ids_(word_indices[i])));
						num_random_walks2++;
					}
				}

			}
			else { // do sample from a uniform

				for (size_t i = 0; i < num_words; i++)
					proposed_Z(i) 		= sample_uniform_int(this->num_topics_);

			}

			ppZ_prime 					= calc_ln_partition_probality(word_indices, proposed_Z);
			p_ratio 					= ppZ_prime - ppZ;

			tpZ_prime = calc_lnTP_hybrid_multi_randomwalk(num_words, word_indices, current_Z, random_walk_prob, multi_jump_prob);
			tpZ = calc_lnTP_hybrid_multi_randomwalk(num_words, word_indices, proposed_Z, random_walk_prob, multi_jump_prob);
			q_ratio = tpZ_prime - tpZ;

			assert(iter_temperature(iter) > 0);

			// MH acceptance probability min(1, exp(p_ratio + q_ratio)^(1/T)).
			// The exponent is tempered BEFORE exponentiating: exp() of the raw
			// log ratio can underflow to 0 (or overflow) for long documents,
			// which would discard the effect of the temperature.
			const long double ln_acceptance = (p_ratio + q_ratio) / iter_temperature(iter);
			acceptance_probability = (ln_acceptance < 0) ? static_cast<double>(exp(ln_acceptance)) : 1.0;


			if (this->sample_uniform() <= acceptance_probability){
				current_Z 				= proposed_Z;
				ppZ 					= ppZ_prime;
				acceptance_count++;
//				if (this->verbose_ >= 1){
//					cout << "doc " << d + 1 << " iter " << iter + 1;
//					cout << " accepted";
//					cout << " [a.p. = " << acceptance_probability
//							<< " ln P(z') = " << ppZ_prime << " ln P(z) = " << ppZ
//							<< " a.count = " << acceptance_count << " ]" << endl;
//				}
			}

			if (((iter + 1) % this->spacing == 0) && (!valid_burn_in_period || (this->burn_in_period_ < iter))) {
				accepted_Z.col(count) 	= current_Z;
				accepted_Z_pp(count) 	= ppZ; // log probability of the state actually stored (the proposal may have been rejected)
				count++;
			}

		} // END TOPIC SEARCH


		// Saves the results to the class variable
		if (count == 0){
			// Nothing was recorded (e.g. the run is shorter than the spacing):
			// fall back to the final state of the chain.
			sampled_z 					= current_Z;
			sampled_z2 					= current_Z;
		}
		else {
			// Drop columns that were allocated but never filled, so that their
			// zeros do not take part in the mode computation.
			if (count < accepted_Z.n_cols)
				accepted_Z.shed_cols(count, accepted_Z.n_cols - 1);

			sampled_z 					= find_mode(accepted_Z);
			sampled_z2 					= accepted_Z.col(count - 1);
		}

		for (size_t n = 0; n < num_words; n++){
			this->sampled_z_(word_indices[n]) = sampled_z(n);
			this->beta_counts_(this->sampled_z_(word_indices[n]), this->word_ids_(word_indices[n])) += 1;
			this->beta_counts_last_(sampled_z2(n), this->word_ids_(word_indices[n])) += 1;
		}


		// Calculates theta counts
		this->theta_counts_.col(d) 		= calc_topic_counts(sampled_z, this->num_topics_);
		this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

//		if (this->verbose_ >= 1)
			cout << "doc " << d + 1 << " accepted # " << acceptance_count << " random walks # " << num_random_walks << " actual random walks # " << num_random_walks2 << endl;

	} // END For each document





//	if (this->verbose_ >= 1){
		cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
		cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
		cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
//	}
}
Exemplo n.º 6
0
/**
* Runs, document by document, a Metropolis-Hastings topic search with
* simulated annealing that mixes a uniform "random walk" proposal with a
* topic-specific multinomial proposal.
*
* For every document the chain starts from initial_z_ (filled by init_z()).
* In each iteration a proposal is built in one of two ways:
* 	- with probability random_walk_prob: copy the current state, then
* 	  ceil(percent_random_walk / 100 * num_words) times pick a word position
* 	  with sample_uniform_int() (positions may repeat) and move it to a
* 	  uniformly drawn topic that differs from its current topic;
* 	- otherwise: redraw every word's topic with sample_multinomial() from the
* 	  column of the matrix returned by
* 	  init_topical_Multinomial_probabilities() that is selected by the word's
* 	  current topic.
* The proposal is accepted with probability
* 	min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
*
* The current state is recorded whenever iter is a multiple of `spacing`
* and, if a valid burn-in period is configured, iter is past it. Per
* document, the mode of the recorded states (find_mode) is written to
* sampled_z_ and counted in beta_counts_ / theta_counts_; the last recorded
* state is counted in beta_counts_last_ / theta_counts_last_. The beta count
* matrices are incremented, not cleared, here. If no state was recorded at
* all, the final state of the chain is used for both.
*
* Progress and summary output goes to cout when verbose_ >= 1.
*
* Arguments:
* 		iter_temperature    - temperature per iteration; indexed by the
* 				iteration number, so it needs at least max_iterations_
* 				entries; it is used as a divisor and is expected to be
* 				strictly positive
* 		random_walk_prob    - probability of choosing the random-walk proposal
* 		percent_random_walk - share of a document's words changed by one
* 				random-walk proposal, given as a percentage
**/
void TopicSearch::run_hybrid_random_walk_simulated_annealing(
		vec iter_temperature,
		double random_walk_prob,
		double percent_random_walk){

    Timer timer = Timer();
	timer.restart_time();

	init_z();
	mat multinomial_prob = init_topical_Multinomial_probabilities();

	if (this->verbose_ >= 1)
		cout << "Multinomial probabilities: " << endl << multinomial_prob;

	// A burn-in period is only honoured when it is positive and shorter than the run.
	const bool valid_burn_in_period =
			(this->burn_in_period_ > 0 && this->burn_in_period_ < this->max_iterations_);

	// Upper bound on the number of states recorded per document. The division
	// is carried out in floating point on purpose: dividing integer operands
	// directly truncates before ceil() sees the value, which leaves accepted_Z
	// one column short whenever the window is not a multiple of the spacing
	// (iteration 0 is itself a recording point).
	const double sampling_window = valid_burn_in_period
			? static_cast<double>(this->max_iterations_ - this->burn_in_period_)
			: static_cast<double>(this->max_iterations_);
	size_t accepted_Z_instances = 0;
	if (sampling_window > 0)
		accepted_Z_instances = static_cast<size_t>(ceil(sampling_window / static_cast<double>(this->spacing)));

	for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

		size_t num_words = this->document_lengths_[d];
		umat accepted_Z = zeros<umat>(num_words, accepted_Z_instances);
		uvec proposed_Z = zeros<uvec>(num_words);
		uvec current_Z = zeros<uvec>(num_words);
		size_t acceptance_count = 0;
		size_t count = 0; // number of states recorded so far
		size_t random_walk_count = ceil((percent_random_walk / 100) * num_words); // words changed per random-walk proposal
		size_t num_random_walks			= 0;
		uvec sampled_z;
		uvec sampled_z2;

		vector <size_t> word_indices = this->document_word_indices_[d];
		for (size_t n = 0; n < num_words; n++)
			current_Z(n) = this->initial_z_(word_indices[n]);
		long double ppZ = calc_ln_partition_probality(word_indices, current_Z);

		for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

			if (this->sample_uniform() <= random_walk_prob){ // do random walk from the previous state

				num_random_walks++;
				proposed_Z = current_Z;
				for (size_t s = 0; s < random_walk_count; s++){
					size_t idx = sample_uniform_int(num_words); // selects a word at random

					// With fewer than two topics there is no different topic to
					// move to; the rejection loop below would never terminate.
					if (this->num_topics_ < 2)
						continue;

					while(1){
						size_t topic = sample_uniform_int(this->num_topics_);
						if (topic != current_Z(idx)){
							proposed_Z(idx) = topic;
							break;
						}
					}
				}

			}
			else { // do sample from the topic specific Multinomial

				for (size_t i = 0; i < num_words; i++)
					proposed_Z(i) = sample_multinomial(multinomial_prob.col(current_Z(i)));

			}

			long double ppZ_prime = calc_ln_partition_probality(word_indices, proposed_Z);

			long double tpZ_prime = calc_lnTP_hybrid_randomwalk(
					num_words, proposed_Z, current_Z, multinomial_prob, random_walk_prob, random_walk_count);
			long double tpZ = calc_lnTP_hybrid_randomwalk(
					num_words, current_Z, proposed_Z, multinomial_prob, random_walk_prob, random_walk_count);
			long double p_ratio = ppZ_prime - ppZ;
			long double q_ratio = tpZ_prime - tpZ;

			// MH acceptance probability min(1, exp(p_ratio + q_ratio)^(1/T)).
			// The exponent is tempered BEFORE exponentiating: exp() of the raw
			// log ratio can underflow to 0 (or overflow) for long documents,
			// which would discard the effect of the temperature.
			const long double ln_acceptance = (p_ratio + q_ratio) / iter_temperature(iter);
			double acceptance_probability = (ln_acceptance < 0) ? static_cast<double>(exp(ln_acceptance)) : 1.0;

			if (this->sample_uniform() <= acceptance_probability){
				ppZ = ppZ_prime; // To avoid re-calculation
				current_Z = proposed_Z;
				acceptance_count++;

//				if (this->verbose_ >= 1){
//
//					cout << "doc " << d + 1 << " iter " << iter + 1;
//					cout << " accepted";
//					cout << " [a.p. = " << acceptance_probability
//							// << " t.ratio = " << exp(q_ratio) << " p.ratio = " << exp(p_ratio)
//							<< " ln P(z') = " << ppZ_prime << " ln P(z) = " << ppZ
//							<< " ln T(z',z) = " << tpZ_prime << " ln T(z,z') = " << tpZ
//							<< " a.count = " << acceptance_count << " ]" << endl;
//				}

			}

			if ((iter % this->spacing == 0)
					&& (!valid_burn_in_period || (this->burn_in_period_ < iter))) {
				accepted_Z.col(count) = current_Z;
				count++;
			}

		} // END TOPIC SEARCH


		// Saves the results to the class variable
		if (count == 0){
			// Nothing was recorded (e.g. no iterations were run):
			// fall back to the final state of the chain.
			sampled_z 					= current_Z;
			sampled_z2 					= current_Z;
		}
		else {
			// Drop columns that were allocated but never filled, so that their
			// zeros do not take part in the mode computation.
			if (count < accepted_Z.n_cols)
				accepted_Z.shed_cols(count, accepted_Z.n_cols - 1);

			sampled_z 					= find_mode(accepted_Z);
			sampled_z2 					= accepted_Z.col(count - 1);
		}

		for (size_t n = 0; n < num_words; n++){
			this->sampled_z_(word_indices[n]) = sampled_z(n);
			this->beta_counts_(this->sampled_z_(word_indices[n]), this->word_ids_(word_indices[n])) += 1;
			this->beta_counts_last_(sampled_z2(n), this->word_ids_(word_indices[n])) += 1;
		}


		// Calculates theta counts
		this->theta_counts_.col(d) 		= calc_topic_counts(sampled_z, this->num_topics_);
		this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

		if (this->verbose_ >= 1)
			cout << "doc " << d + 1 << " accepted # " << acceptance_count << " random walks # " << num_random_walks << endl;

	} // END For each document

	if (this->verbose_ >= 1){
		cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
		cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
		cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
	}

}