void gibbs::train(size_t numTopics) { this->numTopics = numTopics; zero_init_counts(); random_assign_topics(); this->beta = 1/numTerms; double max_iter = MAX_ITER; for(int iter=0; iter<max_iter; ++iter){ for(int d=0; d<numDocs; ++d){ document curr_doc = corpus.docs[d]; int doc_word_index = 0; for(auto const& word_count : curr_doc.wordCounts){ int w = word_count.first; int count = word_count.second; for(int i=0; i<count; ++i){ // get current assignment int z = topic_assignments[d][doc_word_index]; // decrement counts n_dk[d][z] -= 1; n_kw[z][w] -= 1; n_k[z] -= 1; n_d[d] -= 1; // get conditional distribution std::vector<double> pz = get_pz(d, w); // update assignment z = sample_multinomial(pz); topic_assignments[d][doc_word_index] = z; doc_word_index++; // update counts n_dk[d][z] += 1; n_kw[z][w] += 1; n_k[z] += 1; n_d[d] += 1; } } } std::cout << "<"; int count = 0; int length = 50; for(int x=0; x<(iter/max_iter)*length; x++){ std::cout << "="; count++; } for(int x=0; x<length-count; x++){ std::cout << "-"; } std::cout << "> " << iter+1 << "/" << max_iter << '\r' << std::flush; } // update phi and theta estimate_parameters(); }
/**
 * Allocates the topic-assignment vectors and draws an initial topic for
 * every word instance.
 *
 * Both initial_z_ and sampled_z_ are reset to zero vectors of length
 * num_word_instances_. Each entry of initial_z_ is then set to a draw from
 * sample_multinomial() over the column of init_beta_sample_ selected by that
 * instance's entry in word_ids_.
 */
void TopicSearch::init_z_with_beta(){
    const auto total_instances = this->num_word_instances_;

    this->initial_z_ = zeros<uvec>(total_instances);
    this->sampled_z_ = zeros<uvec>(total_instances);

    size_t instance = 0;
    while (instance < total_instances){
        // Column of init_beta_sample_ belonging to this instance's word id.
        const auto vocab_id = this->word_ids_(instance);
        this->initial_z_(instance) =
                sample_multinomial(this->init_beta_sample_.col(vocab_id));
        ++instance;
    }
}
/**
 * Per-document topic search with a tempered accept/reject step and a hybrid
 * proposal.
 *
 * At each iteration the proposal is, with probability random_walk_prob, a
 * copy of the current state in which every word is independently re-drawn
 * (with probability percent_random_walk / 100) from the init_beta_sample_
 * column of its word id; otherwise every word gets a topic from
 * sample_uniform_int(num_topics_). The proposal is accepted with probability
 * min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
 *
 * The state is retained after every iteration where (iter + 1) is a multiple
 * of spacing and, when a valid burn-in period is configured, iter is greater
 * than burn_in_period_. For each document the mode of the retained states
 * (find_mode) and the last retained state update sampled_z_, beta_counts_,
 * beta_counts_last_, theta_counts_ and theta_counts_last_. Timing, perplexity
 * and log partition probability are printed to cout at the end.
 *
 * @param iter_temperature      temperature per iteration; indexed by iter, so
 *                              it must hold at least max_iterations_ positive
 *                              entries (positivity is asserted).
 * @param random_walk_prob      probability of proposing a random-walk move.
 * @param percent_random_walk   per-word re-draw probability in percent.
 */
void TopicSearch::run_hybrid_random_walk_simulated_annealing_uniform2(
        vec iter_temperature,
        double random_walk_prob,
        double percent_random_walk){

    Timer timer = Timer();
    timer.restart_time();

    init_z();

    const bool valid_burn_in_period =
            (this->burn_in_period_ > 0
                    && this->burn_in_period_ < this->max_iterations_);

    // Single source of truth for "is the state after iteration `iter` kept?".
    const auto keep_sample = [this, valid_burn_in_period](size_t iter) -> bool {
        return ((iter + 1) % this->spacing == 0)
                && (!valid_burn_in_period || (this->burn_in_period_ < iter));
    };

    // Count the retained samples with the very predicate the search loop
    // uses. The previous ceil(a / b) on integer operands truncated before
    // rounding and could disagree with the loop, causing either an
    // out-of-range column write or spurious all-zero columns fed to
    // find_mode().
    size_t accepted_Z_instances = 0;
    for (size_t iter = 0; iter < this->max_iterations_; iter++){
        if (keep_sample(iter))
            accepted_Z_instances++;
    }

    double multi_jump_prob = percent_random_walk / 100.0;

    for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

        size_t num_words = this->document_lengths_[d];
        umat accepted_Z = zeros<umat>(num_words, accepted_Z_instances);
        uvec proposed_Z = zeros<uvec>(num_words);
        uvec current_Z = zeros<uvec>(num_words);
        size_t acceptance_count = 0;
        size_t count = 0;              // retained samples stored so far
        size_t num_random_walks = 0;   // iterations that took the random-walk branch
        size_t num_random_walks2 = 0;  // individual words actually re-drawn
        vector <size_t> word_indices = this->document_word_indices_[d];

        for (size_t n = 0; n < num_words; n++)
            current_Z(n) = this->initial_z_(word_indices[n]);

        long double ppZ = calc_ln_partition_probality(word_indices, current_Z);

        for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

            if (this->sample_uniform() <= random_walk_prob){
                // Random walk from the previous state.
                num_random_walks++;
                proposed_Z = current_Z;
                for (size_t i = 0; i < num_words; i++){
                    if (this->sample_uniform() <= multi_jump_prob){
                        // With some probability, re-draw this word's topic.
                        proposed_Z(i) = sample_multinomial(
                                this->init_beta_sample_.col(
                                        this->word_ids_(word_indices[i])));
                        num_random_walks2++;
                    }
                }
            }
            else {
                // Independent proposal: every word gets a uniformly drawn topic.
                for (size_t i = 0; i < num_words; i++)
                    proposed_Z(i) = sample_uniform_int(this->num_topics_);
            }

            long double ppZ_prime = calc_ln_partition_probality(word_indices, proposed_Z);
            long double p_ratio = ppZ_prime - ppZ;

            long double tpZ_prime = calc_lnTP_hybrid_multi_randomwalk(
                    num_words, word_indices, current_Z,
                    random_walk_prob, multi_jump_prob);
            long double tpZ = calc_lnTP_hybrid_multi_randomwalk(
                    num_words, word_indices, proposed_Z,
                    random_walk_prob, multi_jump_prob);
            long double q_ratio = tpZ_prime - tpZ;

            assert(iter_temperature(iter) > 0);

            // Temper in log space: exp(x / T) equals pow(exp(x), 1 / T)
            // mathematically, but does not underflow/overflow in the inner
            // exp() before the temperature is applied.
            double acceptance_probability = min(1.0L,
                    exp((p_ratio + q_ratio) / iter_temperature(iter)));

            if (this->sample_uniform() <= acceptance_probability){
                current_Z = proposed_Z;
                ppZ = ppZ_prime; // avoids re-computing it next iteration
                acceptance_count++;
            }

            if (keep_sample(iter)){
                accepted_Z.col(count) = current_Z;
                count++;
            }

        } // END TOPIC SEARCH

        // Summarise the retained states. If nothing was retained (e.g. spacing
        // larger than the number of usable iterations) fall back to the final
        // chain state instead of indexing column (count - 1) with count == 0.
        uvec sampled_z;
        uvec sampled_z2;
        if (count > 0){
            sampled_z = find_mode(accepted_Z);
            sampled_z2 = accepted_Z.col(count - 1);
        }
        else {
            sampled_z = current_Z;
            sampled_z2 = current_Z;
        }

        // Saves the results to the class variables
        for (size_t n = 0; n < num_words; n++){
            this->sampled_z_(word_indices[n]) = sampled_z(n);
            this->beta_counts_(this->sampled_z_(word_indices[n]),
                    this->word_ids_(word_indices[n])) += 1;
            this->beta_counts_last_(sampled_z2(n),
                    this->word_ids_(word_indices[n])) += 1;
        }

        // Calculates theta counts
        this->theta_counts_.col(d) = calc_topic_counts(sampled_z, this->num_topics_);
        this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

        // Printed regardless of verbose_ (the verbosity check is disabled here).
        cout << "doc " << d + 1 << " accepted # " << acceptance_count
             << " random walks # " << num_random_walks
             << " actual random walks # " << num_random_walks2 << endl;

    } // END For each document

    // Printed regardless of verbose_ (the verbosity check is disabled here).
    cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
    cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
    cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
}
/**
 * Per-document topic search with a tempered accept/reject step and a hybrid
 * proposal built from topic-specific multinomials.
 *
 * At each iteration the proposal is, with probability random_walk_prob, a
 * copy of the current state in which ceil(percent_random_walk / 100 *
 * num_words) randomly chosen positions are moved to a topic different from
 * their current one; otherwise every word's topic is re-drawn from the
 * column of the matrix returned by init_topical_Multinomial_probabilities()
 * selected by that word's current topic. The proposal is accepted with
 * probability min(1, exp((p_ratio + q_ratio) / iter_temperature(iter))).
 *
 * The state is retained after every iteration where iter is a multiple of
 * spacing and, when a valid burn-in period is configured, iter is greater
 * than burn_in_period_. For each document the mode of the retained states
 * (find_mode) and the last retained state update sampled_z_, beta_counts_,
 * beta_counts_last_, theta_counts_ and theta_counts_last_. Diagnostics are
 * printed to cout when verbose_ >= 1.
 *
 * @param iter_temperature      temperature per iteration; indexed by iter, so
 *                              it must hold at least max_iterations_ entries.
 * @param random_walk_prob      probability of proposing a random-walk move.
 * @param percent_random_walk   percentage of a document's words changed by
 *                              one random-walk move.
 */
void TopicSearch::run_hybrid_random_walk_simulated_annealing(
        vec iter_temperature,
        double random_walk_prob,
        double percent_random_walk){

    Timer timer = Timer();
    timer.restart_time();

    init_z();

    mat multinomial_prob = init_topical_Multinomial_probabilities();
    if (this->verbose_ >= 1)
        cout << "Multinomial probabilities: " << endl << multinomial_prob;

    const bool valid_burn_in_period =
            (this->burn_in_period_ > 0
                    && this->burn_in_period_ < this->max_iterations_);

    // Single source of truth for "is the state after iteration `iter` kept?".
    const auto keep_sample = [this, valid_burn_in_period](size_t iter) -> bool {
        return (iter % this->spacing == 0)
                && (!valid_burn_in_period || (this->burn_in_period_ < iter));
    };

    // Count the retained samples with the very predicate the search loop
    // uses. The previous ceil(a / b) on integer operands truncated before
    // rounding, so whenever max_iterations_ was not a multiple of spacing the
    // loop stored one more column than had been allocated.
    size_t accepted_Z_instances = 0;
    for (size_t iter = 0; iter < this->max_iterations_; iter++){
        if (keep_sample(iter))
            accepted_Z_instances++;
    }

    for (size_t d = 0; d < this->num_documents_; d++){ // START For each document

        size_t num_words = this->document_lengths_[d];
        umat accepted_Z = zeros<umat>(num_words, accepted_Z_instances);
        uvec proposed_Z = zeros<uvec>(num_words);
        uvec current_Z = zeros<uvec>(num_words);
        size_t acceptance_count = 0;
        size_t count = 0; // retained samples stored so far
        size_t random_walk_count = ceil((percent_random_walk / 100) * num_words);
        size_t num_random_walks = 0;
        vector <size_t> word_indices = this->document_word_indices_[d];

        for (size_t n = 0; n < num_words; n++)
            current_Z(n) = this->initial_z_(word_indices[n]);

        long double ppZ = calc_ln_partition_probality(word_indices, current_Z);

        for (size_t iter = 0; iter < this->max_iterations_; iter++){ // START TOPIC SEARCH

            if (this->sample_uniform() <= random_walk_prob){
                // Random walk from the previous state.
                num_random_walks++;
                proposed_Z = current_Z;

                // A "different topic" only exists with at least two topics;
                // without this guard the rejection loop below never ends.
                if (this->num_topics_ > 1){
                    for (size_t s = 0; s < random_walk_count; s++){
                        size_t idx = sample_uniform_int(num_words); // selects a word at random
                        size_t topic;
                        do {
                            topic = sample_uniform_int(this->num_topics_);
                        } while (topic == current_Z(idx));
                        proposed_Z(idx) = topic;
                    }
                }
            }
            else {
                // Sample every word from the multinomial of its current topic.
                for (size_t i = 0; i < num_words; i++)
                    proposed_Z(i) = sample_multinomial(multinomial_prob.col(current_Z(i)));
            }

            long double ppZ_prime = calc_ln_partition_probality(word_indices, proposed_Z);
            long double tpZ_prime = calc_lnTP_hybrid_randomwalk(
                    num_words, proposed_Z, current_Z, multinomial_prob,
                    random_walk_prob, random_walk_count);
            long double tpZ = calc_lnTP_hybrid_randomwalk(
                    num_words, current_Z, proposed_Z, multinomial_prob,
                    random_walk_prob, random_walk_count);
            long double p_ratio = ppZ_prime - ppZ;
            long double q_ratio = tpZ_prime - tpZ;

            // Temper in log space: exp(x / T) equals pow(exp(x), 1 / T)
            // mathematically, but does not underflow/overflow in the inner
            // exp() before the temperature is applied.
            double acceptance_probability = min(1.0L,
                    exp((p_ratio + q_ratio) / iter_temperature(iter)));

            if (this->sample_uniform() <= acceptance_probability){
                ppZ = ppZ_prime; // To avoid re-calculation
                current_Z = proposed_Z;
                acceptance_count++;
            }

            if (keep_sample(iter)){
                accepted_Z.col(count) = current_Z;
                count++;
            }

        } // END TOPIC SEARCH

        // Summarise the retained states. If nothing was retained fall back to
        // the final chain state instead of indexing column (count - 1) with
        // count == 0.
        uvec sampled_z;
        uvec sampled_z2;
        if (count > 0){
            sampled_z = find_mode(accepted_Z);
            sampled_z2 = accepted_Z.col(count - 1);
        }
        else {
            sampled_z = current_Z;
            sampled_z2 = current_Z;
        }

        // Saves the results to the class variables
        for (size_t n = 0; n < num_words; n++){
            this->sampled_z_(word_indices[n]) = sampled_z(n);
            this->beta_counts_(this->sampled_z_(word_indices[n]),
                    this->word_ids_(word_indices[n])) += 1;
            this->beta_counts_last_(sampled_z2(n),
                    this->word_ids_(word_indices[n])) += 1;
        }

        // Calculates theta counts
        this->theta_counts_.col(d) = calc_topic_counts(sampled_z, this->num_topics_);
        this->theta_counts_last_.col(d) = calc_topic_counts(sampled_z2, this->num_topics_);

        if (this->verbose_ >= 1)
            cout << "doc " << d + 1 << " accepted # " << acceptance_count
                 << " random walks # " << num_random_walks << endl;

    } // END For each document

    if (this->verbose_ >= 1){
        cout << endl << "Total execution time: " << timer.get_time() << "s" << endl;
        cout << "Model perplexity: " << calc_corpus_perplexity() << endl; // using beta_counts_ and theta_counts_
        cout << "Log partition probability: " << calc_ln_corpus_partition_probality() << endl;
    }
}