// Reassign biterm bi from its current topic to topic k, keeping all count
// tables in sync.
void Btm::update_counts(Biterm& bi, int k) {
  // Remove the biterm from its current topic: one biterm, two word slots.
  word_topic_count(bi.get_wi(), bi.get_z())--;
  word_topic_count(bi.get_wj(), bi.get_z())--;
  topic_count_wd(bi.get_z()) -= 2;
  topic_count_bt(bi.get_z())--;
  // Add it back under topic k.
  bi.set_z(k);
  word_topic_count(bi.get_wi(), k)++;
  word_topic_count(bi.get_wj(), k)++;
  topic_count_wd(k) += 2;
  topic_count_bt(k)++;
}
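// Sanity sketch of the invariant update_counts() maintains: every biterm
// contributes 1 to topic_count_bt, 2 to topic_count_wd, and one slot per word
// to word_topic_count. This helper is hypothetical (not part of the original
// class) and assumes the counts were initialised under the same rule.
bool Btm::counts_consistent() {
  for (int k = 0; k < K; k++) {
    // Word-level mass of topic k must be exactly twice its biterm-level mass.
    if (topic_count_wd[k] != 2 * topic_count_bt[k]) return false;
  }
  return true;
}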
// Build sparse word-topic rows for every word id in mybucket. Each row stores
// only the topics with a nonzero count for that word.
void Trainer2::PartialTablebuild(std::vector<int> &mybucket, std::vector<wtopic> &wtable) {
  int wordmax = wordmax_;
  int topick = num_topic_;
  assert(data_.size() <= (unsigned long)wordmax);
  for (size_t i = 0; i < mybucket.size(); i++) {
    int widx = mybucket[i];
    auto &word = data_[widx];
    // Accumulate a dense histogram of this word's token assignments.
    std::vector<int> word_topic_count(num_topic_, 0);
    for (const auto topic : word.assignment_) {
      assert(topic < topick);
      ++word_topic_count[topic];
    }
    // Compress the dense histogram into a sparse (topic, count) row.
    wtopic entry;
    for (int j = 0; j < num_topic_; j++) {
      if (word_topic_count[j] > 0) {
        entry.topic.push_back(j);
        entry.cnt.push_back(word_topic_count[j]);
      }
    }
    if (entry.topic.size() > 0) {
      wtable[widx] = entry;
    }
  }
}
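// For reference, the sparse word-topic row assumed throughout this file, as a
// sketch reconstructed from how PartialTablebuild() and TrainOneData_dist_mt()
// use it; the actual strads definition may differ.
struct wtopic {
  std::vector<int> topic;  // topic ids with a nonzero count for this word
  std::vector<int> cnt;    // cnt[i] = tokens of this word assigned to topic[i]
};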
arma::mat Btm::calc_beta() {
  arma::mat beta(V, K, arma::fill::zeros);
  for (int k = 0; k < K; k++) {
    for (int w = 0; w < V; w++) {
      beta(w, k) = (word_topic_count(w, k) + eta) / (topic_count_wd[k] + V * eta);
    }
  }
  return beta;
}
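// Companion sketch: topic proportions under the same smoothing scheme,
// theta_k = (n_k + alpha) / (|B| + K * alpha). `calc_theta` is not part of
// the original class; it assumes topic_count_bt is an Armadillo vector whose
// entries sum to the total number of biterms |B|.
arma::vec Btm::calc_theta() {
  arma::vec theta(K, arma::fill::zeros);
  double B = arma::sum(topic_count_bt);  // total number of biterms
  for (int k = 0; k < K; k++) {
    theta(k) = (topic_count_bt[k] + alpha) / (B + K * alpha);
  }
  return theta;
}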
// Compute the (cumulative, unnormalised) conditional p(z_i = k | z_-i, B) for
// biterm bi. The `subtract` term excludes the biterm's own contribution from
// the counts when it currently sits in topic k.
NumericVector Btm::sample_prob(Biterm& bi) {
  NumericVector Q(K);
  for (int k = 0; k < K; k++) {
    int subtract = 0;
    if (bi.get_z() == k) subtract = 1;
    // The biterm occupies two word slots, so the word-level topic count
    // drops by 2 * subtract when the biterm is excluded.
    Q[k] = (alpha + topic_count_bt[k] - subtract) *
           (eta + word_topic_count(bi.get_wi(), k) - subtract) *
           (eta + word_topic_count(bi.get_wj(), k) - subtract) /
           ((topic_count_wd[k] - 2 * subtract + V * eta + 1) *
            (topic_count_wd[k] - 2 * subtract + V * eta));
    // Accumulate a running sum so Q can be inverted with one uniform draw.
    if (k != 0) Q[k] = Q[k] + Q[k - 1];
  }
  return Q;
}
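// A minimal sketch of one Gibbs step tying sample_prob() to update_counts().
// The container name `biterms` and the use of R's RNG via R::runif are
// assumptions, not part of the original class.
void Btm::gibbs_sweep() {
  for (auto& bi : biterms) {
    NumericVector Q = sample_prob(bi);   // cumulative, unnormalised weights
    double u = R::runif(0.0, Q[K - 1]);  // one uniform draw over the total mass
    int k = 0;
    while (k < K - 1 && Q[k] < u) k++;   // invert the cumulative sum
    update_counts(bi, k);                // move the biterm to topic k
  }
}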
// Dense counterpart of PartialTablebuild(): fill karray (length num_topic_)
// with the per-topic token counts of a single word id.
void Trainer2::SingleTableEntrybuild(int widx, int *karray) {
  int wordmax = wordmax_;
  int topick = num_topic_;
  assert(data_.size() <= (unsigned long)wordmax);
  auto &word = data_[widx];
  std::vector<int> word_topic_count(num_topic_, 0);
  for (const auto topic : word.assignment_) {
    assert(topic < topick);
    ++word_topic_count[topic];
  }
  // The memset is redundant here (the loop below writes every slot) but harmless.
  memset(karray, 0x0, sizeof(int) * num_topic_);
  for (int j = 0; j < num_topic_; j++) {
    karray[j] = word_topic_count[j];
  }
}
// Distributed sampling with multi-threading: resample every token of one word
// id (vidx) using a two-bucket decomposition — a smoothing bucket Xbar
// (ALPHA times the sum of phi_w) plus a document bucket Ybar over the
// document's nonzero topic counts.
void Trainer2::TrainOneData_dist_mt(Data& word, int vidx, wtopic &wordtopic, int threadid) {
  double beta_sum = BETA * data_.size();
  // Construct the dense word-topic count on the fly to save memory.
  std::vector<int> word_topic_count(num_topic_, 0);
  int size = wordtopic.topic.size();
  for (int i = 0; i < size; i++) {
    int col = wordtopic.topic[i];
    int cnt = wordtopic.cnt[i];
    // Shared word table, but no two threads access the same row at once.
    word_topic_count[col] = cnt;
  }
  for (int k = 0; k < num_topic_; ++k) {
    nasummary_[k] = summary_[k];
  }
  // Compute per-word cached values: phi_w[k] and their running sum Xbar.
  double Xbar = .0;
  std::vector<double> phi_w(num_topic_, .0);
  for (int k = 0; k < num_topic_; ++k) {
    // TODO: summary_ is shared.
    phi_w[k] = (word_topic_count[k] + BETA) / (nasummary_[k] + beta_sum);
    Xbar += phi_w[k];
  }
  int *topicarray = topicarray_[threadid];
  int *cntarray = cntarray_[threadid];
  int itemsize;
  Xbar *= ALPHA;
  std::vector<double> Yval(num_topic_, .0);  // only the first itemsize entries are used
  // Resample every token of the given word id.
  for (size_t n = 0; n < word.token_.size(); ++n) {
    size = wordtopic.topic.size();
    int old_topic = word.assignment_[n];
    auto& doc = stat_[word.token_[n]];
    int doc_id = word.token_[n];
    itemsize = doc.item_.size();
    // Snapshot the document's sparse topic counts under its lock.
    mutex_pool_[doc_id].lock();
    for (int i = 0; i < itemsize; i++) {
      struct Pair &tmp = doc.item_[i];
      topicarray[i] = tmp.top_;
      cntarray[i] = tmp.cnt_;
    }
    mutex_pool_[doc_id].unlock();
    // Decrement: remove the token's old assignment from the cached values.
    Xbar -= ALPHA * phi_w[old_topic];
    --word_topic_count[old_topic];
    --nasummary_[old_topic];
    phi_w[old_topic] = (word_topic_count[old_topic] + BETA) / (nasummary_[old_topic] + beta_sum);
    Xbar += ALPHA * phi_w[old_topic];
    // Y bucket: mass from the document's nonzero topic counts.
    double Ybar = .0;
    for (int i = 0; i < itemsize; ++i) {
      int cnt = (topicarray[i] == old_topic) ? (cntarray[i] - 1) : cntarray[i];
      double val = phi_w[topicarray[i]] * cnt;
      Yval[i] = val;
      Ybar += val;
    }
    // Sample from the two buckets.
    double sample = _unif01(_rng) * (Xbar + Ybar);
    int new_topic = -1;
    if (sample < Ybar) {
      new_topic = topicarray[itemsize - 1];  // item_ shouldn't be empty
      for (int i = 0; i < itemsize - 1; ++i) {
        sample -= Yval[i];
        if (sample <= .0) {
          new_topic = topicarray[i];
          break;
        }
      }
    }  // end of Y bucket
    else {
      sample -= Ybar;
      sample /= ALPHA;
      new_topic = num_topic_ - 1;
      for (int k = 0; k < num_topic_ - 1; ++k) {
        sample -= phi_w[k];
        if (sample <= .0) {
          new_topic = k;
          break;
        }
      }
    }  // end of choosing bucket
    CHECK_GE(new_topic, 0);
    CHECK_LT(new_topic, num_topic_);
    // Increment: apply the token's new assignment to the cached values.
    Xbar -= ALPHA * phi_w[new_topic];
    ++word_topic_count[new_topic];
    ++nasummary_[new_topic];
    phi_w[new_topic] = (word_topic_count[new_topic] + BETA) / (nasummary_[new_topic] + beta_sum);
    Xbar += ALPHA * phi_w[new_topic];
    word.assignment_[n] = new_topic;
    // Update the process-wide doc stats directly, with locking.
    doc.UpdateCount(old_topic, new_topic, doc_id, mutex_pool_);
    // Mirror the change into the sparse word-topic row.
    bool oldfound = false;
    bool newfound = false;
    for (int i = 0; i < size; i++) {
      int col = wordtopic.topic[i];
      if (col == new_topic) {
        newfound = true;
        wordtopic.cnt[i]++;
      }
      if (col == old_topic) {
        oldfound = true;
        wordtopic.cnt[i]--;
      }
    }
    if (newfound == false) {
      wordtopic.topic.push_back(new_topic);
      wordtopic.cnt.push_back(1);
    }
    if (oldfound == false)
      strads_msg(ERR, "\t\t\t @@@@@ FATAL old one not found worker : start vidx: %d increment wordtopicsize: %ld \n",
                 vidx, wordtopic.topic.size());
    assert(oldfound);
    // Sync between the local snapshot and the global summary.
    --summary_[old_topic];
    ++summary_[new_topic];
  }  // end of iterating over the tokens of word vidx
}
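// A sketch of the per-thread driver this function assumes: each thread walks
// its own bucket of word ids (see PartialTablebuild above), so no two threads
// ever touch the same wtable row. The name `TrainBucket_mt` and the exact
// signature are assumptions, not the original strads code.
void Trainer2::TrainBucket_mt(std::vector<int>& mybucket,
                              std::vector<wtopic>& wtable, int threadid) {
  for (size_t i = 0; i < mybucket.size(); ++i) {
    int widx = mybucket[i];
    TrainOneData_dist_mt(data_[widx], widx, wtable[widx], threadid);
  }
}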