Exemplo n.º 1
0
// Reassigns biterm `bi` to topic `k` and keeps the count tables consistent:
// the biterm's two words and the biterm itself are removed from the counters
// of the previously assigned topic and added to those of `k`.
void Btm::update_counts(Biterm& bi, int k) {
  const auto prev_topic = bi.get_z();
  const auto first_word = bi.get_wi();
  const auto second_word = bi.get_wj();

  // Withdraw the biterm from the topic it is leaving.
  --word_topic_count(first_word, prev_topic);
  --word_topic_count(second_word, prev_topic);
  topic_count_wd(prev_topic) -= 2;  // two words per biterm
  --topic_count_bt(prev_topic);

  bi.set_z(k);

  // Credit the biterm to its new topic.
  ++word_topic_count(first_word, k);
  ++word_topic_count(second_word, k);
  topic_count_wd(k) += 2;
  ++topic_count_bt(k);
}
Exemplo n.º 2
0
void Trainer2::PartialTablebuild(std::vector<int> &mybucket, std::vector<wtopic> &wtable) {
  //  int **table;
  int wordmax = wordmax_;
  int topick = num_topic_; 
  assert(data_.size() <= (unsigned long)wordmax);
  for(int i=0; i<mybucket.size(); i++){
    //  for (auto& word : data_) {                                                                                                   
    int widx = mybucket[i];
    auto &word = data_[widx];
    std::vector<int> word_topic_count(num_topic_, 0);
    for (const auto topic : word.assignment_){
      assert(topic < topick);
      ++word_topic_count[topic];
    }
    wtopic entry;
    for(int j=0; j < num_topic_; j++){
      if(word_topic_count[j] > 0){
	entry.topic.push_back(j);
	entry.cnt.push_back(word_topic_count[j]);
      }
    }
    if(entry.topic.size() > 0){
      wtable[widx] = entry;
    }
  }
}
Exemplo n.º 3
0
// Returns a V x K matrix whose (w, k) entry is
//   (word_topic_count(w, k) + eta) / (topic_count_wd[k] + V * eta).
arma::mat Btm::calc_beta() {
  arma::mat beta(V, K, arma::fill::zeros);
  for (int topic = 0; topic < K; ++topic) {
    // The denominator depends on the topic only.
    const auto denom = topic_count_wd[topic] + V * eta;
    for (int term = 0; term < V; ++term) {
      const auto numer = word_topic_count(term, topic) + eta;
      beta(term, topic) = numer / denom;
    }
  }
  return beta;
}
Exemplo n.º 4
0
// Computes the cumulative, unnormalised conditional p(z_i = k | z_{-i}, B)
// for biterm `bi`.
//
// The biterm's own contribution is removed from the counts of its currently
// assigned topic before the weight of that topic is evaluated. Entry k of the
// returned vector is the running sum of the weights of topics 0..k, so the
// last entry is the total unnormalised mass.
NumericVector Btm::sample_prob(Biterm& bi) {
  NumericVector Q(K);
  const auto current_topic = bi.get_z();
  for (int k = 0; k < K; k++) {
    // 1 when `bi` is currently counted under topic k, 0 otherwise.
    const int subtract = (current_topic == k) ? 1 : 0;

    // update_counts changes topic_count_wd by 2 per biterm (one per word), so
    // excluding the current biterm must remove 2 from the topic's word total,
    // not 1.
    const auto word_total = topic_count_wd[k] - 2 * subtract + V * eta;

    // NOTE(review): when both words of the biterm are the same word, that
    // word's count holds 2 from this biterm while each factor below removes
    // only 1 — confirm whether this case needs separate handling.
    Q[k] = (alpha + topic_count_bt[k] - subtract) *
      (eta + word_topic_count(bi.get_wi(), k) - subtract) *
      (eta + word_topic_count(bi.get_wj(), k) - subtract) /
      ((word_total + 1) * word_total);

    // Accumulate so that Q holds cumulative sums.
    if (k != 0)
      Q[k] = Q[k] + Q[k-1];
  }
  return Q;
}
Exemplo n.º 5
0
void Trainer2::SingleTableEntrybuild(int widx, int *karray) {
  int wordmax = wordmax_;
  int topick = num_topic_; 
  assert(data_.size() <= (unsigned long)wordmax);
  auto &word = data_[widx];
  std::vector<int> word_topic_count(num_topic_, 0);
  for (const auto topic : word.assignment_){
    assert(topic < topick);
    ++word_topic_count[topic];
  }
  //  wtopic entry;
  memset(karray, 0x0, sizeof(int)*num_topic_);
  for(int j=0; j < num_topic_; j++){
    karray[j] = word_topic_count[j];
  }
}
Exemplo n.º 6
0
// Resamples the topic of every token of one word (distributed, multi-threaded
// variant).
//
// word      - the word's tokens: token_[n] is the document id of token n and
//             assignment_[n] its current topic; assignment_ is updated in place
// vidx      - only used in the diagnostic message printed when the old topic
//             is missing from `wordtopic`
// wordtopic - sparse (topic, cnt) row of this word; updated in place
// threadid  - selects the scratch buffers topicarray_[threadid] and
//             cntarray_[threadid] used for the per-document snapshot
//
// Besides the arguments, this function writes nasummary_, summary_ and the
// per-document counts (through doc.UpdateCount).
void Trainer2::TrainOneData_dist_mt(Data& word, int vidx, wtopic &wordtopic, int threadid) {
  double beta_sum = BETA * data_.size();

  // Construct word topic count on the fly to save memory
  std::vector<int> word_topic_count(num_topic_, 0);
  int size = wordtopic.topic.size();
  for (int i = 0; i < size; i++) {
    int col = wordtopic.topic[i];
    int cnt = wordtopic.cnt[i];
    word_topic_count[col] = cnt;  // shared w table but no two threads access the same row at the same time
  }

  // Take a copy of the per-topic totals; the sampling below reads and updates
  // nasummary_ instead of summary_.
  for (int k = 0; k < num_topic_; ++k) {
    nasummary_[k] = summary_[k];
  }

  // Compute per word cached values
  double Xbar = .0;
  std::vector<double> phi_w(num_topic_, .0);
  for (int k = 0; k < num_topic_; ++k) {
    // TODO : summary_ is shared
    phi_w[k] = (word_topic_count[k] + BETA) / (nasummary_[k] + beta_sum);
    Xbar += phi_w[k];
  }
  int *topicarray = topicarray_[threadid];
  int *cntarray = cntarray_[threadid];
  Xbar *= ALPHA;  // Xbar == ALPHA * sum_k phi_w[k]

  // Go!
  std::vector<double> Yval(num_topic_, .0); // only access first x entries
  for (size_t n = 0; n < word.token_.size(); ++n) { // visit every token of the given word id
    size = wordtopic.topic.size();  // refreshed: the row may have grown in the previous iteration
    int old_topic = word.assignment_[n];
    auto& doc = stat_[word.token_[n]];
    int doc_id = word.token_[n];

    // Snapshot the document's (topic, count) items. The item count is read
    // inside the critical section so that it always matches the number of
    // entries copied, even if the document is updated concurrently.
    mutex_pool_[doc_id].lock();
    const int itemsize = doc.item_.size();
    for (int i = 0; i < itemsize; i++) {
      const auto &item = doc.item_[i];
      topicarray[i] = item.top_;
      cntarray[i] = item.cnt_;
    }
    mutex_pool_[doc_id].unlock();

    // Remove the token from its old topic and refresh the cached values.
    Xbar -= ALPHA * phi_w[old_topic];
    --word_topic_count[old_topic];
    --nasummary_[old_topic];
    phi_w[old_topic] = (word_topic_count[old_topic] + BETA) / (nasummary_[old_topic] + beta_sum);
    Xbar += ALPHA * phi_w[old_topic];

    // Ybar == sum over the document's items of phi_w[topic] * count, with the
    // current token excluded from its old topic's count.
    double Ybar = .0;
    for (int i = 0; i < itemsize; ++i) {
      int cnt = (topicarray[i] == old_topic) ? (cntarray[i] - 1) : cntarray[i];
      double val = phi_w[topicarray[i]] * cnt;
      Yval[i] = val;
      Ybar += val;
    }

    // Sample
    double sample = _unif01(_rng) * (Xbar + Ybar);
    int new_topic = -1;
    if (sample < Ybar) {
      assert(itemsize > 0); // item shouldn't be empty
      new_topic = topicarray[itemsize-1];
      for (int i = 0; i < itemsize - 1; ++i) {
        sample -= Yval[i];
        if (sample <= .0) { new_topic = topicarray[i]; break; }
      }
    } // end of Y bucket
    else {
      sample -= Ybar;
      sample /= ALPHA;
      new_topic = num_topic_ - 1;
      for (int k = 0; k < num_topic_ - 1; ++k) {
        sample -= phi_w[k];
        if (sample <= .0) { new_topic = k; break; }
      }
    } // end of choosing bucket
    CHECK_GE(new_topic, 0);
    CHECK_LT(new_topic, num_topic_);

    // Increment
    Xbar -= ALPHA * phi_w[new_topic];
    ++word_topic_count[new_topic];
    ++nasummary_[new_topic];
    phi_w[new_topic] = (word_topic_count[new_topic] + BETA) / (nasummary_[new_topic] + beta_sum);
    Xbar += ALPHA * phi_w[new_topic];
    word.assignment_[n] = new_topic;

    // update process wise doc directly with locking.
    doc.UpdateCount(old_topic, new_topic, doc_id, mutex_pool_);

    // Mirror the move in the sparse row of this word.
    bool oldfound = false;
    bool newfound = false;
    for (int i = 0; i < size; i++) {
      int col = wordtopic.topic[i];
      if (col == new_topic) {
        newfound = true;
        wordtopic.cnt[i]++;
      }
      if (col == old_topic) {
        oldfound = true;
        wordtopic.cnt[i]--;
      }
    }

    // First token of this word assigned to new_topic: append a new entry.
    if (newfound == false) {
      wordtopic.topic.push_back(new_topic);
      wordtopic.cnt.push_back(1);
    }

    if (oldfound == false)
      strads_msg(ERR, "\t\t\t @@@@@ FATAL old one not found  worker : start vidx: %d increment  wordtopicsize: %ld \n",
                 vidx, wordtopic.topic.size());
    assert(oldfound);

    // then remove temporary copy of doc
    // for sync between my local and global
    --summary_[old_topic];
    ++summary_[new_topic];
  } // end of iter over tokens of a widx word
}