double lda_m_step(lda* model, lda_suff_stats* ss) { int k, w; double lhood = 0; for (k = 0; k < model->ntopics; k++) { gsl_vector ss_k = gsl_matrix_column(ss->topics_ss, k).vector; gsl_vector log_p = gsl_matrix_column(model->topics, k).vector; if (LDA_USE_VAR_BAYES == 0) { gsl_blas_dcopy(&ss_k, &log_p); normalize(&log_p); vct_log(&log_p); } else { double digsum = sum(&ss_k)+model->nterms*LDA_TOPIC_DIR_PARAM; digsum = gsl_sf_psi(digsum); double param_sum = 0; for (w = 0; w < model->nterms; w++) { double param = vget(&ss_k, w) + LDA_TOPIC_DIR_PARAM; param_sum += param; double elogprob = gsl_sf_psi(param) - digsum; vset(&log_p, w, elogprob); lhood += (LDA_TOPIC_DIR_PARAM - param) * elogprob + gsl_sf_lngamma(param); } lhood -= gsl_sf_lngamma(param_sum); } } return(lhood); }
double c_ctr::doc_inference(const c_document* doc, const gsl_vector* theta_v, const gsl_matrix* log_beta, gsl_matrix* phi, gsl_vector* gamma, gsl_matrix* word_ss, bool update_word_ss) { double pseudo_count = 1.0; double likelihood = 0; gsl_vector* log_theta_v = gsl_vector_alloc(theta_v->size); gsl_vector_memcpy(log_theta_v, theta_v); vct_log(log_theta_v); int n, k, w; double x; for (n = 0; n < doc->m_length; n ++) { w = doc->m_words[n]; for (k = 0; k < m_num_factors; k ++) mset(phi, n, k, vget(theta_v, k) * mget(m_beta, k, w)); gsl_vector_view row = gsl_matrix_row(phi, n); vnormalize(&row.vector); for (k = 0; k < m_num_factors; k ++) { x = mget(phi, n, k); if (x > 0) likelihood += x*(vget(log_theta_v, k) + mget(log_beta, k, w) - log(x)); } } if (pseudo_count > 0) { likelihood += pseudo_count * vsum(log_theta_v); } gsl_vector_set_all(gamma, pseudo_count); // smoothing with small pseudo counts for (n = 0; n < doc->m_length; n ++) { for (k = 0; k < m_num_factors; k ++) { x = doc->m_counts[n] * mget(phi, n, k); vinc(gamma, k, x); if (update_word_ss) minc(word_ss, k, doc->m_words[n], x); } } gsl_vector_free(log_theta_v); return likelihood; }