/**
 * Stores a document string, growing the storage array as needed.
 * @param str  the document string to append
 */
void insert_document(char *str)
{
    char **tmp;

    /* Grow the document array when it is full. */
    if (n_docs >= n_docs_max) {
        n_docs_max *= 2;  /* double the capacity */
        /* Reallocate; keep the old pointer valid until realloc succeeds. */
        if ((tmp = (char **)realloc(document, sizeof(char *) * n_docs_max)) == NULL) {
            fprintf(stderr, "Memory Reallocation Error\n");
            free_document();
            exit(1);
        } else {
            document = tmp;  /* point at the new region */
        }
    }

    /* Allocate storage for the new element. */
    size_t len = strlen(str) + 1;
    document[n_docs] = (char *)malloc(len);
    if (document[n_docs] == NULL) {
        fprintf(stderr, "Memory Allocation Error\n");
        free_document();  /* release stored documents, matching the realloc error path */
        exit(1);
    }

    /* Copy the string, including the terminating NUL. */
    memcpy(document[n_docs], str, len);
    n_docs++;  /* update the document count */
}
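/*
 * A minimal usage sketch, not part of the original source: insert_document
 * assumes the globals document, n_docs, and n_docs_max are defined and
 * initialized elsewhere. The declarations, the starting capacity, and the
 * helper init_document below are illustrative assumptions only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define INITIAL_CAPACITY 16         /* assumed starting capacity */

static char **document = NULL;      /* growable array of document strings */
static int n_docs = 0;              /* number of stored documents */
static int n_docs_max = 0;          /* current capacity of the array */

/* Hypothetical initializer: allocate the initial array before first use. */
void init_document(void)
{
    n_docs_max = INITIAL_CAPACITY;
    document = (char **)malloc(sizeof(char *) * n_docs_max);
    if (document == NULL) {
        fprintf(stderr, "Memory Allocation Error\n");
        exit(1);
    }
    n_docs = 0;
}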
/**
 * Replaces the currently held document with a freshly parsed one, raising a
 * Gambas error when parsing failed.
 */
void XML_InitDocument(CXMLDOCUMENT *_object, xmlDoc *doc, const char *err)
{
    if (!doc) {
        GB.Error(err ? err : "Unable to parse XML file");
        return;
    }

    free_document(THIS);  /* release the previous document, if any */
    THIS->doc = doc;
}
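/*
 * A hedged usage sketch, assuming libxml2 is available: xmlReadFile returns
 * NULL on a parse failure, which XML_InitDocument converts into a Gambas
 * error. The wrapper open_document is hypothetical, not part of the component.
 */
#include <libxml/parser.h>

void open_document(CXMLDOCUMENT *obj, const char *path)
{
    xmlDoc *doc = xmlReadFile(path, NULL, 0);
    XML_InitDocument(obj, doc, "Unable to parse XML file");
}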
/**
 * Releases all allocated resources.
 */
void free_all(void)
{
    free_document();  /* release the documents */
    free_array();     /* release the array */
    free_results();   /* release the results */
}
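/*
 * One plausible implementation of free_document, shown because insert_document
 * and free_all both depend on it; the original body is not included here, so
 * this is an assumption: free each stored string, then the pointer array.
 */
void free_document(void)
{
    for (int i = 0; i < n_docs; i++) {
        free(document[i]);
    }
    free(document);
    document = NULL;
    n_docs = 0;
    n_docs_max = 0;
}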
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

/**
 * Evaluates a trained model on the given corpus, writing per-document and
 * summary scores for two metrics: the variational lower bound and the
 * document-completion log-likelihood.
 */
void test_likelihood(t_setting* setting, const t_corpus* corpus,
                     const std::vector<t_cat>& tree_structure)
{
    FILE* fileptr_lowerbound_result;
    FILE* fileptr_lowerbound_summary;
    FILE* fileptr_document_completion_result;
    FILE* fileptr_document_completion_summary;
    char filename[MAX_BUF];

    // Load the trained model and its variational parameters.
    sprintf(filename, "%s_tilda", setting->model_path);
    t_tilda_model* trained_tilda_model = load_tilda_model(filename);
    sprintf(filename, "%s_var", setting->model_path);
    t_tilda_var_model* trained_var_model = load_var_model(filename, corpus);

    double** rho = NULL;
    double* old_rho = NULL;
    double* nu = NULL;
    double* dirichlet_prior = NULL;
    double* expected_theta = NULL;
    double** expected_beta = NULL;

    const int& K = trained_tilda_model->num_topics;
    setting->num_topics = K;
    oneoverk = 1 / (double) K;

    double document_completion_sum_ll = 0.0;
    int document_completion_sum_num_words = 0;
    double lowerbound_sum_likelihood = 0;
    int lowerbound_sum_num_words = 0;

    // Working buffers for the per-document E-step.
    nu = zero_init_double_array(K);
    rho = zero_init_double_matrix(corpus->max_length, K);
    old_rho = zero_init_double_array(K);
    dirichlet_prior = zero_init_double_array(K);
    expected_theta = zero_init_double_array(K);
    expected_beta = zero_init_double_matrix(K, corpus->num_terms);
    digamma_nu = zero_init_double_matrix(corpus->num_docs, K);
    digamma_nu_sum = zero_init_double_array(corpus->num_docs);
    digamma_lambda = zero_init_double_matrix(K, corpus->num_terms);
    digamma_lambda_sum = zero_init_double_array(K);

    compute_lambda_statistics(trained_var_model, expected_beta);

    sprintf(filename, "%s_lowerbound_result", setting->output_path);
    fileptr_lowerbound_result = fopen(filename, "w");
    sprintf(filename, "%s_document_completion_result", setting->output_path);
    fileptr_document_completion_result = fopen(filename, "w");

    for (size_t i = 0; i < tree_structure.size(); ++i) {
        const double& alpha_t = trained_tilda_model->alpha[i];
        const double* kappa_t = trained_var_model->kappa[i];
        const double& tau_t = trained_var_model->tau[i];
        for (int j = 0; j < K; ++j) {
            dirichlet_prior[j] = alpha_t * kappa_t[j];
        }

        for (size_t d = 0; d < tree_structure[i].docids.size(); ++d) {
            const int& docid = tree_structure[i].docids[d];

            // Evaluation using the variational lower bound.
            double this_doc_lowerbound = doc_e_step(
                &(corpus->docs[docid]), dirichlet_prior, nu, digamma_lambda,
                digamma_lambda_sum, setting, docid, rho, old_rho);
            assert(!std::isnan(this_doc_lowerbound));
            this_doc_lowerbound += lgamma(alpha_t);
            this_doc_lowerbound -= (K - alpha_t) * digamma(tau_t);
            this_doc_lowerbound -= alpha_t * (K - 1) / tau_t;
            for (int j = 0; j < K; ++j) {
                this_doc_lowerbound -= lgamma(alpha_t * kappa_t[j])
                    + (1 - alpha_t * kappa_t[j])
                      * (log(kappa_t[j]) - digamma(tau_t * kappa_t[j]));
            }
            for (int j = 0; j < K; ++j) {
                this_doc_lowerbound += dirichlet_prior[j]
                    * (digamma_nu[docid][j] - digamma_nu_sum[docid]);
            }
            assert(!std::isnan(this_doc_lowerbound));
            fprintf(fileptr_lowerbound_result,
                    "docid %d\tlower_bound %5.5f\tnum_words %d\n",
                    docid, this_doc_lowerbound, corpus->docs[docid].total);
            lowerbound_sum_likelihood += this_doc_lowerbound;
            lowerbound_sum_num_words += corpus->docs[docid].total;

            // Evaluation using document completion: run inference on one half
            // of the document and score the held-out half.
            t_document* inference_doc = NULL;
            t_document* test_doc = NULL;
            split_document(inference_doc, test_doc, &(corpus->docs[docid]));
            double half_doc_lowerbound = doc_e_step(
                inference_doc, dirichlet_prior, nu, digamma_lambda,
                digamma_lambda_sum, setting, docid, rho, old_rho);
            assert(!std::isnan(half_doc_lowerbound));
            half_doc_lowerbound += lgamma(alpha_t);
            half_doc_lowerbound -= (K - alpha_t) * digamma(tau_t);
            half_doc_lowerbound -= alpha_t * (K - 1) / tau_t;
            for (int j = 0; j < K; ++j) {
                half_doc_lowerbound -= lgamma(alpha_t * kappa_t[j])
                    + (1 - alpha_t * kappa_t[j])
                      * (log(kappa_t[j]) - digamma(tau_t * kappa_t[j]));
            }
            for (int j = 0; j < K; ++j) {
                half_doc_lowerbound += dirichlet_prior[j]
                    * (digamma_nu[docid][j] - digamma_nu_sum[docid]);
            }
            assert(!std::isnan(half_doc_lowerbound));

            // Score held-out words under the expected topic mixture.
            double document_completion_log_likelihood = 0.0;
            double nu_sum = 0.0;
            for (int j = 0; j < K; ++j) {
                nu_sum += nu[j];
            }
            for (int j = 0; j < K; ++j) {
                expected_theta[j] = nu[j] / nu_sum;
            }
            for (int n = 0; n < test_doc->length; n++) {
                double this_word_likelihood = 0.0;
                for (int j = 0; j < K; ++j) {
                    this_word_likelihood +=
                        expected_theta[j] * expected_beta[j][test_doc->words[n]];
                }
                document_completion_log_likelihood +=
                    log(this_word_likelihood + 1e-100) * test_doc->counts[n];
            }
            fprintf(fileptr_document_completion_result,
                    "docid %d\thalf_lower_bound %5.5f\tscore %5.5f\ttest_num_words %d\n",
                    docid, half_doc_lowerbound,
                    document_completion_log_likelihood, test_doc->total);
            document_completion_sum_ll += document_completion_log_likelihood;
            document_completion_sum_num_words += test_doc->total;
            free_document(inference_doc);
            free_document(test_doc);
        }
    }
    fclose(fileptr_lowerbound_result);
    fclose(fileptr_document_completion_result);

    // Summarize the lower-bound metric as per-word perplexity.
    double perplexity =
        exp(-lowerbound_sum_likelihood / (double) lowerbound_sum_num_words);
    sprintf(filename, "%s_lowerbound_summary", setting->output_path);
    fileptr_lowerbound_summary = fopen(filename, "w");
    fprintf(fileptr_lowerbound_summary, "sum_lowerbound %5.5f\n",
            lowerbound_sum_likelihood);
    fprintf(fileptr_lowerbound_summary, "sum_num_words %d\n",
            lowerbound_sum_num_words);
    fprintf(fileptr_lowerbound_summary, "perplexity %5.5f\n", perplexity);
    fclose(fileptr_lowerbound_summary);

    // Summarize the document-completion metric.
    double per_word_ll = document_completion_sum_ll
        / (double) document_completion_sum_num_words;
    sprintf(filename, "%s_document_completion_summary", setting->output_path);
    fileptr_document_completion_summary = fopen(filename, "w");
    fprintf(fileptr_document_completion_summary, "sum_num_words %d\n",
            document_completion_sum_num_words);
    fprintf(fileptr_document_completion_summary, "per_word_ll %5.5f\n",
            per_word_ll);
    fprintf(fileptr_document_completion_summary, "perplexity %5.5f\n",
            exp(-per_word_ll));
    fclose(fileptr_document_completion_summary);

    free_double_matrix(digamma_lambda);
    free(digamma_lambda_sum);
    free_double_matrix(digamma_nu);
    free(digamma_nu_sum);
    free_double_matrix(expected_beta);
    free(expected_theta);
    free(dirichlet_prior);
    free(nu);
    free_double_matrix(rho);
    free(old_rho);
    free_var_model(trained_var_model);
    free_tilda_model(trained_tilda_model);
}
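/*
 * For reference, the two summary files written above restate the standard
 * per-word perplexity. With L the accumulated log-likelihood (lower bound or
 * document-completion score) and N the matching word count, the code computes,
 * in LaTeX notation,
 *
 *   \mathrm{perplexity} = \exp\!\left(-\frac{L}{N}\right),
 *   \qquad \text{per-word ll} = \frac{L}{N},
 *
 * and the document-completion probability of a test word w is approximated by
 *
 *   p(w) \approx \sum_{j=1}^{K} \hat{\theta}_j \, \hat{\beta}_{jw},
 *   \qquad \hat{\theta}_j = \frac{\nu_j}{\sum_{k=1}^{K} \nu_k},
 *
 * which is exactly the expected_theta / expected_beta loop above.
 */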