/* Initialize the LDA sufficient statistics with random values.
 * Each column of ss->topics_ss (one column per topic) is zeroed and then
 * filled with uniform noise plus a small data-dependent offset. */
void initialize_lda_ss_from_random(corpus_t* data, lda_suff_stats* ss) {
    gsl_rng* rng = new_random_number_generator();
    int num_topics = ss->topics_ss->size2;
    for (int topic_idx = 0; topic_idx < num_topics; topic_idx++) {
        gsl_vector column = gsl_matrix_column(ss->topics_ss, topic_idx).vector;
        gsl_vector_set_all(&column, 0);
        for (int term = 0; term < column.size; term++) {
            /* NOTE(review): by C precedence this is
             * uniform + (0.5 / ndocs) + 4.0, NOT (uniform + 0.5) / (ndocs + 4.0).
             * Preserved as written — confirm the intended formula. */
            vset(&column, term, gsl_rng_uniform(rng) + 0.5 / data->ndocs + 4.0);
        }
    }
}
/* Initialize the LDA sufficient statistics from the data.
 * For each topic, accumulate the word counts of LDA_SEED_INIT randomly
 * chosen documents into the topic's column of ss->topics_ss, then add a
 * small random smoothing value to every term.
 *
 * data: training corpus (documents, word ids, counts).
 * ss:   sufficient statistics; topics_ss (terms x topics) is updated in place. */
void initialize_lda_ss_from_data(corpus_t* data, lda_suff_stats* ss) {
    int k, n, i, w;
    gsl_rng * r = new_random_number_generator();
    for (k = 0; k < ss->topics_ss->size2; k++) {
        gsl_vector topic = gsl_matrix_column(ss->topics_ss, k).vector;
        for (n = 0; n < LDA_SEED_INIT; n++) {
            // Pick a random seed document for this topic.
            int d = floor(gsl_rng_uniform(r) * data->ndocs);
            doc_t* doc = data->doc[d];
            for (i = 0; i < doc->nterms; i++) {
                // BUG FIX: index the document's terms with the inner loop
                // variable i; the original used the seed-loop variable n,
                // reading the wrong entries (and out of bounds when
                // LDA_SEED_INIT > nterms).
                vinc(&topic, doc->word[i], doc->count[i]);
            }
        }
        // Smooth every term so no count is exactly zero.
        for (w = 0; w < topic.size; w++) {
            vinc(&topic, w, LDA_INIT_SMOOTH + gsl_rng_uniform(r));
        }
    }
}
int main(int argc, char* argv[]) { if (argc < 2) print_usage_and_exit(); int verbose = 0; // Control parameters. char* directory = NULL; time_t t; time(&t); long random_seed = (long) t; int max_iter = 100; int max_time = 1800; int save_lag = 5; // Data parameters. char* train_data = NULL; // Model parameters. double eta = 0.01; double gamma = 1.0; double alpha = 1.0; double gamma_a = 1.0; double gamma_b = 1.0; double alpha_a = 1.0; double alpha_b = 1.0; int sample_hyper = 0; // test only parameters char* test_data = NULL; char* model_prefix = NULL; for (int i = 1; i < argc; ++i) { if (!strcmp(argv[i], "--help")) print_usage_and_exit(); else if (!strcmp(argv[i], "--verbose")) verbose = 1; else if (!strcmp(argv[i], "--directory")) directory = argv[++i]; else if (!strcmp(argv[i], "--random_seed")) random_seed = atoi(argv[++i]); else if (!strcmp(argv[i], "--max_iter")) max_iter = atoi(argv[++i]); else if (!strcmp(argv[i], "--max_time")) max_time = atoi(argv[++i]); else if (!strcmp(argv[i], "--save_lag")) save_lag = atoi(argv[++i]); else if (!strcmp(argv[i], "--train_data")) train_data = argv[++i]; else if (!strcmp(argv[i], "--eta")) eta = atof(argv[++i]); else if (!strcmp(argv[i], "--gamma")) gamma = atof(argv[++i]); else if (!strcmp(argv[i], "--alpha")) alpha = atof(argv[++i]); else if (!strcmp(argv[i], "--gamma_a")) gamma_a = atof(argv[++i]); else if (!strcmp(argv[i], "--gamma_b")) gamma_b = atof(argv[++i]); else if (!strcmp(argv[i], "--gamma_a")) gamma_a = atof(argv[++i]); else if (!strcmp(argv[i], "--gamma_b")) gamma_b = atof(argv[++i]); else if (!strcmp(argv[i], "--sample_hyper")) sample_hyper = 1; else if (!strcmp(argv[i], "--test_data")) test_data = argv[++i]; else if (!strcmp(argv[i], "--model_prefix")) model_prefix = argv[++i]; else { printf("%s, unknown parameters, exit\n", argv[i]); print_usage_and_exit(); } } /// print information printf("************************************************************************************************\n"); if (directory == 
NULL) { printf("Following information is missing: --directory\n"); printf("Run ./hdp for help.\n"); exit(0); } if (!dir_exists(directory)) make_directory(directory); printf("Working directory: %s.\n", directory); char name[500]; // Init random numbe generator. RANDOM_NUMBER = new_random_number_generator(random_seed); if (test_data == NULL || model_prefix == NULL) { sprintf(name, "%s/settings.dat", directory); printf("Setting saved at %s.\n", name); FILE* setting_file = fopen(name, "w"); fprintf(setting_file, "Control parameters:\n"); fprintf(setting_file, "directory: %s\n", directory); fprintf(setting_file, "random_seed: %d\n", (int)random_seed); fprintf(setting_file, "save_lag: %d\n", save_lag); fprintf(setting_file, "max_iter: %d\n", max_iter); fprintf(setting_file, "max_time: %d\n", max_time); fprintf(setting_file, "\nData parameters:\n"); fprintf(setting_file, "train_data: %s\n", train_data); fprintf(setting_file, "\nModel parameters:\n"); fprintf(setting_file, "eta: %.4lf\n", eta); fprintf(setting_file, "gamma: %.4lf\n", gamma); fprintf(setting_file, "alpha: %.4lf\n", alpha); fprintf(setting_file, "gamma_a: %.2lf\n", gamma_a); fprintf(setting_file, "gamma_b: %.4lf\n", gamma_b); fprintf(setting_file, "gamma_a: %.2lf\n", alpha_a); fprintf(setting_file, "gamma_b: %.4lf\n", alpha_b); fprintf(setting_file, "sample_hyper: %d\n", sample_hyper); fclose(setting_file); Corpus* c_train = NULL; printf("Reading training data from %s.\n", train_data); // Reading one of the train data. c_train = new Corpus(); c_train->read_data(train_data); // Open the log file for training data. sprintf(name, "%s/train.log", directory); FILE* train_log = fopen(name, "w"); // Heldout columns record the documents that have not seen before. sprintf(name, "time\titer\tnum.topics\tgamma\talpha\t\tword.count\tlikelihood\tavg.likelihood"); if(verbose) printf("%s\n", name); fprintf(train_log, "%s\n", name); // Start iterating. 
time_t start, current; int total_time = 0; int iter = 0; HDP* hdp = new HDP(); hdp->init_hdp(eta, gamma, alpha, c_train->size_vocab_); // Setting up the hdp state. hdp->setup_doc_states(c_train->docs_); // first iteration hdp->iterate_gibbs_state(false, false); while ((max_iter == -1 || iter < max_iter) && (max_time == -1 || total_time < max_time)) { ++iter; time (&start); // Iterations. hdp->iterate_gibbs_state(true, true); // Scoring the documents. double likelihood = hdp->log_likelihood(NULL); hdp->compact_hdp_state(); if (sample_hyper) hdp->hyper_inference(gamma_a, gamma_b, alpha_a, alpha_b); // Record the time. time(¤t); int elapse = (int) difftime(current, start); total_time += elapse; sprintf(name, "%d\t%d\t%d\t\t%.5f\t%.5f\t\t%d\t\t%.3f\t%.5f", total_time, iter, hdp->hdp_state_->num_topics_, hdp->hdp_state_->gamma_, hdp->hdp_state_->alpha_, c_train->num_total_words_, likelihood, likelihood/c_train->num_total_words_); if (verbose) printf("%s\n", name); fprintf(train_log, "%s\n", name); fflush(train_log); if (save_lag > 0 && (iter % save_lag == 0)) { sprintf(name, "%s/iter@%05d", directory, iter); hdp->save_state(name); } } sprintf(name, "%s/final", directory); hdp->save_state(name); // Free training data. if (c_train != NULL) { delete c_train; } fclose(train_log); delete hdp; } if (test_data != NULL && model_prefix != NULL) { Corpus* c_test = new Corpus(); c_test->read_data(test_data); HDP* hdp = new HDP(); printf("Loading model from prefix %s...\n", model_prefix); hdp->load_state(model_prefix); // Remember the old state. 
HDPState* old_hdp_state = new HDPState(); old_hdp_state->copy_hdp_state(*hdp->hdp_state_); hdp->setup_doc_states(c_test->docs_); if (verbose) printf("Initialization ...\n"); hdp->iterate_gibbs_state(false, false); sprintf(name, "%s/%s-test.log", directory, basename(model_prefix)); FILE* test_log = fopen(name, "w"); sprintf(name, "time\titer\tnum.topics\tword.count\tlikelihood\tavg.likelihood"); if(verbose) printf("%s\n", name); fprintf(test_log, "%s\n", name); time_t start, current; int total_time = 0; int iter = 0; // Iterations. while ((max_iter == -1 || iter < max_iter) && (max_time == -1 || total_time < max_time)) { ++iter; time (&start); hdp->iterate_gibbs_state(true, true); double likelihood = hdp->log_likelihood(old_hdp_state); hdp->compact_hdp_state(); time(¤t); int elapse = (int) difftime(current, start); total_time += elapse; sprintf(name, "%d\t%d\t%d\t\t%d\t\t%.3f\t%.5f", total_time, iter, hdp->hdp_state_->num_topics_, c_test->num_total_words_, likelihood, likelihood/c_test->num_total_words_); if (verbose) printf("%s\n", name); fprintf(test_log, "%s\n", name); fflush(test_log); } if (verbose) printf("Done and saving ...\n"); sprintf(name, "%s/%s-test", directory, basename(model_prefix)); hdp->save_state(name); hdp->save_doc_states(name); fclose(test_log); delete hdp; delete old_hdp_state; delete c_test; } // Free random number generator. free_random_number_generator(RANDOM_NUMBER); return 0; }
int main(int argc, char* argv[]) { if (argc < 2) print_usage_and_exit(); char filename[500]; int theta_opt = 0; int lda_regression = 0; const char* const short_options = "hd:x:i:a:b:u:v:r:s:m:k:t:e:y:z:w:"; const struct option long_options[] = { {"help", no_argument, NULL, 'h'}, {"directory", required_argument, NULL, 'd'}, {"user", required_argument, NULL, 'x'}, {"item", required_argument, NULL, 'i'}, {"a", required_argument, NULL, 'a'}, {"b", required_argument, NULL, 'b'}, {"lambda_u", required_argument, NULL, 'u'}, {"lambda_v", required_argument, NULL, 'v'}, {"random_seed", required_argument, NULL, 'r'}, {"save_lag", required_argument, NULL, 's'}, {"max_iter", required_argument, NULL, 'm'}, {"num_factors", required_argument, NULL, 'k'}, {"mult", required_argument, NULL, 't'}, {"theta_init", required_argument, NULL, 'e'}, {"beta_init", required_argument, NULL, 'y'}, {"learning_rate", required_argument, NULL, 'z'}, {"alpha_smooth", required_argument, NULL, 'w'}, {"theta_opt", no_argument, &theta_opt, 1}, {"lda_regression",no_argument, &lda_regression, 1}, {NULL, 0, NULL, 0}}; char* directory = NULL; char* user_path = NULL; char* item_path = NULL; double a = 1.0; double b = 0.01; double lambda_u = 0.01; double lambda_v = 100; double learning_rate = -1; double alpha_smooth = 0.0; time_t t; time(&t); long random_seed = (long) t; int save_lag = 20; int max_iter = 200; int num_factors = 200; char* mult_path = NULL; char* theta_init_path = NULL; char* beta_init_path = NULL; int cc = 0; while(true) { cc = getopt_long(argc, argv, short_options, long_options, NULL); switch(cc) { case 'h': print_usage_and_exit(); break; case 'd': directory = optarg; break; case 'x': user_path = optarg; break; case 'i': item_path = optarg; break; case 'a': a = atof(optarg); break; case 'b': b = atof(optarg); break; case 'u': lambda_u = atof(optarg); break; case 'v': lambda_v = atof(optarg); break; case 'z': learning_rate = atof(optarg); break; case 'w': alpha_smooth = atof(optarg); break; 
case 'r': random_seed = atoi(optarg); break; case 's': save_lag = atoi(optarg); break; case 'm': max_iter = atoi(optarg); break; case 'k': num_factors = atoi(optarg); break; case 't': mult_path = optarg; break; case 'e': theta_init_path = optarg; break; case 'y': beta_init_path = optarg; break; case -1: break; case '?': print_usage_and_exit(); break; default: break; } if (cc == -1) break; } /// print information printf("\n************************************************************************************************\n"); if (!dir_exists(directory)) make_directory(directory); printf("result directory: %s\n", directory); if (!file_exists(user_path)) { printf("user file %s doesn't exist! quit ...\n", user_path); exit(-1); } printf("user file: %s\n", user_path); if (!file_exists(item_path)) { printf("item file %s doesn't exist! quit ...\n", item_path); exit(-1); } printf("item file: %s\n", item_path); printf("a: %.4f\n", a); printf("b: %.4f\n", b); printf("lambda_u: %.4f\n", lambda_u); printf("lambda_v: %.4f\n", lambda_v); printf("learning_rate: %.5f\n", learning_rate); printf("alpha_smooth: %.5f\n", alpha_smooth); printf("random seed: %d\n", (int)random_seed); printf("save lag: %d\n", save_lag); printf("max iter: %d\n", max_iter); printf("number of factors: %d\n", num_factors); if (mult_path != NULL) { if (!file_exists(item_path)) { printf("mult file %s doesn't exist! quit ...\n", mult_path); exit(-1); } printf("mult file: %s\n", mult_path); if (theta_init_path == NULL) { printf("topic proportions file must be provided ...\n"); exit(-1); } if (!file_exists(theta_init_path)) { printf("topic proportions file %s doesn't exist! quit ...\n", theta_init_path); exit(-1); } printf("topic proportions file: %s\n", theta_init_path); if (beta_init_path == NULL) { printf("topic distributions file must be provided ...\n"); exit(-1); } if (!file_exists(beta_init_path)) { printf("topic distributions file %s doesn't exist! 
quit ...\n", beta_init_path); exit(-1); } printf("topic distributions file: %s\n", beta_init_path); if (theta_opt) printf("theta optimization: True\n"); else printf("theta optimization: false\n"); } else if (theta_opt) { printf("theta optimization: false"); printf("(theta_opt has no effect, back to default value: false)\n"); theta_opt = 0; } printf("\n"); /// save the settings int ctr_run = 1; if (mult_path == NULL) ctr_run = 0; ctr_hyperparameter ctr_param; ctr_param.set(a, b, lambda_u, lambda_v, learning_rate, alpha_smooth, random_seed, max_iter, save_lag, theta_opt, ctr_run, lda_regression); sprintf(filename, "%s/settings.txt", directory); ctr_param.save(filename); /// init random numbe generator RANDOM_NUMBER = new_random_number_generator(random_seed); // read users printf("reading user matrix from %s ...\n", user_path); c_data* users = new c_data(); users->read_data(user_path); int num_users = (int)users->m_vec_data.size(); // read items printf("reading item matrix from %s ...\n", item_path); c_data* items = new c_data(); items->read_data(item_path); int num_items = (int)items->m_vec_data.size(); // create model instance c_ctr* ctr = new c_ctr(); ctr->set_model_parameters(num_factors, num_users, num_items); c_corpus* c = NULL; if (mult_path != NULL) { // read word data c = new c_corpus(); c->read_data(mult_path); ctr->read_init_information(theta_init_path, beta_init_path, c, alpha_smooth); } if (learning_rate <= 0) { ctr->learn_map_estimate(users, items, c, &ctr_param, directory); } else { ctr->stochastic_learn_map_estimate(users, items, c, &ctr_param, directory); } free_random_number_generator(RANDOM_NUMBER); if (c != NULL) delete c; delete ctr; delete users; delete items; return 0; }