/*
 * Initialize the LDA topic sufficient statistics with random noise.
 *
 * Each column of ss->topics_ss is one topic. Every entry is first zeroed
 * and then set to a uniform draw plus a smoothing offset so that no
 * pseudo-count starts at zero.
 *
 * NOTE(review): `gsl_rng_uniform(r) + 0.5 / data->ndocs + 4.0` adds
 * 0.5/ndocs and 4.0 to the draw; if `(u + 0.5) / ndocs` was intended,
 * parentheses are missing -- confirm against the reference model before
 * changing, the expression is kept as-is here.
 */
void initialize_lda_ss_from_random(corpus_t* data, lda_suff_stats* ss) {
    int k, n;
    gsl_rng* r = new_random_number_generator();
    for (k = 0; k < ss->topics_ss->size2; k++)
    {
        gsl_vector topic = gsl_matrix_column(ss->topics_ss, k).vector;
        gsl_vector_set_all(&topic, 0);
        for (n = 0; n < topic.size; n++)
        {
            vset(&topic, n, gsl_rng_uniform(r) + 0.5 / data->ndocs + 4.0);
        }
    }
    /* BUG FIX: the generator was leaked; this function owns it. */
    gsl_rng_free(r);
}
/*
 * Initialize the LDA topic sufficient statistics from the data.
 *
 * For each topic, pick LDA_SEED_INIT random documents and accumulate
 * their word counts into the topic's column of ss->topics_ss, then add
 * LDA_INIT_SMOOTH plus uniform noise to every vocabulary entry so no
 * count is exactly zero.
 */
void initialize_lda_ss_from_data(corpus_t* data, lda_suff_stats* ss) {
    int k, n, i, w;
    gsl_rng* r = new_random_number_generator();

    for (k = 0; k < ss->topics_ss->size2; k++)
    {
        gsl_vector topic = gsl_matrix_column(ss->topics_ss, k).vector;
        for (n = 0; n < LDA_SEED_INIT; n++)
        {
            int d = floor(gsl_rng_uniform(r) * data->ndocs);
            doc_t* doc = data->doc[d];
            for (i = 0; i < doc->nterms; i++)
            {
                /* BUG FIX: was doc->word[n] / doc->count[n] -- indexed by
                 * the seed-document counter instead of the term position,
                 * reading past the term arrays whenever n >= doc->nterms. */
                vinc(&topic, doc->word[i], doc->count[i]);
            }
        }
        for (w = 0; w < topic.size; w++)
        {
            vinc(&topic, w, LDA_INIT_SMOOTH + gsl_rng_uniform(r));
        }
    }
    /* BUG FIX: the generator was leaked; this function owns it. */
    gsl_rng_free(r);
}
/* Example no. 3 — source: main.cpp, project: 2php/hdp */
/*
 * HDP driver: parses command-line options, then either trains an HDP
 * model by Gibbs sampling on --train_data (writing settings, a log, and
 * periodic snapshots under --directory), or — when both --test_data and
 * --model_prefix are given — evaluates a saved model on held-out data.
 */
int main(int argc, char* argv[]) {
  if (argc < 2) print_usage_and_exit();

  int verbose = 0;

  // Control parameters.
  char*  directory = NULL;
  time_t t; time(&t);
  long   random_seed = (long) t;
  int    max_iter = 100;
  int    max_time = 1800;
  int    save_lag = 5;

  // Data parameters.
  char* train_data = NULL;

  // Model parameters.
  double eta = 0.01;
  double gamma = 1.0;
  double alpha = 1.0;
  double gamma_a = 1.0;
  double gamma_b = 1.0;
  double alpha_a = 1.0;
  double alpha_b = 1.0;
  int sample_hyper = 0;

  // Test-only parameters.
  char* test_data = NULL;
  char* model_prefix = NULL;

  for (int i = 1; i < argc; ++i) {
    if (!strcmp(argv[i], "--help")) print_usage_and_exit();
    else if (!strcmp(argv[i], "--verbose"))        verbose = 1;

    else if (!strcmp(argv[i], "--directory"))       directory = argv[++i];
    // atol, not atoi: random_seed is a long.
    else if (!strcmp(argv[i], "--random_seed"))     random_seed = atol(argv[++i]);
    else if (!strcmp(argv[i], "--max_iter"))        max_iter = atoi(argv[++i]);
    else if (!strcmp(argv[i], "--max_time"))        max_time = atoi(argv[++i]);
    else if (!strcmp(argv[i], "--save_lag"))        save_lag = atoi(argv[++i]);

    else if (!strcmp(argv[i], "--train_data"))      train_data = argv[++i];

    else if (!strcmp(argv[i], "--eta"))             eta = atof(argv[++i]);
    else if (!strcmp(argv[i], "--gamma"))           gamma = atof(argv[++i]);
    else if (!strcmp(argv[i], "--alpha"))           alpha = atof(argv[++i]);
    else if (!strcmp(argv[i], "--gamma_a"))         gamma_a = atof(argv[++i]);
    else if (!strcmp(argv[i], "--gamma_b"))         gamma_b = atof(argv[++i]);
    // BUG FIX: these two branches repeated "--gamma_a"/"--gamma_b", so
    // --alpha_a/--alpha_b were unreachable and always kept their defaults.
    else if (!strcmp(argv[i], "--alpha_a"))         alpha_a = atof(argv[++i]);
    else if (!strcmp(argv[i], "--alpha_b"))         alpha_b = atof(argv[++i]);
    else if (!strcmp(argv[i], "--sample_hyper"))    sample_hyper = 1;

    else if (!strcmp(argv[i], "--test_data"))       test_data = argv[++i];
    else if (!strcmp(argv[i], "--model_prefix"))    model_prefix = argv[++i];
    else {
      printf("%s, unknown parameters, exit\n", argv[i]);
      print_usage_and_exit();
    }
  }
  /// print information
  printf("************************************************************************************************\n");

  if (directory == NULL)  {
    printf("Following information is missing: --directory\n");
    printf("Run ./hdp for help.\n");
    exit(0);
  }

  if (!dir_exists(directory)) make_directory(directory);
  printf("Working directory: %s.\n", directory);

  char name[500];
  // Init random number generator.
  RANDOM_NUMBER = new_random_number_generator(random_seed);

  if (test_data == NULL || model_prefix == NULL) {
    // Training mode requires training data.
    if (train_data == NULL) {
      printf("Following information is missing: --train_data\n");
      printf("Run ./hdp for help.\n");
      exit(0);
    }

    sprintf(name, "%s/settings.dat", directory);
    printf("Setting saved at %s.\n", name);
    FILE* setting_file = fopen(name, "w");
    if (setting_file == NULL) {
      printf("Cannot open %s for writing, exit.\n", name);
      exit(-1);
    }

    fprintf(setting_file, "Control parameters:\n");
    fprintf(setting_file, "directory: %s\n", directory);
    fprintf(setting_file, "random_seed: %d\n", (int)random_seed);
    fprintf(setting_file, "save_lag: %d\n", save_lag);
    fprintf(setting_file, "max_iter: %d\n", max_iter);
    fprintf(setting_file, "max_time: %d\n", max_time);

    fprintf(setting_file, "\nData parameters:\n");
    fprintf(setting_file, "train_data: %s\n", train_data);

    fprintf(setting_file, "\nModel parameters:\n");
    fprintf(setting_file, "eta: %.4lf\n", eta);
    fprintf(setting_file, "gamma: %.4lf\n", gamma);
    fprintf(setting_file, "alpha: %.4lf\n", alpha);
    fprintf(setting_file, "gamma_a: %.2lf\n", gamma_a);
    fprintf(setting_file, "gamma_b: %.4lf\n", gamma_b);
    // BUG FIX: these two lines were mislabeled "gamma_a"/"gamma_b" while
    // printing the alpha hyperparameters.
    fprintf(setting_file, "alpha_a: %.2lf\n", alpha_a);
    fprintf(setting_file, "alpha_b: %.4lf\n", alpha_b);
    fprintf(setting_file, "sample_hyper: %d\n", sample_hyper);

    fclose(setting_file);

    Corpus* c_train = NULL;

    printf("Reading training data from %s.\n", train_data);
    // Reading the training data.
    c_train = new Corpus();
    c_train->read_data(train_data);

    // Open the log file for training data.
    sprintf(name, "%s/train.log", directory);
    FILE* train_log = fopen(name, "w");
    if (train_log == NULL) {
      printf("Cannot open %s for writing, exit.\n", name);
      exit(-1);
    }
    // Heldout columns record the documents that have not been seen before.
    sprintf(name, "time\titer\tnum.topics\tgamma\talpha\t\tword.count\tlikelihood\tavg.likelihood");
    if (verbose) printf("%s\n", name);
    fprintf(train_log, "%s\n", name);

    // Start iterating.
    time_t start, current;
    int total_time = 0;
    int iter = 0;

    HDP* hdp = new HDP();
    hdp->init_hdp(eta, gamma, alpha, c_train->size_vocab_);

    // Setting up the hdp state.
    hdp->setup_doc_states(c_train->docs_);
    // First iteration: no table/topic resampling yet.
    hdp->iterate_gibbs_state(false, false);

    // -1 means "no limit" for both max_iter and max_time.
    while ((max_iter == -1 || iter < max_iter) && (max_time == -1 || total_time < max_time)) {
      ++iter;
      time(&start);

      // Iterations.
      hdp->iterate_gibbs_state(true, true);
      // Scoring the documents.
      double likelihood = hdp->log_likelihood(NULL);
      hdp->compact_hdp_state();

      if (sample_hyper) hdp->hyper_inference(gamma_a, gamma_b, alpha_a, alpha_b);

      // Record the time.
      time(&current);
      int elapse = (int) difftime(current, start);
      total_time += elapse;

      sprintf(name, "%d\t%d\t%d\t\t%.5f\t%.5f\t\t%d\t\t%.3f\t%.5f",
              total_time, iter, hdp->hdp_state_->num_topics_, hdp->hdp_state_->gamma_,
              hdp->hdp_state_->alpha_, c_train->num_total_words_, likelihood, likelihood/c_train->num_total_words_);

      if (verbose) printf("%s\n", name);
      fprintf(train_log, "%s\n", name);
      fflush(train_log);

      if (save_lag > 0 && (iter % save_lag == 0)) {
        sprintf(name, "%s/iter@%05d", directory, iter);
        hdp->save_state(name);
      }
    }

    sprintf(name, "%s/final", directory);
    hdp->save_state(name);

    // Free training data.
    if (c_train != NULL) {
      delete c_train;
    }
    fclose(train_log);

    delete hdp;
  }

  if (test_data != NULL && model_prefix != NULL) {
    Corpus* c_test = new Corpus();
    c_test->read_data(test_data);

    HDP* hdp = new HDP();
    printf("Loading model from prefix %s...\n", model_prefix);
    hdp->load_state(model_prefix);

    // Remember the old state so held-out likelihood is computed against it.
    HDPState* old_hdp_state = new HDPState();
    old_hdp_state->copy_hdp_state(*hdp->hdp_state_);

    hdp->setup_doc_states(c_test->docs_);

    if (verbose) printf("Initialization ...\n");
    hdp->iterate_gibbs_state(false, false);

    sprintf(name, "%s/%s-test.log", directory, basename(model_prefix));
    FILE* test_log = fopen(name, "w");
    if (test_log == NULL) {
      printf("Cannot open %s for writing, exit.\n", name);
      exit(-1);
    }
    sprintf(name, "time\titer\tnum.topics\tword.count\tlikelihood\tavg.likelihood");
    if (verbose) printf("%s\n", name);
    fprintf(test_log, "%s\n", name);

    time_t start, current;
    int total_time = 0;
    int iter = 0;

    // Iterations.
    while ((max_iter == -1 || iter < max_iter) && (max_time == -1 || total_time < max_time)) {
      ++iter;
      time(&start);
      hdp->iterate_gibbs_state(true, true);
      double likelihood = hdp->log_likelihood(old_hdp_state);
      hdp->compact_hdp_state();
      time(&current);
      int elapse = (int) difftime(current, start);
      total_time += elapse;

      sprintf(name, "%d\t%d\t%d\t\t%d\t\t%.3f\t%.5f",
              total_time, iter, hdp->hdp_state_->num_topics_,
              c_test->num_total_words_, likelihood,
              likelihood/c_test->num_total_words_);

      if (verbose) printf("%s\n", name);
      fprintf(test_log, "%s\n", name);
      fflush(test_log);
    }

    if (verbose) printf("Done and saving ...\n");
    sprintf(name, "%s/%s-test", directory, basename(model_prefix));
    hdp->save_state(name);
    hdp->save_doc_states(name);
    fclose(test_log);

    delete hdp;
    delete old_hdp_state;
    delete c_test;
  }

  // Free random number generator.
  free_random_number_generator(RANDOM_NUMBER);
  return 0;
}
/* Example no. 4 */
/*
 * CTR (collaborative topic regression) driver: parses options with
 * getopt_long, validates input files, saves the hyperparameter settings,
 * reads the user/item rating matrices (and optionally the document
 * corpus plus LDA initializations), then runs batch or stochastic MAP
 * estimation depending on --learning_rate.
 */
int main(int argc, char* argv[]) {
  if (argc < 2) print_usage_and_exit();

  char filename[500];
  int theta_opt = 0;
  int lda_regression = 0;

  const char* const short_options = "hd:x:i:a:b:u:v:r:s:m:k:t:e:y:z:w:";
  const struct option long_options[] = {
    {"help",          no_argument,       NULL, 'h'},
    {"directory",     required_argument, NULL, 'd'},
    {"user",          required_argument, NULL, 'x'},
    {"item",          required_argument, NULL, 'i'},
    {"a",             required_argument, NULL, 'a'},
    {"b",             required_argument, NULL, 'b'},
    {"lambda_u",      required_argument, NULL, 'u'},
    {"lambda_v",      required_argument, NULL, 'v'},
    {"random_seed",   required_argument, NULL, 'r'},
    {"save_lag",      required_argument, NULL, 's'},
    {"max_iter",      required_argument, NULL, 'm'},
    {"num_factors",   required_argument, NULL, 'k'},
    {"mult",          required_argument, NULL, 't'},
    {"theta_init",    required_argument, NULL, 'e'},
    {"beta_init",     required_argument, NULL, 'y'},
    {"learning_rate", required_argument, NULL, 'z'},
    {"alpha_smooth",  required_argument, NULL, 'w'},
    {"theta_opt",     no_argument, &theta_opt, 1},
    {"lda_regression",no_argument, &lda_regression, 1},
    {NULL, 0, NULL, 0}};

  char*  directory = NULL;

  char*  user_path = NULL;
  char*  item_path = NULL;
  double a = 1.0;
  double b = 0.01;
  double lambda_u = 0.01;
  double lambda_v = 100;
  double learning_rate = -1;   // <= 0 selects batch MAP estimation
  double alpha_smooth = 0.0;

  time_t t; time(&t);
  long   random_seed = (long) t;
  int    save_lag = 20;
  int    max_iter = 200;

  int    num_factors = 200;
  char*  mult_path = NULL;
  char*  theta_init_path = NULL;
  char*  beta_init_path = NULL;

  int cc = 0;
  while (true) {
    cc = getopt_long(argc, argv, short_options, long_options, NULL);
    switch (cc) {
      case 'h':
        print_usage_and_exit();
        break;
      case 'd':
        directory = optarg;
        break;
      case 'x':
        user_path = optarg;
        break;
      case 'i':
        item_path = optarg;
        break;
      case 'a':
        a = atof(optarg);
        break;
      case 'b':
        b = atof(optarg);
        break;
      case 'u':
        lambda_u = atof(optarg);
        break;
      case 'v':
        lambda_v = atof(optarg);
        break;
      case 'z':
        learning_rate = atof(optarg);
        break;
      case 'w':
        alpha_smooth = atof(optarg);
        break;
      case 'r':
        // atol, not atoi: random_seed is a long.
        random_seed = atol(optarg);
        break;
      case 's':
        save_lag = atoi(optarg);
        break;
      case 'm':
        max_iter = atoi(optarg);
        break;
      case 'k':
        num_factors = atoi(optarg);
        break;
      case 't':
        mult_path = optarg;
        break;
      case 'e':
        theta_init_path = optarg;
        break;
      case 'y':
        beta_init_path = optarg;
        break;
      case -1:
        break;               // end of options; loop exits below
      case '?':
        print_usage_and_exit();
        break;
      default:
        break;               // flag-setting long options land here (cc == 0)
    }
    if (cc == -1)
      break;
  }

  /// print information
  printf("\n************************************************************************************************\n");

  // BUG FIX: directory was passed to dir_exists() without a NULL check.
  if (directory == NULL) {
    printf("result directory must be provided via --directory! quit ...\n");
    exit(-1);
  }
  if (!dir_exists(directory)) make_directory(directory);
  printf("result directory: %s\n", directory);

  if (!file_exists(user_path)) {
    printf("user file %s doesn't exist! quit ...\n", user_path);
    exit(-1);
  }
  printf("user file: %s\n", user_path);

  if (!file_exists(item_path)) {
    printf("item file %s doesn't exist! quit ...\n", item_path);
    exit(-1);
  }
  printf("item file: %s\n", item_path);

  printf("a: %.4f\n", a);
  printf("b: %.4f\n", b);
  printf("lambda_u: %.4f\n", lambda_u);
  printf("lambda_v: %.4f\n", lambda_v);
  printf("learning_rate: %.5f\n", learning_rate);
  printf("alpha_smooth: %.5f\n", alpha_smooth);
  printf("random seed: %d\n", (int)random_seed);
  printf("save lag: %d\n", save_lag);
  printf("max iter: %d\n", max_iter);
  printf("number of factors: %d\n", num_factors);

  if (mult_path != NULL) {
    // BUG FIX: this checked file_exists(item_path) while reporting (and
    // meaning to validate) mult_path.
    if (!file_exists(mult_path)) {
      printf("mult file %s doesn't exist! quit ...\n", mult_path);
      exit(-1);
    }
    printf("mult file: %s\n", mult_path);

    if (theta_init_path == NULL) {
      printf("topic proportions file must be provided ...\n");
      exit(-1);
    }
    if (!file_exists(theta_init_path)) {
      printf("topic proportions file %s doesn't exist! quit ...\n", theta_init_path);
      exit(-1);
    }
    printf("topic proportions file: %s\n", theta_init_path);

    if (beta_init_path == NULL) {
      printf("topic distributions file must be provided ...\n");
      exit(-1);
    }
    if (!file_exists(beta_init_path)) {
      printf("topic distributions file %s doesn't exist! quit ...\n", beta_init_path);
      exit(-1);
    }
    printf("topic distributions file: %s\n", beta_init_path);
    if (theta_opt) printf("theta optimization: True\n");
    else printf("theta optimization: false\n");
  }
  else if (theta_opt) {
    // theta_opt only makes sense when a corpus (--mult) is provided.
    printf("theta optimization: false");
    printf("(theta_opt has no effect, back to default value: false)\n");
    theta_opt = 0;
  }

  printf("\n");

  /// save the settings
  int ctr_run = 1;
  if (mult_path == NULL) ctr_run = 0;
  ctr_hyperparameter ctr_param;
  ctr_param.set(a, b, lambda_u, lambda_v, learning_rate, alpha_smooth,
      random_seed, max_iter, save_lag, theta_opt, ctr_run, lda_regression);
  sprintf(filename, "%s/settings.txt", directory);
  ctr_param.save(filename);

  /// init random number generator
  RANDOM_NUMBER = new_random_number_generator(random_seed);

  // read users
  printf("reading user matrix from %s ...\n", user_path);
  c_data* users = new c_data();
  users->read_data(user_path);
  int num_users = (int)users->m_vec_data.size();

  // read items
  printf("reading item matrix from %s ...\n", item_path);
  c_data* items = new c_data();
  items->read_data(item_path);
  int num_items = (int)items->m_vec_data.size();

  // create model instance
  c_ctr* ctr = new c_ctr();
  ctr->set_model_parameters(num_factors, num_users, num_items);

  c_corpus* c = NULL;
  if (mult_path != NULL) {
    // read word data
    c = new c_corpus();
    c->read_data(mult_path);
    ctr->read_init_information(theta_init_path, beta_init_path, c, alpha_smooth);
  }

  if (learning_rate <= 0) {
    ctr->learn_map_estimate(users, items, c, &ctr_param, directory);
  } else {
    ctr->stochastic_learn_map_estimate(users, items, c, &ctr_param, directory);
  }

  free_random_number_generator(RANDOM_NUMBER);
  if (c != NULL) delete c;

  delete ctr;
  delete users;
  delete items;
  return 0;
}