/*
 * Load a saved LDA model from disk.
 *
 * Reads "<model_root>.other" (num_topics, num_terms, alpha) and
 * "<model_root>.beta" (num_topics x num_terms matrix of log word
 * probabilities) and returns a freshly allocated lda_model.
 *
 * On any I/O or parse failure the program prints a diagnostic and
 * exits: callers assume a non-NULL, fully populated model.
 */
lda_model* load_lda_model(char* model_root)
{
    char filename[100];
    FILE* fileptr;
    int i, j, num_terms, num_topics;
    float x, alpha;

    /* read the scalar parameters from "<root>.other" */
    snprintf(filename, sizeof(filename), "%s.other", model_root);
    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    if (fileptr == NULL)
    {
        fprintf(stderr, "cannot open %s\n", filename);
        exit(1);
    }
    /* check each fscanf: reading an unmatched field would leave the
       variable uninitialized (undefined behavior when used below) */
    if (fscanf(fileptr, "num_topics %d\n", &num_topics) != 1 ||
        fscanf(fileptr, "num_terms %d\n", &num_terms) != 1 ||
        fscanf(fileptr, "alpha %f\n", &alpha) != 1)
    {
        fprintf(stderr, "malformed model file %s\n", filename);
        fclose(fileptr);
        exit(1);
    }
    fclose(fileptr);

    lda_model* model = new_lda_model(num_terms, num_topics);
    model->alpha = alpha;

    /* read the topic-word log probabilities from "<root>.beta" */
    snprintf(filename, sizeof(filename), "%s.beta", model_root);
    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    if (fileptr == NULL)
    {
        fprintf(stderr, "cannot open %s\n", filename);
        exit(1);
    }
    for (i = 0; i < num_topics; i++)
    {
        for (j = 0; j < num_terms; j++)
        {
            if (fscanf(fileptr, "%f", &x) != 1)
            {
                fprintf(stderr, "malformed beta file %s\n", filename);
                fclose(fileptr);
                exit(1);
            }
            model->log_prob_w[i][j] = x;
        }
    }
    fclose(fileptr);
    return(model);
}
/*
 * Read an LDA model's topics and alpha vector from disk.
 *
 * Loads "<name>.beta" into model->topics and "<name>.alpha" into
 * model->alpha via the mtx/vct readers.  Returns the newly allocated
 * model; ownership passes to the caller.
 *
 * NOTE(review): this passes (ntopics, nterms) to new_lda_model while
 * load_lda_model in this file passes (num_terms, num_topics) — one of
 * the two argument orders is likely swapped; verify against the
 * declaration of new_lda_model before relying on either.
 */
lda* read_lda(int ntopics, int nterms, char* name)
{
    char filename[400];
    lda* model = new_lda_model(ntopics, nterms);

    /* snprintf bounds the path so a long `name` cannot overflow */
    snprintf(filename, sizeof(filename), "%s.beta", name);
    mtx_fscanf(filename, model->topics);
    snprintf(filename, sizeof(filename), "%s.alpha", name);
    vct_fscanf(filename, model->alpha);

    return(model);
}
/*
 * Run variational EM without per-iteration file output ("quiet" mode).
 *
 * `start` selects initialization: "seeded", "fixed", "random", or any
 * other string, which is treated as a model path for load_lda_model.
 * The trained model and variational parameters are published through
 * the globals last_model / last_gamma / last_phi; nothing is returned.
 *
 * NOTE(review): var_gamma and phi are stored into globals on exit, so
 * they are intentionally not freed here — presumably the caller or a
 * later cleanup owns them; confirm against the rest of the file.
 */
void run_quiet_em(char* start, corpus* corpus)
{
    int d = 0, n = 0;
    lda_model *model = NULL;
    double **var_gamma = NULL, **phi = NULL;
    // last_gamma is a double[num_docs][num_topics]

    /* allocate variational parameters.
       BUGFIX: the old memset(var_gamma, 0.0, corpus->num_docs) zeroed
       only num_docs BYTES (and passed a double as the fill byte); the
       size must count sizeof(double*) per document. */
    var_gamma = (double**) malloc(sizeof(double*) * corpus->num_docs);
    if (var_gamma == NULL)
    {
        fprintf(stderr, "run_quiet_em: out of memory\n");
        exit(1);
    }
    memset(var_gamma, 0, sizeof(double*) * corpus->num_docs);
    for (d = 0; d < corpus->num_docs; ++d)
    {
        var_gamma[d] = (double*) malloc(sizeof(double) * NTOPICS);
        if (var_gamma[d] == NULL)
        {
            fprintf(stderr, "run_quiet_em: out of memory\n");
            exit(1);
        }
        /* all-bits-zero is 0.0 for IEEE-754 doubles */
        memset(var_gamma[d], 0, sizeof(double) * NTOPICS);
    }

    int max_length = max_corpus_length(corpus);
    phi = (double**) malloc(sizeof(double*) * max_length);
    if (phi == NULL)
    {
        fprintf(stderr, "run_quiet_em: out of memory\n");
        exit(1);
    }
    memset(phi, 0, sizeof(double*) * max_length);
    for (n = 0; n < max_length; ++n)
    {
        phi[n] = (double*) malloc(sizeof(double) * NTOPICS);
        if (phi[n] == NULL)
        {
            fprintf(stderr, "run_quiet_em: out of memory\n");
            exit(1);
        }
        memset(phi[n], 0, sizeof(double) * NTOPICS);
    }

    // initialize model
    lda_suffstats* ss = NULL;
    if (strncmp(start, "seeded", 6) == 0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        if (VERBOSE)
        {
            corpus_initialize_ss(ss, model, corpus);
        }
        else
        {
            quiet_corpus_initialize_ss(ss, model, corpus);
        }
        if (VERBOSE)
        {
            lda_mle(model, ss, 0);
        }
        else
        {
            quiet_lda_mle(model, ss, 0);
        }
    }
    else if (strncmp(start, "fixed", 5) == 0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        corpus_initialize_fixed_ss(ss, model, corpus);
        if (VERBOSE)
        {
            lda_mle(model, ss, 0);
        }
        else
        {
            quiet_lda_mle(model, ss, 0);
        }
    }
    else if (strncmp(start, "random", 6) == 0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE)
        {
            lda_mle(model, ss, 0);
        }
        else
        {
            quiet_lda_mle(model, ss, 0);
        }
    }
    else
    {
        /* any other string is interpreted as a saved-model path */
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization
    int i = 0;
    double likelihood = 0.0, likelihood_old = 0, converged = 1;
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) &&
           (i <= EM_MAX_ITER))
    {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 1000) == 0 && VERBOSE)
                printf("document %d\n", d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi,
                                     model, ss);
        }

        // m-step
        if (VERBOSE)
        {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        }
        else
        {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; a negative value means the bound went
        // down, so give the variational inference more iterations
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0)
            VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood
        last_model = model;
        last_gamma = var_gamma;
        last_phi = phi;
    }

    // output the final model
    last_model = model;
    last_gamma = var_gamma;
    last_phi = phi;

    free_lda_suffstats(model, ss);
}
/*
 * Run variational EM and write per-iteration output to `directory`.
 *
 * `start` selects initialization ("seeded", "random", or a saved-model
 * path); intermediate models are saved every LAG iterations, the final
 * model, gamma matrix, likelihood trace, and per-word topic
 * assignments are written under `directory`.
 *
 * Fixes over the original: `likelihood` is initialized before use (it
 * is read after the EM loop), file handles from fopen are checked
 * before use, path construction is bounds-checked with snprintf, and
 * allocation failures abort instead of dereferencing NULL.
 */
void run_em(char* start, char* directory, corpus* corpus)
{
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters
    var_gamma = malloc(sizeof(double*) * (corpus->num_docs));
    if (var_gamma == NULL)
    {
        fprintf(stderr, "run_em: out of memory\n");
        exit(1);
    }
    for (d = 0; d < corpus->num_docs; d++)
    {
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);
        if (var_gamma[d] == NULL)
        {
            fprintf(stderr, "run_em: out of memory\n");
            exit(1);
        }
    }

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*) * max_length);
    if (phi == NULL)
    {
        fprintf(stderr, "run_em: out of memory\n");
        exit(1);
    }
    for (n = 0; n < max_length; n++)
    {
        phi[n] = malloc(sizeof(double) * NTOPICS);
        if (phi[n] == NULL)
        {
            fprintf(stderr, "run_em: out of memory\n");
            exit(1);
        }
    }

    // initialize model
    char filename[100];
    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded") == 0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE)
        {
            lda_mle(model, ss, 0);
        }
        else
        {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    }
    else if (strcmp(start, "random") == 0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE)
        {
            lda_mle(model, ss, 0);
        }
        else
        {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    }
    else
    {
        /* any other string is interpreted as a saved-model path */
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    snprintf(filename, sizeof(filename), "%s/000", directory);
    save_lda_model(model, filename);

    // run expectation maximization
    int i = 0;
    /* likelihood is also read after the loop (final e-step below), so
       it must be initialized even if the loop body never runs */
    double likelihood = 0, likelihood_old = 0, converged = 1;
    snprintf(filename, sizeof(filename), "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");
    if (likelihood_file == NULL)
    {
        fprintf(stderr, "cannot open %s for writing\n", filename);
        exit(1);
    }

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) &&
           (i <= EM_MAX_ITER))
    {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 1000) == 0 && VERBOSE)
                printf("document %d\n", d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi,
                                     model, ss);
        }
        printf("m-step\n");

        // m-step
        if (VERBOSE)
        {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        }
        else
        {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; a negative value means the bound went
        // down, so give the variational inference more iterations
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0)
            VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood
        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            snprintf(filename, sizeof(filename), "%s/%03d", directory, i);
            save_lda_model(model, filename);
            snprintf(filename, sizeof(filename), "%s/%03d.gamma",
                     directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs,
                       model->num_topics);
        }
    }

    // output the final model
    snprintf(filename, sizeof(filename), "%s/final", directory);
    save_lda_model(model, filename);
    snprintf(filename, sizeof(filename), "%s/final.gamma", directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)
    snprintf(filename, sizeof(filename), "%s/word-assignments.dat",
             directory);
    FILE* w_asgn_file = fopen(filename, "w");
    if (w_asgn_file == NULL)
    {
        fprintf(stderr, "cannot open %s for writing\n", filename);
        fclose(likelihood_file);
        exit(1);
    }
    short error = 0;
    double tl = 0.0;
    for (d = 0; d < corpus->num_docs; d++)
    {
        if ((d % 100) == 0 && VERBOSE)
            printf("final e step document %d\n", d);
        error = 0;
        tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,
                           &error);
        if (error)
        {
            /* skip documents whose inference failed */
            continue;
        }
        likelihood += tl;
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    fclose(likelihood_file);
}