void inference(char* dataset, char* model_root, char* out) { int i; char fname[100]; // read the data and model corpus * corpus = read_data(dataset); llna_model * model = read_llna_model(model_root); gsl_vector * lhood = gsl_vector_alloc(corpus->ndocs); gsl_matrix * corpus_nu = gsl_matrix_alloc(corpus->ndocs, model->k); gsl_matrix * corpus_lambda = gsl_matrix_alloc(corpus->ndocs, model->k); // gsl_matrix * topic_lhoods = gsl_matrix_alloc(corpus->ndocs, model->k); gsl_matrix * phi_sums = gsl_matrix_alloc(corpus->ndocs, model->k); // approximate inference init_temp_vectors(model->k-1); // !!! hacky sprintf(fname, "%s-word-assgn.dat", out); FILE* word_assignment_file = fopen(fname, "w"); for (i = 0; i < corpus->ndocs; i++) { doc doc = corpus->docs[i]; llna_var_param * var = new_llna_var_param(doc.nterms, model->k); init_var_unif(var, &doc, model); vset(lhood, i, var_inference(var, &doc, model)); gsl_matrix_set_row(corpus_lambda, i, var->lambda); gsl_matrix_set_row(corpus_nu, i, var->nu); gsl_vector curr_row = gsl_matrix_row(phi_sums, i).vector; col_sum(var->phi, &curr_row); write_word_assignment(word_assignment_file, &doc, var->phi); printf("document %05d, niter = %05d\n", i, var->niter); free_llna_var_param(var); } // output likelihood and some variational parameters sprintf(fname, "%s-ctm-lhood.dat", out); printf_vector(fname, lhood); sprintf(fname, "%s-lambda.dat", out); printf_matrix(fname, corpus_lambda); sprintf(fname, "%s-nu.dat", out); printf_matrix(fname, corpus_nu); sprintf(fname, "%s-phi-sum.dat", out); printf_matrix(fname, phi_sums); }
/*
 * Fit an LDA model to `corpus` with variational EM.
 *
 * start     : "seeded" (initialize sufficient statistics from the corpus),
 *             "random" (random sufficient statistics), or otherwise a path
 *             handed to load_lda_model()
 * directory : output directory; receives 000, likelihood.dat, a snapshot
 *             <iter> / <iter>.gamma every LAG iterations, final, final.gamma
 *             and word-assignments.dat
 * corpus    : documents to fit
 *
 * The loop runs while the relative likelihood change is negative or above
 * EM_CONVERGED (or fewer than three iterations have run), and stops once the
 * iteration counter exceeds EM_MAX_ITER. When the likelihood gets worse
 * (converged < 0) the global VAR_MAX_ITER is doubled.
 *
 * Path names are built in a 100-byte buffer; an over-long directory is
 * truncated instead of overrunning the buffer. Log files are best-effort:
 * if one cannot be opened a warning is printed and that output is skipped.
 *
 * NOTE(review): `model` and `ss` are not released by this function.
 */
void run_em(char* start, char* directory, corpus* corpus)
{
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model
    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    }
    else if (strcmp(start, "random")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    }
    else
    {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    snprintf(filename, sizeof(filename), "%s/000", directory);
    save_lda_model(model, filename);

    // run expectation maximization
    int i = 0;
    // likelihood starts at 0 so the final pass never reads an indeterminate
    // value when the loop below does not execute
    double likelihood = 0, likelihood_old = 0, converged = 1;

    snprintf(filename, sizeof(filename), "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");
    if (likelihood_file == NULL)
    {
        fprintf(stderr, "warning: cannot open %s for writing; "
                "likelihood log disabled\n", filename);
    }

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER))
    {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]),
                                     var_gamma[d],
                                     phi,
                                     model,
                                     ss);
        }
        printf("m-step\n");

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence
        // (likelihood_old is 0 on the first pass, so this is a floating-point
        // division by zero; the (i <= 2) clause above keeps the loop going)
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood
        if (likelihood_file != NULL)
        {
            fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
            fflush(likelihood_file);
        }
        if ((i % LAG) == 0)
        {
            snprintf(filename, sizeof(filename), "%s/%03d", directory, i);
            save_lda_model(model, filename);
            snprintf(filename, sizeof(filename), "%s/%03d.gamma", directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model
    snprintf(filename, sizeof(filename), "%s/final", directory);
    save_lda_model(model, filename);
    snprintf(filename, sizeof(filename), "%s/final.gamma", directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)
    snprintf(filename, sizeof(filename), "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    if (w_asgn_file == NULL)
    {
        fprintf(stderr, "warning: cannot open %s for writing; "
                "word assignments will not be saved\n", filename);
    }

    short error = 0;
    double tl = 0.0;
    for (d = 0; d < corpus->num_docs; d++)
    {
        if ((d % 100) == 0 && VERBOSE)
            printf("final e step document %d\n",d);
        error = 0;
        tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi, &error);
        if (error) { continue; }   // skip documents whose inference reported an error
        likelihood += tl;
        if (w_asgn_file != NULL)
            write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    if (w_asgn_file != NULL)
        fclose(w_asgn_file);
    if (likelihood_file != NULL)
        fclose(likelihood_file);

    // release the variational parameter buffers allocated above
    for (n = 0; n < max_length; n++)
        free(phi[n]);
    free(phi);
    for (d = 0; d < corpus->num_docs; d++)
        free(var_gamma[d]);
    free(var_gamma);
}
/* * learn dictionary and find optimum code. */ int MedSTC::train(char* start, char* directory, Corpus* pC, Params *param) { m_dDeltaEll = param->DELTA_ELL; m_dLambda = param->LAMBDA; m_dRho = param->RHO; m_dGamma = m_dLambda; long runtime_start = get_runtime(); // allocate variational parameters double ***phi = (double***)malloc(sizeof(double**) * pC->num_docs); for ( int d=0; d<pC->num_docs; d++ ) { phi[d] = (double**)malloc(sizeof(double*)*pC->docs[d].length); for (int n=0; n<pC->docs[d].length; n++) { phi[d][n] = (double*)malloc(sizeof(double) * param->NTOPICS); } } double **theta = (double**)malloc(sizeof(double*)*(pC->num_docs)); for (int d=0; d<pC->num_docs; d++) { theta[d] = (double*)malloc(sizeof(double) * param->NTOPICS); } for ( int d=0; d<pC->num_docs; d++ ) { init_phi(&(pC->docs[d]), phi[d], theta[d], param); } // initialize model if (strcmp(start, "random")==0) { new_model(pC->num_docs, pC->num_terms, param->NTOPICS, param->NLABELS, param->INITIAL_C); init_param( pC ); } else { load_model(start); m_dC = param->INITIAL_C; } strcpy(m_directory, directory); char filename[100]; // run expectation maximization sprintf(filename, "%s/lhood.dat", directory); FILE* lhood_file = fopen(filename, "w"); Document *pDoc = NULL; double dobj, obj_old = 1, converged = 1; int nIt = 0; while (((converged < 0) || (converged > param->EM_CONVERGED) || (nIt <= 2)) && (nIt <= param->EM_MAX_ITER)) { dobj = 0; double dLogLoss = 0; for ( int d=0; d<pC->num_docs; d++ ) { pDoc = &(pC->docs[d]); dobj += sparse_coding( pDoc, d, param, theta[d], phi[d] ); dLogLoss += m_dLogLoss; } // m-step dict_learn(pC, theta, phi, param, false); if ( param->SUPERVISED == 1 ) { // for supervised MedLDA. 
char buff[512]; get_train_filename( buff, m_directory, param ); outputLowDimData( buff, pC, theta ); svmStructSolver(buff, param, m_dMu); if ( param->PRIMALSVM == 1 ) { // solve svm in the primal form for ( int d=0; d<pC->num_docs; d++ ) { loss_aug_predict( &(pC->docs[d]), theta[d] ); } } dobj += m_dsvm_primalobj; } else ; // check for convergence converged = fabs(1 - dobj / obj_old); obj_old = dobj; // output model and lhood if ( param->SUPERVISED == 1 ) { fprintf(lhood_file, "%10.10f\t%10.10f\t%5.5e\t%.5f\n", dobj-m_dsvm_primalobj, dobj, converged, dLogLoss); } else { fprintf(lhood_file, "%10.10f\t%5.5e\t%.5f\n", dobj, converged, dLogLoss); } fflush(lhood_file); if ( nIt > 0 && (nIt % LAG) == 0) { sprintf( filename, "%s/%d", directory, nIt + 1); save_model( filename, -1 ); sprintf( filename, "%s/%d.theta", directory, nIt + 1 ); save_theta( filename, theta, pC->num_docs, m_nK ); } nIt ++; } // learn the final SVM. if ( param->SUPERVISED == 0 ) { char buff[512]; get_train_filename(buff, m_directory, param); outputLowDimData(buff, pC, theta); svmStructSolver(buff, param, m_dMu); } long runtime_end = get_runtime(); double dTrainTime = ((double)runtime_end-(double)runtime_start) / 100.0; // output the final model sprintf( filename, "%s/final", directory); save_model( filename, dTrainTime ); // output the word assignments (for visualization) int nNum = 0, nAcc = 0; sprintf(filename, "%s/word-assignments.dat", directory); FILE* w_asgn_file = fopen(filename, "w"); for (int d=0; d<pC->num_docs; d++) { sparse_coding( &(pC->docs[d]), d, param, theta[d], phi[d] ); write_word_assignment(w_asgn_file, &(pC->docs[d]), phi[d]); nNum ++; pC->docs[d].predlabel = predict(theta[d]); if ( pC->docs[d].gndlabel == pC->docs[d].predlabel ) nAcc ++; } fclose(w_asgn_file); fclose(lhood_file); sprintf(filename,"%s/train.theta",directory); save_theta(filename, theta, pC->num_docs, m_nK); for (int d=0; d<pC->num_docs; d++) { free( theta[d] ); for (int n=0; n<pC->docs[d].length; n++) free( 
phi[d][n] ); free( phi[d] ); } free( theta ); free( phi ); return nIt; }