int pod_experiment(char* observed_data, char* heldout_data, char* model_root, char* out) { corpus *obs, *heldout; llna_model *model; llna_var_param *var; int i; gsl_vector *log_lhood, *e_theta; doc obs_doc, heldout_doc; char string[100]; double total_lhood = 0, total_words = 0, l; FILE* e_theta_file = fopen("/Users/blei/llna050_e_theta.txt", "w"); // load model and data obs = read_data(observed_data); heldout = read_data(heldout_data); assert(obs->ndocs == heldout->ndocs); model = read_llna_model(model_root); // run experiment init_temp_vectors(model->k-1); // !!! hacky log_lhood = gsl_vector_alloc(obs->ndocs + 1); e_theta = gsl_vector_alloc(model->k); for (i = 0; i < obs->ndocs; i++) { // get observed and heldout documents obs_doc = obs->docs[i]; heldout_doc = heldout->docs[i]; // compute variational distribution var = new_llna_var_param(obs_doc.nterms, model->k); init_var_unif(var, &obs_doc, model); var_inference(var, &obs_doc, model); expected_theta(var, &obs_doc, model, e_theta); vfprint(e_theta, e_theta_file); // approximate inference of held out data l = log_mult_prob(&heldout_doc, e_theta, model->log_beta); vset(log_lhood, i, l); total_words += heldout_doc.total; total_lhood += l; printf("hid doc %d log_lhood %5.5f\n", i, vget(log_lhood, i)); // save results? free_llna_var_param(var); } vset(log_lhood, obs->ndocs, exp(-total_lhood/total_words)); printf("perplexity : %5.10f", exp(-total_lhood/total_words)); sprintf(string, "%s-pod-llna.dat", out); printf_vector(string, log_lhood); return(0); }
void inference(char* dataset, char* model_root, char* out) { int i; char fname[100]; // read the data and model corpus * corpus = read_data(dataset); llna_model * model = read_llna_model(model_root); gsl_vector * lhood = gsl_vector_alloc(corpus->ndocs); gsl_matrix * corpus_nu = gsl_matrix_alloc(corpus->ndocs, model->k); gsl_matrix * corpus_lambda = gsl_matrix_alloc(corpus->ndocs, model->k); // gsl_matrix * topic_lhoods = gsl_matrix_alloc(corpus->ndocs, model->k); gsl_matrix * phi_sums = gsl_matrix_alloc(corpus->ndocs, model->k); // approximate inference init_temp_vectors(model->k-1); // !!! hacky sprintf(fname, "%s-word-assgn.dat", out); FILE* word_assignment_file = fopen(fname, "w"); for (i = 0; i < corpus->ndocs; i++) { doc doc = corpus->docs[i]; llna_var_param * var = new_llna_var_param(doc.nterms, model->k); init_var_unif(var, &doc, model); vset(lhood, i, var_inference(var, &doc, model)); gsl_matrix_set_row(corpus_lambda, i, var->lambda); gsl_matrix_set_row(corpus_nu, i, var->nu); gsl_vector curr_row = gsl_matrix_row(phi_sums, i).vector; col_sum(var->phi, &curr_row); write_word_assignment(word_assignment_file, &doc, var->phi); printf("document %05d, niter = %05d\n", i, var->niter); free_llna_var_param(var); } // output likelihood and some variational parameters sprintf(fname, "%s-ctm-lhood.dat", out); printf_vector(fname, lhood); sprintf(fname, "%s-lambda.dat", out); printf_matrix(fname, corpus_lambda); sprintf(fname, "%s-nu.dat", out); printf_matrix(fname, corpus_nu); sprintf(fname, "%s-phi-sum.dat", out); printf_matrix(fname, phi_sums); }
int main (int argc, char **argv) { int i, j; int interactive_test = 0; CA_Arbdata *ca_arbdata = NULL; /* new, link btw acc/syn */ //char *modelmap = NULL; char *arbfile = NULL; char* q; modelID model_sequence[128]; char pronunciation[256]; int pronunciation_len; int rc; srec_arbdata *allotree = NULL; /* initial memory */ CHKLOG(rc, PMemInit()); if(argc<=1){ printf("USAGE: -swiarb <swiarb file> -interactive\n"); exit(1); } for(i=1; i<argc; i++) { if(!strcmp(argv[i],"-swiarb")) { if(argc==2){ printf("Please specify the swiarb file.\n"); exit(1); } arbfile = argv[++i]; printf("using swiarb from file %s\n", arbfile); } else if(!strcmp(argv[i],"-interactive")) { interactive_test++; } else { printf("error_usage: argument [%s]\n", argv[i]); exit(1); } } /* get modelID for a triphone */ ca_arbdata = CA_LoadArbdata(arbfile); for(i=0; i<MAX_INTERACTIVE_NUM; i++){ if(interactive_test){ printf("Type \"quit\" to exit the test.\n"); printf("pronunciation: "); q = fgets(pronunciation, sizeof(pronunciation), stdin); if(!strcmp(q,"quit\n")) break; } else{ printf("USAGE: -swiarb <swiarb file> -interactive\n"); exit(1); } pronunciation_len = strlen(pronunciation)-1; CA_ArbdataGetModelIdsForPron(ca_arbdata, pronunciation, pronunciation_len, &model_sequence[0]); printf("short pronunciation length is %d.\n", pronunciation_len); printf("Acoustic model IDs (\"#\" is silence,\"_\" is word boundary):\n"); for (j=0;j<pronunciation_len;j++){ if(j==0){ if(pronunciation_len==1) printf("triphone:_%c_ -> ModelID:%d\n", pronunciation[j], model_sequence[j]); else printf("triphone:_%c%c -> ModelID:%d\n", pronunciation[j], pronunciation[j+1], model_sequence[j]); } else if(j==(pronunciation_len-1)){ printf("triphone:%c%c_ -> ModelID:%d\n", pronunciation[j-1], pronunciation[j], model_sequence[j]); } else{ printf("triphone:%c%c%c -> ModelID:%d\n", pronunciation[j-1], pronunciation[j], pronunciation[j+1], model_sequence[j]); } allotree = (srec_arbdata*)ca_arbdata; printf_vector("pel_ids: ", " %d", allotree->hmm_infos[model_sequence[j]].state_indices, (unsigned int) allotree->hmm_infos[model_sequence[j]].num_states); printf("\n"); } } CA_FreeArbdata( ca_arbdata); PMemShutdown(); return 0; CLEANUP: return 1; }