Exemplo n.º 1
0
/*
 * Held-out prediction experiment.
 *
 * For each document i, variational inference is run on the observed
 * document obs->docs[i]; the resulting expected theta is then used with
 * log_mult_prob() to score the matching held-out document
 * heldout->docs[i].  The per-document scores are stored in a vector
 * whose final extra slot receives exp(-total_lhood / total_words) (the
 * value printed as "perplexity"), and the vector is written to
 * "<out>-pod-llna.dat".
 *
 * observed_data : path passed to read_data() for the observed corpus
 * heldout_data  : path passed to read_data() for the held-out corpus;
 *                 must contain the same number of documents (asserted)
 * model_root    : root name passed to read_llna_model()
 * out           : prefix of the output file name
 *
 * Returns 0 on success, 1 if the output file name does not fit in the
 * local buffer.
 *
 * NOTE(review): the corpora and the model are not released here because
 * no matching free routine is visible from this function.
 */
int pod_experiment(char* observed_data, char* heldout_data,
                   char* model_root, char* out)
{
    corpus *obs, *heldout;
    llna_model *model;
    llna_var_param *var;
    int i, n;
    gsl_vector *log_lhood, *e_theta;
    doc obs_doc, heldout_doc;
    char string[100];
    double total_lhood = 0, total_words = 0, l, perplexity;
    FILE* e_theta_file;

    // build the output name up front so an over-long prefix is rejected
    // before any expensive work (the old sprintf overflowed the buffer)
    n = snprintf(string, sizeof(string), "%s-pod-llna.dat", out);
    if (n < 0 || n >= (int) sizeof(string))
    {
        fprintf(stderr, "pod_experiment: output prefix too long: %s\n", out);
        return(1);
    }

    // the e_theta dump goes to a fixed debugging path; treat it as
    // best-effort and carry on without it when it cannot be opened
    e_theta_file = fopen("/Users/blei/llna050_e_theta.txt", "w");
    if (e_theta_file == NULL)
        fprintf(stderr, "pod_experiment: cannot open e_theta dump file; "
                        "skipping e_theta output\n");

    // load model and data
    obs = read_data(observed_data);
    heldout = read_data(heldout_data);
    assert(obs->ndocs == heldout->ndocs);
    model = read_llna_model(model_root);

    // run experiment
    init_temp_vectors(model->k-1); // !!! hacky
    log_lhood = gsl_vector_alloc(obs->ndocs + 1); // +1 slot for perplexity
    e_theta = gsl_vector_alloc(model->k);
    for (i = 0; i < obs->ndocs; i++)
    {
        // get observed and heldout documents
        obs_doc = obs->docs[i];
        heldout_doc = heldout->docs[i];
        // compute variational distribution
        var = new_llna_var_param(obs_doc.nterms, model->k);
        init_var_unif(var, &obs_doc, model);
        var_inference(var, &obs_doc, model);
        expected_theta(var, &obs_doc, model, e_theta);

        if (e_theta_file != NULL)
            vfprint(e_theta, e_theta_file);

        // approximate inference of held out data
        l = log_mult_prob(&heldout_doc, e_theta, model->log_beta);
        vset(log_lhood, i, l);
        total_words += heldout_doc.total;
        total_lhood += l;
        printf("hid doc %d    log_lhood %5.5f\n", i, vget(log_lhood, i));
        // save results?
        free_llna_var_param(var);
    }

    // note: if the held-out corpus has no words this is 0/0 and the
    // result is not a meaningful number
    perplexity = exp(-total_lhood/total_words);
    vset(log_lhood, obs->ndocs, perplexity);
    printf("perplexity : %5.10f", perplexity);
    printf_vector(string, log_lhood);

    // release everything this function allocated itself
    if (e_theta_file != NULL)
        fclose(e_theta_file);
    gsl_vector_free(e_theta);
    gsl_vector_free(log_lhood);
    return(0);
}
Exemplo n.º 2
0
/*
 * Run variational inference on every document of a corpus and write
 * the results to files named after the prefix `out`:
 *
 *   <out>-word-assgn.dat : written per document by write_word_assignment()
 *   <out>-ctm-lhood.dat  : value returned by var_inference() per document
 *   <out>-lambda.dat     : var->lambda, one row per document
 *   <out>-nu.dat         : var->nu, one row per document
 *   <out>-phi-sum.dat    : col_sum() of var->phi, one row per document
 *
 * dataset    : path passed to read_data()
 * model_root : root name passed to read_llna_model()
 * out        : prefix for all output file names
 *
 * The function returns without doing any inference if the output names
 * do not fit the local buffer or the word-assignment file cannot be
 * opened; a message is printed to stderr in both cases.
 *
 * NOTE(review): the corpus and the model are not released here because
 * no matching free routine is visible from this function.
 */
void inference(char* dataset, char* model_root, char* out)
{
    int i, n;
    char fname[100];
    FILE* word_assignment_file;

    // "-word-assgn.dat" is the longest suffix used below, so if this
    // name fits the buffer every later name fits as well
    n = snprintf(fname, sizeof(fname), "%s-word-assgn.dat", out);
    if (n < 0 || n >= (int) sizeof(fname))
    {
        fprintf(stderr, "inference: output prefix too long: %s\n", out);
        return;
    }

    // read the data and model
    corpus * data = read_data(dataset);
    llna_model * model = read_llna_model(model_root);
    gsl_vector * lhood = gsl_vector_alloc(data->ndocs);
    gsl_matrix * corpus_nu = gsl_matrix_alloc(data->ndocs, model->k);
    gsl_matrix * corpus_lambda = gsl_matrix_alloc(data->ndocs, model->k);
    gsl_matrix * phi_sums = gsl_matrix_alloc(data->ndocs, model->k);

    // approximate inference
    init_temp_vectors(model->k-1); // !!! hacky
    word_assignment_file = fopen(fname, "w");
    if (word_assignment_file == NULL)
    {
        fprintf(stderr, "inference: cannot open %s for writing\n", fname);
        goto cleanup;
    }

    for (i = 0; i < data->ndocs; i++)
    {
        doc cur_doc = data->docs[i];
        llna_var_param * var = new_llna_var_param(cur_doc.nterms, model->k);
        init_var_unif(var, &cur_doc, model);

        vset(lhood, i, var_inference(var, &cur_doc, model));
        gsl_matrix_set_row(corpus_lambda, i, var->lambda);
        gsl_matrix_set_row(corpus_nu, i, var->nu);
        // view onto row i of phi_sums so col_sum() writes in place
        gsl_vector curr_row = gsl_matrix_row(phi_sums, i).vector;
        col_sum(var->phi, &curr_row);
        write_word_assignment(word_assignment_file, &cur_doc, var->phi);

        printf("document %05d, niter = %05d\n", i, var->niter);
        free_llna_var_param(var);
    }

    if (fclose(word_assignment_file) != 0)
        fprintf(stderr, "inference: error closing %s\n", fname);

    // output likelihood and some variational parameters
    snprintf(fname, sizeof(fname), "%s-ctm-lhood.dat", out);
    printf_vector(fname, lhood);
    snprintf(fname, sizeof(fname), "%s-lambda.dat", out);
    printf_matrix(fname, corpus_lambda);
    snprintf(fname, sizeof(fname), "%s-nu.dat", out);
    printf_matrix(fname, corpus_nu);
    snprintf(fname, sizeof(fname), "%s-phi-sum.dat", out);
    printf_matrix(fname, phi_sums);

cleanup:
    // release the GSL objects allocated above
    gsl_matrix_free(phi_sums);
    gsl_matrix_free(corpus_lambda);
    gsl_matrix_free(corpus_nu);
    gsl_vector_free(lhood);
}
Exemplo n.º 3
0
/*
 * Interactive test driver: loads an arbdata file and, for each
 * pronunciation typed on stdin, prints the acoustic model ID obtained
 * for every position together with that model's state indices.
 *
 * Command line:  -swiarb <swiarb file> -interactive
 *
 * Both options are required: without -interactive the program prints
 * the usage text and exits with status 1.
 *
 * Returns 0 on normal termination ("quit", end of input, or after
 * MAX_INTERACTIVE_NUM prompts) and 1 on failure.
 */
int main (int argc, char **argv)
{
	int i, j;
	int interactive_test = 0;
	CA_Arbdata *ca_arbdata = NULL;     /* new, link btw acc/syn */
	char *arbfile = NULL;
	char* q;
	modelID model_sequence[128];
	/* one model ID is read back per pronunciation character, so the
	   pronunciation may not be longer than model_sequence */
	const int max_pron_len =
	    (int)(sizeof(model_sequence) / sizeof(model_sequence[0]));
	char pronunciation[256];
	int pronunciation_len;
	int rc;
	srec_arbdata *allotree = NULL;

	/* initial memory */
	CHKLOG(rc, PMemInit());

	if(argc<=1){
		printf("USAGE: -swiarb <swiarb file> -interactive\n");
		exit(1);
	}

	for(i=1; i<argc; i++) {
		if(!strcmp(argv[i],"-swiarb")) {
			/* the option needs a value; checking the position (not
			   just argc==2) also covers "-interactive -swiarb" */
			if(i+1 >= argc){
				printf("Please specify the swiarb file.\n");
				exit(1);
			}
			arbfile = argv[++i];
			printf("using swiarb from file %s\n", arbfile);
		} else if(!strcmp(argv[i],"-interactive")) {
			interactive_test++;
		} else {
			printf("error_usage: argument [%s]\n", argv[i]);
			exit(1);
		}
	}

	/* only the interactive mode is implemented; bail out before
	   loading anything */
	if(!interactive_test){
		printf("USAGE: -swiarb <swiarb file> -interactive\n");
		exit(1);
	}
	if(arbfile == NULL){
		printf("Please specify the swiarb file.\n");
		exit(1);
	}

	/* get modelID for a triphone */
	ca_arbdata = CA_LoadArbdata(arbfile);
	if(ca_arbdata == NULL){
		printf("error: could not load swiarb file %s\n", arbfile);
		PMemShutdown();
		return 1;
	}
	allotree = (srec_arbdata*)ca_arbdata;

	for(i=0; i<MAX_INTERACTIVE_NUM; i++){

		printf("Type \"quit\" to exit the test.\n");
		printf("pronunciation: ");
		q = fgets(pronunciation, sizeof(pronunciation), stdin);
		if(q == NULL) break;   /* end of input or read error */
		if(!strcmp(q,"quit\n") || !strcmp(q,"quit")) break;

		/* Length without the trailing newline.  The buffer itself is
		   left untouched so the callee sees the same bytes as before. */
		pronunciation_len = (int) strlen(pronunciation);
		if(pronunciation_len > 0 && pronunciation[pronunciation_len-1] == '\n'){
			pronunciation_len--;
		} else {
			/* no newline: the line was longer than the buffer or the
			   input ended; discard the remainder of the line so it is
			   not mistaken for the next pronunciation */
			int c;
			while((c = getchar()) != '\n' && c != EOF) {
			}
		}

		if(pronunciation_len > max_pron_len){
			printf("error: pronunciation too long (max %d characters)\n",
			       max_pron_len);
			continue;
		}

		CA_ArbdataGetModelIdsForPron(ca_arbdata,
		                             pronunciation, pronunciation_len,
		                             &model_sequence[0]);

		printf("short pronunciation length is %d.\n", pronunciation_len);
		printf("Acoustic model IDs (\"#\" is silence,\"_\" is word boundary):\n");
		for (j=0;j<pronunciation_len;j++){

			if(j==0){
				/* first position: word boundary on the left, and also
				   on the right when it is the only character */
				if(pronunciation_len==1) {
					printf("triphone:_%c_ -> ModelID:%d\n", pronunciation[j], model_sequence[j]);
				} else {
					printf("triphone:_%c%c -> ModelID:%d\n", pronunciation[j], pronunciation[j+1],
					       model_sequence[j]);
				}
			}
			else if(j==(pronunciation_len-1)){
				/* last position: word boundary on the right */
				printf("triphone:%c%c_ -> ModelID:%d\n", pronunciation[j-1], pronunciation[j], model_sequence[j]);
			}
			else{
				printf("triphone:%c%c%c -> ModelID:%d\n", pronunciation[j-1], pronunciation[j], pronunciation[j+1],
				       model_sequence[j]);
			}

			printf_vector("pel_ids: ", " %d", allotree->hmm_infos[model_sequence[j]].state_indices,
			              (unsigned int) allotree->hmm_infos[model_sequence[j]].num_states);
			printf("\n");

		}
	}

	CA_FreeArbdata( ca_arbdata);

	PMemShutdown();
	return 0;
CLEANUP:
	return 1;
}