Esempio n. 1
0
/*
 * Run variational inference on every document of a corpus under a
 * previously saved model and write the per-document results to files
 * whose names start with `out`.
 *
 *   dataset    - handed to read_data() to load the corpus
 *   model_root - handed to read_llna_model() to load the model
 *   out        - prefix of the output files:
 *                  <out>-word-assgn.dat  output of write_word_assignment()
 *                  <out>-ctm-lhood.dat   values returned by var_inference()
 *                  <out>-lambda.dat      one row of var->lambda per document
 *                  <out>-nu.dat          one row of var->nu per document
 *                  <out>-phi-sum.dat     one row of col_sum(var->phi) per document
 *
 * If the prefix does not fit the file-name buffer, or the word-assignment
 * file cannot be opened, a message is printed to stderr and the function
 * returns before loading anything.
 */
void inference(char* dataset, char* model_root, char* out)
{
    int i;
    int len;
    char fname[100];
    FILE* word_assignment_file;

    // "-word-assgn.dat" is the longest suffix appended to `out` in this
    // function, so if this name fits in fname every later one fits too.
    len = snprintf(fname, sizeof fname, "%s-word-assgn.dat", out);
    if (len < 0 || (size_t) len >= sizeof fname)
    {
        fprintf(stderr, "inference: output prefix too long: %s\n", out);
        return;
    }

    // open the output stream first so that an unwritable destination is
    // reported before any data is loaded and nothing has to be released
    word_assignment_file = fopen(fname, "w");
    if (word_assignment_file == NULL)
    {
        fprintf(stderr, "inference: cannot open %s for writing\n", fname);
        return;
    }

    // read the data and model
    corpus * corpus = read_data(dataset);
    llna_model * model = read_llna_model(model_root);
    gsl_vector * lhood = gsl_vector_alloc(corpus->ndocs);
    gsl_matrix * corpus_nu = gsl_matrix_alloc(corpus->ndocs, model->k);
    gsl_matrix * corpus_lambda = gsl_matrix_alloc(corpus->ndocs, model->k);
    gsl_matrix * phi_sums = gsl_matrix_alloc(corpus->ndocs, model->k);

    // approximate inference
    init_temp_vectors(model->k-1); // !!! hacky
    for (i = 0; i < corpus->ndocs; i++)
    {
        // work on a by-value copy of the i-th document
        doc doc = corpus->docs[i];
        llna_var_param * var = new_llna_var_param(doc.nterms, model->k);
        init_var_unif(var, &doc, model);

        vset(lhood, i, var_inference(var, &doc, model));
        gsl_matrix_set_row(corpus_lambda, i, var->lambda);
        gsl_matrix_set_row(corpus_nu, i, var->nu);
        // view onto row i of phi_sums; col_sum() writes through it
        gsl_vector curr_row = gsl_matrix_row(phi_sums, i).vector;
        col_sum(var->phi, &curr_row);
        write_word_assignment(word_assignment_file, &doc, var->phi);

        printf("document %05d, niter = %05d\n", i, var->niter);
        free_llna_var_param(var);
    }

    // closing flushes the buffered word assignments; report a failed flush
    if (fclose(word_assignment_file) != 0)
    {
        fprintf(stderr, "inference: error while closing the word assignment file\n");
    }

    // output likelihood and some variational parameters
    snprintf(fname, sizeof fname, "%s-ctm-lhood.dat", out);
    printf_vector(fname, lhood);
    snprintf(fname, sizeof fname, "%s-lambda.dat", out);
    printf_matrix(fname, corpus_lambda);
    snprintf(fname, sizeof fname, "%s-nu.dat", out);
    printf_matrix(fname, corpus_nu);
    snprintf(fname, sizeof fname, "%s-phi-sum.dat", out);
    printf_matrix(fname, phi_sums);

    // NOTE(review): corpus, model and the four GSL buffers allocated above
    // are not released here; confirm whether the caller relies on process
    // exit or whether matching free calls should be added.
}
Esempio n. 2
0
/*
 * Fit an LDA model to `corpus` with variational EM and write the results
 * under `directory`.
 *
 *   start     - "seeded" or "random" to build a fresh model from the
 *               corresponding sufficient-statistics initializer; any other
 *               value is passed to load_lda_model()
 *   directory - destination of every output file: 000, likelihood.dat,
 *               <iter> and <iter>.gamma every LAG iterations, final,
 *               final.gamma and word-assignments.dat
 *   corpus    - documents to fit
 *
 * On a too-long directory name, an allocation failure or an unopenable
 * output file, a message is printed to stderr and the function returns
 * after releasing the buffers it allocated.
 */
void run_em(char* start, char* directory, corpus* corpus) {
	int d, n;
	int i = 0;
	int len;
	int max_length = 0;
	short error = 0;
	double tl = 0.0;
	// likelihood starts at 0 so the final accumulation below never reads
	// an indeterminate value when the EM loop body is never entered
	double likelihood = 0, likelihood_old = 0, converged = 1;
	double **var_gamma = NULL, **phi = NULL;
	lda_model *model = NULL;
	lda_suffstats *ss = NULL;
	FILE *likelihood_file = NULL;
	FILE *w_asgn_file = NULL;
	char filename[100];

	// "/word-assignments.dat" is the longest suffix appended to `directory`
	// in this function (the numbered names are shorter for a 32-bit int),
	// so one check up front covers every later file name
	len = snprintf(filename, sizeof filename, "%s/word-assignments.dat", directory);
	if (len < 0 || (size_t) len >= sizeof filename) {
		fprintf(stderr, "run_em: directory name too long: %s\n", directory);
		return;
	}

	// allocate variational parameters; the pointer tables are zeroed (and
	// never zero-sized) so cleanup can free a partially filled table

	var_gamma = calloc(corpus->num_docs > 0 ? (size_t) corpus->num_docs : 1,
			sizeof *var_gamma);
	if (var_gamma == NULL)
		goto out_of_memory;
	for (d = 0; d < corpus->num_docs; d++) {
		var_gamma[d] = malloc(sizeof(double) * NTOPICS);
		if (var_gamma[d] == NULL)
			goto out_of_memory;
	}

	max_length = max_corpus_length(corpus);
	phi = calloc(max_length > 0 ? (size_t) max_length : 1, sizeof *phi);
	if (phi == NULL)
		goto out_of_memory;
	for (n = 0; n < max_length; n++) {
		phi[n] = malloc(sizeof(double) * NTOPICS);
		if (phi[n] == NULL)
			goto out_of_memory;
	}

	// initialize model

	if (strcmp(start, "seeded")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		corpus_initialize_ss(ss, model, corpus);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else if (strcmp(start, "random")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		random_initialize_ss(ss, model);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else {
		model = load_lda_model(start);
		ss = new_lda_suffstats(model);
	}

	snprintf(filename, sizeof filename, "%s/000", directory);
	save_lda_model(model, filename);

	// run expectation maximization

	snprintf(filename, sizeof filename, "%s/likelihood.dat", directory);
	likelihood_file = fopen(filename, "w");
	if (likelihood_file == NULL) {
		fprintf(stderr, "run_em: cannot open %s for writing\n", filename);
		goto cleanup;
	}

	// always run at least three iterations; stop once the relative change
	// is within [0, EM_CONVERGED] or EM_MAX_ITER is exceeded
	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
		i++;
		if (VERBOSE)
			printf("**** em iteration %d ****\n", i);
		likelihood = 0;
		zero_initialize_ss(ss, model);

		// e-step
		printf("e-step\n");

		for (d = 0; d < corpus->num_docs; d++) {
			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
		}
		printf("m-step\n");

		// m-step
		if (VERBOSE) {
			lda_mle(model, ss, ESTIMATE_ALPHA);
		} else {
			quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
		}

		// check for convergence; on the first pass likelihood_old is 0, so
		// the ratio is not finite and the (i <= 2) clause keeps the loop going
		converged = (likelihood_old - likelihood) / (likelihood_old);
		// a negative value doubles the global per-document iteration cap
		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
		likelihood_old = likelihood;

		// output model and likelihood

		fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
		fflush(likelihood_file);
		if ((i % LAG) == 0)
		{
			snprintf(filename, sizeof filename, "%s/%03d", directory, i);
			save_lda_model(model, filename);
			snprintf(filename, sizeof filename, "%s/%03d.gamma", directory, i);
			save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
		}
	}

	// output the final model

	snprintf(filename, sizeof filename, "%s/final", directory);
	save_lda_model(model, filename);
	snprintf(filename, sizeof filename, "%s/final.gamma", directory);
	save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

	// output the word assignments (for visualization)

	snprintf(filename, sizeof filename, "%s/word-assignments.dat", directory);
	w_asgn_file = fopen(filename, "w");
	if (w_asgn_file == NULL) {
		fprintf(stderr, "run_em: cannot open %s for writing\n", filename);
		goto cleanup;
	}
	for (d = 0; d < corpus->num_docs; d++)
	{
		if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
		error = 0;
		tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi, &error);
		// documents for which inference flags an error are left out
		if (error) { continue; }
		likelihood += tl;
		write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
	}
	goto cleanup;

out_of_memory:
	fprintf(stderr, "run_em: out of memory\n");

cleanup:
	if (w_asgn_file != NULL)
		fclose(w_asgn_file);
	if (likelihood_file != NULL)
		fclose(likelihood_file);
	if (phi != NULL) {
		for (n = 0; n < max_length; n++)
			free(phi[n]);
		free(phi);
	}
	if (var_gamma != NULL) {
		for (d = 0; d < corpus->num_docs; d++)
			free(var_gamma[d]);
		free(var_gamma);
	}
	// NOTE(review): model and ss are not released here because no matching
	// free routine is visible from this function; confirm and add if one exists.
}
Esempio n. 3
0
/*
* learn dictionary and find optimum code.
*/
int MedSTC::train(char* start, char* directory, Corpus* pC, Params *param)
{
	m_dDeltaEll = param->DELTA_ELL;
	m_dLambda   = param->LAMBDA;
	m_dRho      = param->RHO;
	m_dGamma    = m_dLambda;
	long runtime_start = get_runtime();

	// allocate variational parameters
	double ***phi = (double***)malloc(sizeof(double**) * pC->num_docs);
	for ( int d=0; d<pC->num_docs; d++ ) {
		phi[d] = (double**)malloc(sizeof(double*)*pC->docs[d].length);
		for (int n=0; n<pC->docs[d].length; n++) {
			phi[d][n] = (double*)malloc(sizeof(double) * param->NTOPICS);
		}
	}
	double **theta = (double**)malloc(sizeof(double*)*(pC->num_docs));
	for (int d=0; d<pC->num_docs; d++) {
		theta[d] = (double*)malloc(sizeof(double) * param->NTOPICS);
	}
	for ( int d=0; d<pC->num_docs; d++ ) {
		init_phi(&(pC->docs[d]), phi[d], theta[d], param);
	}

	// initialize model
	if (strcmp(start, "random")==0) {
		new_model(pC->num_docs, pC->num_terms, param->NTOPICS, 
								param->NLABELS, param->INITIAL_C);
		init_param( pC );
	} else {
		load_model(start);
		m_dC = param->INITIAL_C;
	}
	strcpy(m_directory, directory);

	char filename[100];
	

	// run expectation maximization
	sprintf(filename, "%s/lhood.dat", directory);
	FILE* lhood_file = fopen(filename, "w");

	Document *pDoc = NULL;
	double dobj, obj_old = 1, converged = 1;
	int nIt = 0;
	while (((converged < 0) || (converged > param->EM_CONVERGED) 
		|| (nIt <= 2)) && (nIt <= param->EM_MAX_ITER))
	{

		dobj = 0;
		double dLogLoss = 0;
		for ( int d=0; d<pC->num_docs; d++ ) {
			pDoc = &(pC->docs[d]);
			dobj += sparse_coding( pDoc, d, param, theta[d], phi[d] );
			dLogLoss += m_dLogLoss;
		}

		// m-step

		dict_learn(pC, theta, phi, param, false);

		if ( param->SUPERVISED == 1 ) { // for supervised MedLDA.
			char buff[512];
			get_train_filename( buff, m_directory, param );
			outputLowDimData( buff, pC, theta );

			svmStructSolver(buff, param, m_dMu);

			if ( param->PRIMALSVM == 1 ) { // solve svm in the primal form
				for ( int d=0; d<pC->num_docs; d++ ) {
					loss_aug_predict( &(pC->docs[d]), theta[d] );
				}
			}
			dobj += m_dsvm_primalobj;
		} else ;

		// check for convergence
		converged = fabs(1 - dobj / obj_old);
		obj_old = dobj;

		// output model and lhood
		if ( param->SUPERVISED == 1 ) {
			fprintf(lhood_file, "%10.10f\t%10.10f\t%5.5e\t%.5f\n", dobj-m_dsvm_primalobj, dobj, converged, dLogLoss);
		} else {
			fprintf(lhood_file, "%10.10f\t%5.5e\t%.5f\n", dobj, converged, dLogLoss);
		}
		fflush(lhood_file);
		if ( nIt > 0 && (nIt % LAG) == 0) {
			sprintf( filename, "%s/%d", directory, nIt + 1);
			save_model( filename, -1 );
			sprintf( filename, "%s/%d.theta", directory, nIt + 1 );
			save_theta( filename, theta, pC->num_docs, m_nK );
		}
		nIt ++;
	}
	// learn the final SVM.
	if ( param->SUPERVISED == 0 ) {
		char buff[512];
		get_train_filename(buff, m_directory, param);
		outputLowDimData(buff, pC, theta);

		svmStructSolver(buff, param, m_dMu);
	}
	long runtime_end = get_runtime();
	double dTrainTime = ((double)runtime_end-(double)runtime_start) / 100.0;


	// output the final model
	sprintf( filename, "%s/final", directory);
	save_model( filename, dTrainTime );

	// output the word assignments (for visualization)
	int nNum = 0, nAcc = 0;
	sprintf(filename, "%s/word-assignments.dat", directory);
	FILE* w_asgn_file = fopen(filename, "w");
	for (int d=0; d<pC->num_docs; d++) {

		sparse_coding( &(pC->docs[d]), d, param, theta[d], phi[d] );
		write_word_assignment(w_asgn_file, &(pC->docs[d]), phi[d]);

		nNum ++;
		pC->docs[d].predlabel = predict(theta[d]);
		if ( pC->docs[d].gndlabel == pC->docs[d].predlabel ) nAcc ++;
	}
	fclose(w_asgn_file);
	fclose(lhood_file);

	sprintf(filename,"%s/train.theta",directory);
	save_theta(filename, theta, pC->num_docs, m_nK);

	for (int d=0; d<pC->num_docs; d++) {
		free( theta[d] );
		for (int n=0; n<pC->docs[d].length; n++)
			free( phi[d][n] );
		free( phi[d] );
	}
	free( theta );
	free( phi );

	return nIt;
}