コード例 #1
0
ファイル: lda-model.cpp プロジェクト: wangchaohui/LDA
/*
 * Load an LDA model from two text files derived from model_root:
 *
 *   "<model_root>.other"  three lines: "num_topics %d", "num_terms %d",
 *                         "alpha %f"
 *   "<model_root>.beta"   num_topics * num_terms floats, read topic by
 *                         topic into model->log_prob_w[topic][term]
 *
 * Returns the model created by new_lda_model(num_terms, num_topics), or
 * NULL (after printing a diagnostic to stderr) when a path does not fit
 * the filename buffer, a file cannot be opened, or its contents cannot
 * be parsed.
 */
lda_model* load_lda_model(char* model_root)
{
    char filename[100];
    FILE* fileptr;
    int i, j, num_terms, num_topics;
    int len;
    float x, alpha;
    lda_model* model;

    /* ".other" is the longer of the two suffixes, so if this path fits,
       the ".beta" path built further down fits as well. */
    len = snprintf(filename, sizeof filename, "%s.other", model_root);
    if (len < 0 || (size_t)len >= sizeof filename)
    {
        fprintf(stderr, "load_lda_model: model path too long: %s\n", model_root);
        return(NULL);
    }

    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    if (fileptr == NULL)
    {
        fprintf(stderr, "load_lda_model: cannot open %s\n", filename);
        return(NULL);
    }
    /* Each fscanf must convert exactly one value; otherwise the counts
       and alpha would be used uninitialised. */
    if (fscanf(fileptr, "num_topics %d\n", &num_topics) != 1 ||
        fscanf(fileptr, "num_terms %d\n", &num_terms) != 1 ||
        fscanf(fileptr, "alpha %f\n", &alpha) != 1)
    {
        fprintf(stderr, "load_lda_model: malformed header in %s\n", filename);
        fclose(fileptr);
        return(NULL);
    }
    fclose(fileptr);

    if (num_topics < 0 || num_terms < 0)
    {
        fprintf(stderr, "load_lda_model: negative dimensions in %s\n", filename);
        return(NULL);
    }

    model = new_lda_model(num_terms, num_topics);
    if (model == NULL)
    {
        fprintf(stderr, "load_lda_model: could not allocate model\n");
        return(NULL);
    }
    model->alpha = alpha;

    /* NOTE(review): the error paths below return NULL without releasing
       `model`, because the project's model destructor is not visible
       from this function. Call it here once confirmed. */
    snprintf(filename, sizeof filename, "%s.beta", model_root);
    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    if (fileptr == NULL)
    {
        fprintf(stderr, "load_lda_model: cannot open %s\n", filename);
        return(NULL);
    }
    for (i = 0; i < num_topics; i++)
    {
        for (j = 0; j < num_terms; j++)
        {
            if (fscanf(fileptr, "%f", &x) != 1)
            {
                fprintf(stderr,
                        "load_lda_model: %s is truncated or malformed at topic %d, term %d\n",
                        filename, i, j);
                fclose(fileptr);
                return(NULL);
            }
            model->log_prob_w[i][j] = x;
        }
    }
    fclose(fileptr);
    return(model);
}
コード例 #2
0
/*
 * Create a model with new_lda_model(ntopics, nterms) and fill it from
 * two files derived from name: "<name>.beta" is read into model->topics
 * via mtx_fscanf, and "<name>.alpha" into model->alpha via vct_fscanf.
 *
 * Returns the model, or NULL (with a diagnostic on stderr) when the
 * derived paths do not fit the internal filename buffer. The length is
 * checked before the model is allocated so nothing is leaked on that
 * path.
 */
lda* read_lda(int ntopics, int nterms, char* name) {
    char filename[400];
    lda* model;
    int len;

    /* ".alpha" is the longer suffix; if it fits, ".beta" fits too. */
    len = snprintf(filename, sizeof filename, "%s.alpha", name);
    if (len < 0 || (size_t)len >= sizeof filename) {
        fprintf(stderr, "read_lda: model path too long: %s\n", name);
        return(NULL);
    }

    model = new_lda_model(ntopics, nterms);

    snprintf(filename, sizeof filename, "%s.beta", name);
    mtx_fscanf(filename, model->topics);
    snprintf(filename, sizeof filename, "%s.alpha", name);
    vct_fscanf(filename, model->alpha);

    return(model);
}
コード例 #3
0
ファイル: lda-inference.c プロジェクト: taf2/lda-ruby
/*
 * Run variational EM over `corpus` without writing any output files.
 *
 * `start` selects how the model is initialised (prefix match):
 *   "seeded"  corpus_initialize_ss (quiet variant unless VERBOSE)
 *   "fixed"   corpus_initialize_fixed_ss
 *   "random"  random_initialize_ss
 *   other     treated as a model root and passed to load_lda_model
 *
 * The results are published through the globals last_model, last_gamma
 * (one row of NTOPICS doubles per document) and last_phi (one row of
 * NTOPICS doubles per word position up to max_corpus_length(corpus)),
 * and model_loaded is set to TRUE. The sufficient statistics are
 * released before returning.
 *
 * If a buffer cannot be allocated or the model cannot be loaded, a
 * diagnostic is printed to stderr, the buffers allocated here are freed
 * and the globals are left untouched.
 */
void run_quiet_em(char* start, corpus* corpus) {
	int d = 0, n = 0, i = 0;
	int max_length = 0;
	int seeded = 0, fixed = 0, random_start = 0;
	/* Never ask calloc for zero elements: a NULL result then always
	   means failure rather than an implementation-defined empty block. */
	size_t ntopics = NTOPICS > 0 ? (size_t)NTOPICS : 1;
	lda_model *model = NULL;
	lda_suffstats *ss = NULL;
	double **var_gamma = NULL, **phi = NULL;
	double likelihood = 0.0, likelihood_old = 0, converged = 1;

	// allocate variational parameters (zero-filled)

	var_gamma = calloc(corpus->num_docs > 0 ? (size_t)corpus->num_docs : 1,
	                   sizeof *var_gamma);
	if (var_gamma == NULL) {
		fprintf(stderr, "run_quiet_em: out of memory\n");
		goto fail;
	}
	for (d = 0; d < corpus->num_docs; ++d) {
		var_gamma[d] = calloc(ntopics, sizeof **var_gamma);
		if (var_gamma[d] == NULL) {
			fprintf(stderr, "run_quiet_em: out of memory\n");
			goto fail;
		}
	}

	max_length = max_corpus_length(corpus);

	phi = calloc(max_length > 0 ? (size_t)max_length : 1, sizeof *phi);
	if (phi == NULL) {
		fprintf(stderr, "run_quiet_em: out of memory\n");
		goto fail;
	}
	for (n = 0; n < max_length; ++n) {
		phi[n] = calloc(ntopics, sizeof **phi);
		if (phi[n] == NULL) {
			fprintf(stderr, "run_quiet_em: out of memory\n");
			goto fail;
		}
	}

	// initialize model

	seeded = (strncmp(start, "seeded", 6) == 0);
	fixed = !seeded && (strncmp(start, "fixed", 5) == 0);
	random_start = !seeded && !fixed && (strncmp(start, "random", 6) == 0);

	if (seeded || fixed || random_start) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		model->alpha = INITIAL_ALPHA;
		ss = new_lda_suffstats(model);

		if (seeded) {
			if (VERBOSE) {
				corpus_initialize_ss(ss, model, corpus);
			} else {
				quiet_corpus_initialize_ss(ss, model, corpus);
			}
		} else if (fixed) {
			corpus_initialize_fixed_ss(ss, model, corpus);
		} else {
			random_initialize_ss(ss, model);
		}

		// initial M-step from the starting statistics; alpha is not estimated here
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
	} else {
		model = load_lda_model(start);
		if (model == NULL) {
			fprintf(stderr, "run_quiet_em: could not load model '%s'\n", start);
			goto fail;
		}
		ss = new_lda_suffstats(model);
	}

	// save the model in the last_model global
	last_model = model;
	model_loaded = TRUE;

	// run expectation maximization: at least three iterations, then until the
	// relative change is within [0, EM_CONVERGED] or EM_MAX_ITER is exceeded

	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
		i++;
		if (VERBOSE) printf("**** em iteration %d ****\n", i);
		likelihood = 0;
		zero_initialize_ss(ss, model);

		// e-step

		for (d = 0; d < corpus->num_docs; d++) {
			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
		}

		// m-step
		if (VERBOSE) {
			lda_mle(model, ss, ESTIMATE_ALPHA);
		} else {
			quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
		}

		// check for convergence
		// (likelihood_old is 0 on the first pass, so the ratio is inf/NaN
		// there; the i <= 2 clause keeps the loop going regardless)

		converged = (likelihood_old - likelihood) / (likelihood_old);
		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
		likelihood_old = likelihood;

		// store model and likelihood

		last_model = model;
		last_gamma = var_gamma;
		last_phi = phi;
	}

	// output the final model
	// NOTE(review): any buffers a previous run left in last_gamma/last_phi
	// are overwritten here without being freed — confirm who owns them.

	last_model = model;
	last_gamma = var_gamma;
	last_phi = phi;

	free_lda_suffstats(model,ss);
	return;

fail:
	// rows that were never allocated are still NULL thanks to calloc,
	// and free(NULL) is a no-op
	if (var_gamma != NULL) {
		for (d = 0; d < corpus->num_docs; ++d) free(var_gamma[d]);
		free(var_gamma);
	}
	if (phi != NULL) {
		for (n = 0; n < max_length; ++n) free(phi[n]);
		free(phi);
	}
}
コード例 #4
0
ファイル: lda-inference.c プロジェクト: taf2/lda-ruby
/*
 * Run variational EM over `corpus`, writing results under `directory`.
 *
 * `start` selects how the model is initialised (exact match):
 *   "seeded"  corpus_initialize_ss
 *   "random"  random_initialize_ss
 *   other     treated as a model root and passed to load_lda_model
 *
 * Files written (all paths are "<directory>/<name>"):
 *   000                  the initial model
 *   likelihood.dat       one "likelihood<TAB>converged" line per iteration
 *   NNN, NNN.gamma       model and gamma every LAG iterations
 *   final, final.gamma   model and gamma after the last iteration
 *   word-assignments.dat per-document word assignments from a final
 *                        lda_inference pass (documents for which the
 *                        pass reports an error are skipped)
 *
 * Paths are formatted with snprintf, so an over-long `directory` is
 * truncated instead of overrunning the filename buffer. If an output
 * file cannot be opened a warning is printed to stderr and that output
 * is skipped. The gamma and phi work buffers are freed before returning.
 */
void run_em(char* start, char* directory, corpus* corpus) {
	int d, n, i = 0;
	int max_length = 0;
	/* Never ask calloc for zero elements: a NULL result then always
	   means failure rather than an implementation-defined empty block. */
	size_t ntopics = NTOPICS > 0 ? (size_t)NTOPICS : 1;
	lda_model *model = NULL;
	lda_suffstats *ss = NULL;
	double **var_gamma = NULL, **phi = NULL;
	double likelihood = 0, likelihood_old = 0, converged = 1;
	double tl = 0.0;
	short error = 0;
	char filename[100];
	FILE *likelihood_file = NULL;
	FILE *w_asgn_file = NULL;

	// allocate variational parameters

	var_gamma = calloc(corpus->num_docs > 0 ? (size_t)corpus->num_docs : 1,
	                   sizeof *var_gamma);
	if (var_gamma == NULL) {
		fprintf(stderr, "run_em: out of memory\n");
		goto cleanup;
	}
	for (d = 0; d < corpus->num_docs; d++) {
		var_gamma[d] = calloc(ntopics, sizeof **var_gamma);
		if (var_gamma[d] == NULL) {
			fprintf(stderr, "run_em: out of memory\n");
			goto cleanup;
		}
	}

	max_length = max_corpus_length(corpus);
	phi = calloc(max_length > 0 ? (size_t)max_length : 1, sizeof *phi);
	if (phi == NULL) {
		fprintf(stderr, "run_em: out of memory\n");
		goto cleanup;
	}
	for (n = 0; n < max_length; n++) {
		phi[n] = calloc(ntopics, sizeof **phi);
		if (phi[n] == NULL) {
			fprintf(stderr, "run_em: out of memory\n");
			goto cleanup;
		}
	}

	// initialize model

	if (strcmp(start, "seeded")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		corpus_initialize_ss(ss, model, corpus);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else if (strcmp(start, "random")==0) {
		model = new_lda_model(corpus->num_terms, NTOPICS);
		ss = new_lda_suffstats(model);
		random_initialize_ss(ss, model);
		if (VERBOSE) {
			lda_mle(model, ss, 0);
		} else {
			quiet_lda_mle(model, ss, 0);
		}
		model->alpha = INITIAL_ALPHA;
	} else {
		model = load_lda_model(start);
		if (model == NULL) {
			fprintf(stderr, "run_em: could not load model '%s'\n", start);
			goto cleanup;
		}
		ss = new_lda_suffstats(model);
	}

	snprintf(filename, sizeof filename, "%s/000", directory);
	save_lda_model(model, filename);

	// run expectation maximization

	snprintf(filename, sizeof filename, "%s/likelihood.dat", directory);
	likelihood_file = fopen(filename, "w");
	if (likelihood_file == NULL) {
		fprintf(stderr, "run_em: cannot open %s; likelihood will not be logged\n", filename);
	}

	// at least three iterations, then until the relative change is within
	// [0, EM_CONVERGED] or EM_MAX_ITER is exceeded
	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
		i++;
		if (VERBOSE)
			printf("**** em iteration %d ****\n", i);
		likelihood = 0;
		zero_initialize_ss(ss, model);

		// e-step
		printf("e-step\n");

		for (d = 0; d < corpus->num_docs; d++) {
			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
		}
		printf("m-step\n");

		// m-step
		if (VERBOSE) {
			lda_mle(model, ss, ESTIMATE_ALPHA);
		} else {
			quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
		}

		// check for convergence
		// (likelihood_old is 0 on the first pass, so the ratio is inf/NaN
		// there; the i <= 2 clause keeps the loop going regardless)
		converged = (likelihood_old - likelihood) / (likelihood_old);
		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
		likelihood_old = likelihood;

		// output model and likelihood

		if (likelihood_file != NULL) {
			fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
			fflush(likelihood_file);
		}
		if ((i % LAG) == 0)
		{
			snprintf(filename, sizeof filename, "%s/%03d", directory, i);
			save_lda_model(model, filename);
			snprintf(filename, sizeof filename, "%s/%03d.gamma", directory, i);
			save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
		}
	}

	// output the final model

	snprintf(filename, sizeof filename, "%s/final", directory);
	save_lda_model(model, filename);
	snprintf(filename, sizeof filename, "%s/final.gamma", directory);
	save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

	// output the word assignments (for visualization)

	snprintf(filename, sizeof filename, "%s/word-assignments.dat", directory);
	w_asgn_file = fopen(filename, "w");
	if (w_asgn_file == NULL) {
		fprintf(stderr, "run_em: cannot open %s; word assignments not written\n", filename);
	} else {
		for (d = 0; d < corpus->num_docs; d++)
		{
			if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
			error = 0;
			tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
			if( error ) { continue; }
			likelihood += tl;
			write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
		}
		fclose(w_asgn_file);
	}
	if (likelihood_file != NULL) {
		fclose(likelihood_file);
	}

cleanup:
	// The work buffers are local to this function, so release them here.
	// Rows that were never allocated are still NULL thanks to calloc.
	// NOTE(review): model and ss are not released, as in the original —
	// confirm whether the caller expects them to outlive this call.
	if (var_gamma != NULL) {
		for (d = 0; d < corpus->num_docs; d++) free(var_gamma[d]);
		free(var_gamma);
	}
	if (phi != NULL) {
		for (n = 0; n < max_length; n++) free(phi[n]);
		free(phi);
	}
}