Esempio n. 1
0
/*
 * Reads "billID<TAB>wordID<TAB>freq" records from fp into rows.
 *
 * rows      : table with at least maxRows rows of 3 doubles each.
 * maxRows   : capacity of rows; reading stops once it is reached, so a file
 *             that is longer than expected can no longer overrun the table.
 * wordFirst : non-zero -> each row is stored as {wordID, billID, freq};
 *             zero     -> each row is stored as {billID, wordID, freq}.
 *
 * Reading also stops at the first record that does not yield three integers
 * (end of file or malformed line), so nothing is stored from a failed read.
 *
 * Returns the number of rows stored.
 */
static int loadCounts(FILE *fp, double **rows, int maxRows, int wordFirst)
{
	int billID, wordID, freq;
	int n = 0;

	while (n < maxRows && fscanf(fp, "%d\t%d\t%d\n", &billID, &wordID, &freq) == 3)
	{
		rows[n][0] = wordFirst ? wordID : billID;
		rows[n][1] = wordFirst ? billID : wordID;
		rows[n][2] = freq;
		n++;
	}

	return n;
}

/*
 * Driver for the pLSA experiment.
 *
 * 1. Loads the count records from ../data/Bill_Term and ../data/Term_Bill.
 * 2. Draws NUM_INIT random, normalized initializations of p(z|d) and p(w|z).
 * 3. Runs pLSA for 50 iterations on each initialization and keeps the one
 *    with the largest value returned by calLikelihood.
 * 4. Runs pLSA for up to 450 more iterations on that initialization.
 * 5. Writes p(w|z) and p(z|d) as tab-separated text, one line per topic z.
 *
 * Returns 0 on success, 1 if any of the input/output files cannot be opened.
 */
int main()
{
	enum { NUM_INIT = 10 };		// number of random initializations that are compared

	int i,z,d,w;
	int ml;
	double sum;
	double likelihood[NUM_INIT];
	FILE *fp1 = fopen("../data/Bill_Term", "r");
	FILE *fp2 = fopen("../data/Term_Bill", "r");
	FILE *fpwrite = fopen("../Result/result6479/10topic_NoSeeds_zw", "w");
	FILE *fpwrite2 = fopen("../Result/result6479/10topic_NoSeeds_zd", "w");

	// Refuse to continue with a missing file instead of passing NULL to stdio.
	if (fp1 == NULL)
		fprintf(stderr, "%s\n", "Cannot open ../data/Bill_Term for reading.");
	if (fp2 == NULL)
		fprintf(stderr, "%s\n", "Cannot open ../data/Term_Bill for reading.");
	if (fpwrite == NULL)
		fprintf(stderr, "%s\n", "Cannot open ../Result/result6479/10topic_NoSeeds_zw for writing.");
	if (fpwrite2 == NULL)
		fprintf(stderr, "%s\n", "Cannot open ../Result/result6479/10topic_NoSeeds_zd for writing.");
	if (fp1 == NULL || fp2 == NULL || fpwrite == NULL || fpwrite2 == NULL)
	{
		if (fp1 != NULL) fclose(fp1);
		if (fp2 != NULL) fclose(fp2);
		if (fpwrite != NULL) fclose(fpwrite);
		if (fpwrite2 != NULL) fclose(fpwrite2);
		return 1;
	}

	// allocate the count tables and NUM_INIT copies of each distribution
	double **n_dw1 = dmatrix(N_Bill_Term, 3);
	double **n_dw2 = dmatrix(N_Bill_Term, 3);
	double ***p_z_d = d3matrix(NUM_INIT, K, D);
	double ***p_w_z = d3matrix(NUM_INIT, V, K);

	// load n(d,w): n_dw1 rows are {billID, wordID, freq}, n_dw2 rows are {wordID, billID, freq}
	i = loadCounts(fp1, n_dw1, N_Bill_Term, 0);
	printf("%s%d\n%s%d\n", "i = ", i, "N_Bill_Term = ", N_Bill_Term);
	if (i != N_Bill_Term)
		fprintf(stderr, "%s%d%s%d\n", "Warning: ../data/Bill_Term yielded ", i, " records, expected ", N_Bill_Term);

	i = loadCounts(fp2, n_dw2, N_Bill_Term, 1);
	printf("%s%d\n%s%d\n", "i = ", i, "N_Bill_Term = ", N_Bill_Term);
	if (i != N_Bill_Term)
		fprintf(stderr, "%s%d%s%d\n", "Warning: ../data/Term_Bill yielded ", i, " records, expected ", N_Bill_Term);

	// random p(z|d), normalized so that every column d sums to 1 over z
	srand((unsigned)time(NULL));
	for (i = 0; i < NUM_INIT; i++)
	{
		for (d = 0; d < D; d++)
		{
			for (z = 0; z < K; z++)
				p_z_d[i][z][d] = rand() / (double)RAND_MAX;		// p(z|d)

			// Normalization
			sum = 0;
			for (z = 0; z < K; z++)
				sum += p_z_d[i][z][d];

			if (sum != 0)
				for (z = 0; z < K; z++)
					p_z_d[i][z][d] /= sum;
			else
				printf("%s\n", "Need initialize again.");

			// sanity check of the normalization
			sum = 0;
			for (z = 0; z < K; z++)
				sum += p_z_d[i][z][d];
			if (fabs(sum-1)>0.0001)	printf("%s\n", "abs(sum-1)>0.0001, Need initialize again.");
		}
	}

	// random p(w|z), normalized so that every column z sums to 1 over w
	for (i = 0; i < NUM_INIT; i++)
	{
		for (z = 0; z < K; z++)
		{
			for (w = 0; w < V; w++)
				p_w_z[i][w][z] = rand() / (double)RAND_MAX;		// p(w|z)

			//Normalization
			sum = 0;
			for (w = 0; w < V; w++)
				sum += p_w_z[i][w][z];

			if (sum != 0)
				for (w = 0; w < V; w++)
					p_w_z[i][w][z] /= sum;
			else
				printf("%s\n", "Need initialize again.");

			// sanity check of the normalization
			sum = 0;
			for (w = 0; w < V; w++)
				sum += p_w_z[i][w][z];
			if (fabs(sum-1)>0.0001)	printf("%s\n", "abs(sum-1)>0.0001, Need initialize again.");
		}
	}

	// select the best one, after 50 iterations
	for (i = 0; i < NUM_INIT; i++)
	{
		printf("%s%d\n", "Initialization ", i);
		pLSA(n_dw1, n_dw2, p_w_z[i], p_z_d[i], 50);
		likelihood[i] = calLikelihood(n_dw1, p_z_d[i], p_w_z[i]);
		printf("%s%d%s%f\n", "Likelihood", i, " is ", likelihood[i]);
	}
	ml = 0;
	for (i = 1; i < NUM_INIT; i++)
		if (likelihood[i] > likelihood[ml])
			ml = i;
	printf("%s%d\n", "ml = ", ml);

	// continue training only the winning initialization
	pLSA(n_dw1, n_dw2, p_w_z[ml], p_z_d[ml], 450);


	// output
	printf("%s\n", "Writing to file...");
	// p(w|z): one line per topic z, V tab-separated values; the per-topic sum is echoed to stdout
	printf("%s\n", "p(w|z)");
	for (z = 0; z < K; z++)
	{
		sum = 0;
		for (w = 0; w < V; w++)
		{
			sum += p_w_z[ml][w][z];
			fprintf(fpwrite, "%.12lf", p_w_z[ml][w][z]);
			if (w != V-1)
				fprintf(fpwrite, "%s", "\t");
		}
		printf("%lf\n", sum);
		fprintf(fpwrite, "%s", "\n");
	}

	// p(z|d): one line per topic z, D tab-separated values
	printf("%s\n", "p(z|d)");
	for (z = 0; z < K; z++)
	{
		for (d = 0; d < D; d++)
		{
			fprintf(fpwrite2, "%.12lf", p_z_d[ml][z][d]);
			if (d != D-1)
				fprintf(fpwrite2, "%s", "\t");
		}
		fprintf(fpwrite2, "%s", "\n");
	}

	// clear
	fclose(fp1);
	fclose(fp2);
	fclose(fpwrite);
	fclose(fpwrite2);
	free_dmatrix(n_dw1, N_Bill_Term);
	free_dmatrix(n_dw2, N_Bill_Term);
	free_d3matrix(p_z_d, NUM_INIT, K);
	free_d3matrix(p_w_z, NUM_INIT, V);

	return 0;
}
Esempio n. 2
0
// Runs the 'uqTgaExample' validation cycle: a calibration stage, a validation
// stage and a comparison stage, each made of an inverse problem solved with
// Bayes/Metropolis-Hastings followed by a forward problem solved with Monte Carlo.
//
// env: QUESO environment used to build every space and the cycle. Console
//      progress messages are printed only where env.fullRank() == 0, and
//      env.numSubEnvironments() selects between fixed and sampled initial
//      parameter values as well as whether the unified comparison is run.
//
// If the calibration posterior realizer is not a SequentialVectorRealizer the
// validation stage cannot build its proposal matrix; the routine then reports
// the problem on std::cerr and returns before the validation inverse problem is solved.
void
uqAppl(const QUESO::BaseEnvironment& env)
{
  if (env.fullRank() == 0) {
    std::cout << "Beginning run of 'uqTgaExample' example\n"
              << std::endl;
  }

  //int iRC;
  struct timeval timevalRef;
  struct timeval timevalNow;

  //******************************************************
  // Task 1 of 5: instantiation of basic classes
  //******************************************************

  // Instantiate the parameter space
  std::vector<std::string> paramNames(2,"");
  paramNames[0] = "A_param";
  paramNames[1] = "E_param";
  QUESO::VectorSpace<QUESO::GslVector,QUESO::GslMatrix> paramSpace(env,"param_",paramNames.size(),&paramNames);

  // Instantiate the parameter domain
  QUESO::GslVector paramMinValues(paramSpace.zeroVector());
  paramMinValues[0] = 2.40e+11;
  paramMinValues[1] = 1.80e+05;
  QUESO::GslVector paramMaxValues(paramSpace.zeroVector());
  paramMaxValues[0] = 2.80e+11;
  paramMaxValues[1] = 2.20e+05;
  QUESO::BoxSubset<QUESO::GslVector,QUESO::GslMatrix> paramDomain("param_",
                                        paramSpace,
                                        paramMinValues,
                                        paramMaxValues);

  // Instantiate the qoi space
  std::vector<std::string> qoiNames(1,"");
  qoiNames[0] = "TimeFor25PercentOfMass";
  QUESO::VectorSpace<QUESO::GslVector,QUESO::GslMatrix> qoiSpace(env,"qoi_",qoiNames.size(),&qoiNames);

  // Instantiate the validation cycle
  QUESO::ValidationCycle<QUESO::GslVector,QUESO::GslMatrix,QUESO::GslVector,QUESO::GslMatrix> cycle(env,
                                                "", // No extra prefix
                                                paramSpace,
                                                qoiSpace);

  //********************************************************
  // Task 2 of 5: calibration stage
  //********************************************************

  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'calibration stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  // Inverse problem: instantiate the prior rv
  QUESO::UniformVectorRV<QUESO::GslVector,QUESO::GslMatrix> calPriorRv("cal_prior_", // Extra prefix before the default "rv_" prefix
                                                                       paramDomain);

  // Inverse problem: instantiate the likelihood
  Likelihood<> calLikelihood("cal_like_",
                           paramDomain,
                           "inputData/scenario_5_K_min.dat",
                           "inputData/scenario_25_K_min.dat",
                           "inputData/scenario_50_K_min.dat");


  // Inverse problem: instantiate it (posterior rv is instantiated internally)
  cycle.instantiateCalIP(NULL,
                         calPriorRv,
                         calLikelihood);

  // Inverse problem: solve it, that is, set 'pdf' and 'realizer' of the posterior rv
  QUESO::GslVector paramInitialValues(paramSpace.zeroVector());
  if (env.numSubEnvironments() == 1) {
    // For regression test purposes
    paramInitialValues[0] = 2.41e+11;
    paramInitialValues[1] = 2.19e+05;
  }
  else {
    calPriorRv.realizer().realization(paramInitialValues);
  }

  // The proposal matrix is heap-allocated by newProposalMatrix() and released right after the solve
  QUESO::GslMatrix* calProposalCovMatrix = cycle.calIP().postRv().imageSet().vectorSpace().newProposalMatrix(NULL,&paramInitialValues);
  cycle.calIP().solveWithBayesMetropolisHastings(NULL,
                                                 paramInitialValues,
                                                 calProposalCovMatrix);
  delete calProposalCovMatrix;

  // Forward problem: instantiate it (parameter rv = posterior rv of inverse problem; qoi rv is instantiated internally)
  double beta_prediction         = 250.;
  double criticalMass_prediction = 0.;
  double criticalTime_prediction = 3.9;

  qoiRoutine_Data calQoiRoutine_Data;
  calQoiRoutine_Data.m_beta         = beta_prediction;
  calQoiRoutine_Data.m_criticalMass = criticalMass_prediction;
  calQoiRoutine_Data.m_criticalTime = criticalTime_prediction;

  cycle.instantiateCalFP(NULL,
                         qoiRoutine,
                         (void *) &calQoiRoutine_Data);

  // Forward problem: solve it, that is, set 'realizer' and 'cdf' of the qoi rv
  cycle.calFP().solveWithMonteCarlo(NULL); // no extra user entities needed for Monte Carlo algorithm

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'calibration stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'calibration stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //********************************************************
  // Task 3 of 5: validation stage
  //********************************************************
  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'validation stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  // Inverse problem: no need to instantiate the prior rv (= posterior rv of calibration inverse problem)

  // Inverse problem: instantiate the likelihood function object
  Likelihood<> valLikelihood("val_like_",
                             paramDomain,
                             "inputData/scenario_100_K_min.dat",
                             NULL,
                             NULL);

  // Inverse problem: instantiate it (posterior rv is instantiated internally)
  cycle.instantiateValIP(NULL,valLikelihood);

  // Inverse problem: solve it, that is, set 'pdf' and 'realizer' of the posterior rv
  const QUESO::SequentialVectorRealizer<QUESO::GslVector,QUESO::GslMatrix>* tmpRealizer = dynamic_cast< const QUESO::SequentialVectorRealizer<QUESO::GslVector,QUESO::GslMatrix>* >(&(cycle.calIP().postRv().realizer()));
  if (tmpRealizer == NULL) {
    // dynamic_cast on a pointer yields NULL when the realizer has another dynamic type;
    // dereferencing it below would be undefined behavior, so stop here instead.
    std::cerr << "In uqAppl(): realizer of the calibration posterior rv is not a SequentialVectorRealizer; "
              << "cannot build the proposal matrix for the validation stage"
              << std::endl;
    return;
  }
  QUESO::GslMatrix* valProposalCovMatrix = cycle.calIP().postRv().imageSet().vectorSpace().newProposalMatrix(&tmpRealizer->unifiedSampleVarVector(),  // Use 'realizer()' because post. rv was computed with MH
                                                                                                             &tmpRealizer->unifiedSampleExpVector()); // Use these values as the initial values
  cycle.valIP().solveWithBayesMetropolisHastings(NULL,
                                                 tmpRealizer->unifiedSampleExpVector(),
                                                 valProposalCovMatrix);
  delete valProposalCovMatrix;

  // Forward problem: instantiate it (parameter rv = posterior rv of inverse problem; qoi rv is instantiated internally)
  qoiRoutine_Data valQoiRoutine_Data;
  valQoiRoutine_Data.m_beta         = beta_prediction;
  valQoiRoutine_Data.m_criticalMass = criticalMass_prediction;
  valQoiRoutine_Data.m_criticalTime = criticalTime_prediction;

  cycle.instantiateValFP(NULL,
                         qoiRoutine,
                         (void *) &valQoiRoutine_Data);

  // Forward problem: solve it, that is, set 'realizer' and 'cdf' of the qoi rv
  cycle.valFP().solveWithMonteCarlo(NULL); // no extra user entities needed for Monte Carlo algorithm

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'validation stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'validation stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //********************************************************
  // Task 4 of 5: comparison stage
  //********************************************************

  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'comparison stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  // The unified comparison is only run when there is more than one sub-environment
  uqAppl_LocalComparisonStage(cycle);
  if (env.numSubEnvironments() > 1) {
    uqAppl_UnifiedComparisonStage(cycle);
  }

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'comparison stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'comparison stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //******************************************************
  // Task 5 of 5: leaving the routine. Both proposal matrices were already
  // deleted above; the remaining objects are automatic variables.
  //******************************************************

  if (env.fullRank() == 0) {
    std::cout << "Finishing run of 'uqTgaExample' example"
              << std::endl;
  }

  return;
}
Esempio n. 3
0
/*
 * Runs up to iter EM iterations of pLSA, updating p_w_z and p_z_d in place.
 *
 * n_dw1 : N_Bill_Term rows of {d, w, n(d,w)}; scanned document by document.
 * n_dw2 : N_Bill_Term rows of {w, d, n(d,w)}; scanned word by word.
 * p_w_z : V x K matrix, p_w_z[w][z] = p(w|z); read and overwritten.
 * p_z_d : K x D matrix, p_z_d[z][d] = p(z|d); read and overwritten.
 * iter  : maximum number of EM iterations.
 *
 * The scans walk each table once while the key counts up from 0, so they rely
 * on the rows being grouped by ascending 0-based key (d for n_dw1, w for
 * n_dw2) -- NOTE(review): confirm against the data files.
 *
 * On iterations 0, 10, 20, ... the likelihood is evaluated, printed and
 * appended to ../Result/result6479/likelihood_NoSeeds; when its relative change
 * drops below 1e-5 (and step > 1) the loop stops early. The work matrices are
 * released on every exit path.
 */
void pLSA(double **n_dw1, double **n_dw2, double **p_w_z, double **p_z_d, int iter)
{
	int i,d,z,w;
	int step, startCount, endCount;
	int d_count, w_count;
	double sum;
	double freq;

	// Scratch matrices: during the E-step they hold p(z|d,w) for the word /
	// document being processed, during the M-step the new p(w|z) / p(z|d).
	double **temp_p_w_z = dmatrix(V, K);
	double **temp_p_z_d = dmatrix(K, D);
	double **sum_w = dmatrix(K, D);			// sum_{w}{p(z|d,w)*n(d,w)}
	double **sum_d = dmatrix(K, V);			// sum_{d}{p(z|d,w)*n(d,w)}

	double likelihood[2] = {-1,-1};		// old, new: for early stopping

	for (step = 0; step < iter; step++)
	{
		printf("%d%s%d\n", step+1, "/", iter);

		// E-step(1): sum_{d}{p(z|d,w)*n(d,w)} for every word w
		startCount = 0;
		endCount = 0;
		for (w = 0; w < V; w++)
		{
			// Rows [startCount, endCount) of n_dw2 are the ones that belong to word w.
			// The bound is tested before the row is read, so words that follow the last
			// record simply get an empty range and a zero sum instead of a stale one.
			while (endCount < N_Bill_Term && (int)(n_dw2[endCount][0]) == w)
				endCount += 1;

			if (endCount == startCount)
				printf("%s%d%s\n", "Word ", w, " doesn't appear in any bill");

			d_count = endCount - startCount;

			for (i = 0; i < d_count; i++)
			{
				d = (int)(n_dw2[startCount+i][1]);

				for (z = 0; z < K; z++)
					// actually temp_p_z_d is p(z|d,w), but we omit w in order to save space
					temp_p_z_d[z][d] = p_w_z[w][z] * p_z_d[z][d];

				sum = 0;
				for (z = 0; z < K; z++)
					sum += temp_p_z_d[z][d];

				if (sum != 0)
					for (z = 0; z < K; z++)
						temp_p_z_d[z][d] /= sum;
				else
					printf("%s\n", "sum = 0 in E-step(1)!");
			}

			for (z = 0; z < K; z++)
			{
				sum_d[z][w] = 0;
				for (i = 0; i < d_count; i++)
				{
					d = (int)(n_dw2[startCount+i][1]);
					freq = n_dw2[startCount+i][2];
					sum_d[z][w] += temp_p_z_d[z][d] * freq;		// sum_{d}{p(z|d,w)*n(d,w)}
				}
			}

			startCount = endCount;
		}

		// E-step(2): sum_{w}{p(z|d,w)*n(d,w)} for every document d
		startCount = 0;
		endCount = 0;
		for (d = 0; d < D; d++)
		{
			// Rows [startCount, endCount) of n_dw1 are the ones that belong to document d.
			while (endCount < N_Bill_Term && (int)(n_dw1[endCount][0]) == d)
				endCount += 1;

			w_count = endCount - startCount;

			for (i = 0; i < w_count; i++)
			{
				w = (int)(n_dw1[startCount+i][1]);

				for (z = 0; z < K; z++)
					// actually temp_p_w_z is p(z|d,w), but we omit d in order to save space
					temp_p_w_z[w][z] = p_w_z[w][z] * p_z_d[z][d];

				sum = 0;
				for (z = 0; z < K; z++)
					sum += temp_p_w_z[w][z];

				if (sum != 0)
					for (z = 0; z < K; z++)
						temp_p_w_z[w][z] /= sum;
				else
					printf("%s\n", "sum = 0 in E-step(2)!");
			}

			for (z = 0; z < K; z++)
			{
				sum_w[z][d] = 0;
				for (i = 0; i < w_count; i++)
				{
					w = (int)(n_dw1[startCount+i][1]);
					freq = n_dw1[startCount+i][2];
					sum_w[z][d] += temp_p_w_z[w][z] * freq;		// sum_{w}{p(z|d,w)*n(d,w)}
				}
			}

			startCount = endCount;
		}

		// M-step(1): update p(z|d)
		for (d = 0; d < D; d++)
		{
			for (z = 0; z < K; z++)
				temp_p_z_d[z][d] = sum_w[z][d];

			// Normalize
			sum = 0;			// denominator
			for (z = 0; z < K; z++)
				sum += temp_p_z_d[z][d];
			if (sum != 0)
				for (z = 0; z < K; z++)
					temp_p_z_d[z][d] /= sum;
			else
				printf("%s\n", "sum = 0 in M-step(1)!");
		}

		// M-step(2): update p(w|z)
		// can add seeds here
		for (z = 0; z < K; z++)
		{
			for (w = 0; w < V; w++)
				temp_p_w_z[w][z] = sum_d[z][w];

			// Normalize
			sum = 0;			// denominator
			for (w = 0; w < V; w++)
				sum += temp_p_w_z[w][z];
			if (sum != 0)
				for (w = 0; w < V; w++)
					temp_p_w_z[w][z] /= sum;
			else
				printf("%s\n", "sum = 0 in M-step(2)!");
		}

		// update p(z|d) and p(w|z)
		for (z = 0; z < K; z++)
		{
			for (d = 0; d < D; d++)
				p_z_d[z][d] = temp_p_z_d[z][d];
			for (w = 0; w < V; w++)
				p_w_z[w][z] = temp_p_w_z[w][z];
		}

		// examine whether sum(p(z|d), z) = 1 and sum(p(w|z), w) = 1
		for (d = 0; d < D; d++)
		{
			sum = 0;
			for (z = 0; z < K; z++)
				sum += p_z_d[z][d];
			if (fabs(sum-1) > 0.0001)
				printf("%s\n", "p(z|d) abs(sum-1)>0.0001!");
		}
		for (z = 0; z < K; z++)
		{
			sum = 0;
			for (w = 0; w < V; w++)
				sum += p_w_z[w][z];
			if (fabs(sum-1) > 0.0001)
				printf("%s\n", "p(w|z) abs(sum-1)>0.0001!");
		}

		// early stopping, evaluated every 10th iteration
		if (step%10 == 0)
		{
			likelihood[1] = calLikelihood(n_dw1, p_z_d, p_w_z);
			printf("%s%f\n%f\n", "likelihoods: (old/new)\n", likelihood[0], likelihood[1]);
			double rate = fabs((likelihood[1]-likelihood[0])/likelihood[0]);
			printf("%s%f\n", "rate = ", rate);
			if (rate < 1e-5 && step > 1)
			{
				printf("%s\n", "early stop");
				break;		// leave through the common cleanup below instead of leaking the work matrices
			}
			likelihood[0] = likelihood[1];

			FILE *fp = fopen("../Result/result6479/likelihood_NoSeeds", "a");
			if (fp != NULL)
			{
				fprintf(fp, "%lf\t%lf\n", likelihood[1], rate);
				fclose(fp);
			}
			else
				fprintf(stderr, "%s\n", "Cannot open ../Result/result6479/likelihood_NoSeeds for appending.");
		}

	}

	free_dmatrix(temp_p_w_z, V);
	free_dmatrix(temp_p_z_d, K);
	free_dmatrix(sum_w, K);
	free_dmatrix(sum_d, K);

	return;
}