int main() { int i,z,d,w; int billID, wordID, freq; double sum; double likelihood[10]; FILE *fp1 = fopen("../data/Bill_Term", "r"); FILE *fp2 = fopen("../data/Term_Bill", "r"); FILE *fpwrite = fopen("../Result/result6479/10topic_NoSeeds_zw", "w"); FILE *fpwrite2 = fopen("../Result/result6479/10topic_NoSeeds_zd", "w"); // initialize double **n_dw1 = dmatrix(N_Bill_Term, 3); double **n_dw2 = dmatrix(N_Bill_Term, 3); double ***p_z_d = d3matrix(10, K, D); double ***p_w_z = d3matrix(10, V, K); // initialize n(d,w), p(z|d), p(w|z) i = 0; while (!feof(fp1)) { fscanf(fp1, "%d\t%d\t%d\n", &billID, &wordID, &freq); // n_dw[billID-1][wordID-1] = freq; n_dw1[i][0] = billID; n_dw1[i][1] = wordID; n_dw1[i][2] = freq; i++; } printf("%s%d\n%s%d\n", "i = ", i, "N_Bill_Term = ", N_Bill_Term); i = 0; while (!feof(fp2)) { fscanf(fp2, "%d\t%d\t%d\n", &billID, &wordID, &freq); n_dw2[i][0] = wordID; n_dw2[i][1] = billID; n_dw2[i][2] = freq; i++; } printf("%s%d\n%s%d\n", "i = ", i, "N_Bill_Term = ", N_Bill_Term); srand((unsigned)time(NULL)); for (i = 0; i < 10; i++) { for (d = 0; d < D; d++) { for (z = 0; z < K; z++) p_z_d[i][z][d] = rand() / (double)RAND_MAX; // p(z|d) // Normalization sum = 0; for (z = 0; z < K; z++) sum += p_z_d[i][z][d]; if (sum != 0) for (z = 0; z < K; z++) p_z_d[i][z][d] /= sum; else printf("%s\n", "Need initialize again."); sum = 0; for (z = 0; z < K; z++) sum += p_z_d[i][z][d]; if (fabs(sum-1)>0.0001) printf("%s\n", "abs(sum-1)>0.0001, Need initialize again."); } } for (i = 0; i < 10; i++) { for (z = 0; z < K; z++) { for (w = 0; w < V; w++) p_w_z[i][w][z] = rand() / (double)RAND_MAX; // p(w|z) //Normalization sum = 0; for (w = 0; w < V; w++) sum += p_w_z[i][w][z]; if (sum != 0) for (w = 0; w < V; w++) p_w_z[i][w][z] /= sum; else printf("%s\n", "Need initialize again."); sum = 0; for (w = 0; w < V; w++) sum += p_w_z[i][w][z]; if (fabs(sum-1)>0.0001) printf("%s\n", "abs(sum-1)>0.0001, Need initialize again."); } } // select the best one, after 50 iterations for (i = 
0; i < 10; i++) { printf("%s%d\n", "Initialization ", i); pLSA(n_dw1, n_dw2, p_w_z[i], p_z_d[i], 50); likelihood[i] = calLikelihood(n_dw1, p_z_d[i], p_w_z[i]); printf("%s%d%s%f\n", "Likelihood", i, " is ", likelihood[i]); } int ml = 0; for (i = 1; i < 10; i++) if (likelihood[i] > likelihood[ml]) ml = i; printf("%s%d\n", "ml = ", ml); pLSA(n_dw1, n_dw2, p_w_z[ml], p_z_d[ml], 450); // output printf("%s\n", "Writing to file..."); // d: person, z: topic, w: bill // topic-word printf("%s\n", "p(w|z)"); for (z = 0; z < K; z++) { sum = 0; for (w = 0; w < V; w++) { sum += p_w_z[ml][w][z]; fprintf(fpwrite, "%.12lf", p_w_z[ml][w][z]); if (w != V-1) fprintf(fpwrite, "%s", "\t"); } printf("%lf\n", sum); fprintf(fpwrite, "%s", "\n"); } // topic-bill printf("%s\n", "p(z|d)"); for (z = 0; z < K; z++) { for (d = 0; d < D; d++) { fprintf(fpwrite2, "%.12lf", p_z_d[ml][z][d]); if (d != D-1) fprintf(fpwrite2, "%s", "\t"); } fprintf(fpwrite2, "%s", "\n"); } // clear fclose(fp1); fclose(fp2); fclose(fpwrite); fclose(fpwrite2); free_dmatrix(n_dw1, N_Bill_Term); free_dmatrix(n_dw2, N_Bill_Term); free_d3matrix(p_z_d, 10, K); free_d3matrix(p_w_z, 10, V); return 0; }
/**
 * Runs the 'uqTgaExample' validation cycle on the given QUESO environment.
 *
 * The routine proceeds in the stages announced by its banner comments:
 * instantiation of the parameter/qoi spaces and the validation cycle, a
 * calibration stage (inverse problem solved with Metropolis-Hastings, then a
 * Monte Carlo forward problem), a validation stage of the same shape, and a
 * comparison stage.  Progress and wall-clock timings are printed by the
 * process whose fullRank() is 0.
 *
 * @param env QUESO environment the example runs in.
 */
void uqAppl(const QUESO::BaseEnvironment& env)
{
  if (env.fullRank() == 0) {
    std::cout << "Beginning run of 'uqTgaExample' example\n"
              << std::endl;
  }

  //int iRC;
  struct timeval timevalRef;
  struct timeval timevalNow;

  //******************************************************
  // Task 1 of 5: instantiation of basic classes
  //******************************************************

  // Instantiate the parameter space
  std::vector<std::string> paramNames(2,"");
  paramNames[0] = "A_param";
  paramNames[1] = "E_param";
  QUESO::VectorSpace<QUESO::GslVector,QUESO::GslMatrix> paramSpace(env,"param_",paramNames.size(),&paramNames);

  // Instantiate the parameter domain
  QUESO::GslVector paramMinValues(paramSpace.zeroVector());
  paramMinValues[0] = 2.40e+11;
  paramMinValues[1] = 1.80e+05;
  QUESO::GslVector paramMaxValues(paramSpace.zeroVector());
  paramMaxValues[0] = 2.80e+11;
  paramMaxValues[1] = 2.20e+05;
  QUESO::BoxSubset<QUESO::GslVector,QUESO::GslMatrix> paramDomain("param_",
                                                                  paramSpace,
                                                                  paramMinValues,
                                                                  paramMaxValues);

  // Instantiate the qoi space
  std::vector<std::string> qoiNames(1,"");
  qoiNames[0] = "TimeFor25PercentOfMass";
  QUESO::VectorSpace<QUESO::GslVector,QUESO::GslMatrix> qoiSpace(env,"qoi_",qoiNames.size(),&qoiNames);

  // Instantiate the validation cycle
  QUESO::ValidationCycle<QUESO::GslVector,QUESO::GslMatrix,QUESO::GslVector,QUESO::GslMatrix> cycle(env,
                                                                                                    "", // No extra prefix
                                                                                                    paramSpace,
                                                                                                    qoiSpace);

  //********************************************************
  // Task 2 of 5: calibration stage
  //********************************************************

  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'calibration stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  // Inverse problem: instantiate the prior rv
  QUESO::UniformVectorRV<QUESO::GslVector,QUESO::GslMatrix> calPriorRv("cal_prior_", // Extra prefix before the default "rv_" prefix
                                                                       paramDomain);

  // Inverse problem: instantiate the likelihood
  Likelihood<> calLikelihood("cal_like_",
                             paramDomain,
                             "inputData/scenario_5_K_min.dat",
                             "inputData/scenario_25_K_min.dat",
                             "inputData/scenario_50_K_min.dat");

  // Inverse problem: instantiate it (posterior rv is instantiated internally)
  cycle.instantiateCalIP(NULL,
                         calPriorRv,
                         calLikelihood);

  // Inverse problem: solve it, that is, set 'pdf' and 'realizer' of the posterior rv
  QUESO::GslVector paramInitialValues(paramSpace.zeroVector());
  if (env.numSubEnvironments() == 1) {
    // For regression test purposes
    paramInitialValues[0] = 2.41e+11;
    paramInitialValues[1] = 2.19e+05;
  }
  else {
    calPriorRv.realizer().realization(paramInitialValues);
  }

  // The proposal matrix is heap-allocated by newProposalMatrix() and released
  // right after the solve below.
  QUESO::GslMatrix* calProposalCovMatrix = cycle.calIP().postRv().imageSet().vectorSpace().newProposalMatrix(NULL,&paramInitialValues);
  cycle.calIP().solveWithBayesMetropolisHastings(NULL,
                                                 paramInitialValues,
                                                 calProposalCovMatrix);
  delete calProposalCovMatrix;

  // Forward problem: instantiate it (parameter rv = posterior rv of inverse problem; qoi rv is instantiated internally)
  double beta_prediction         = 250.;
  double criticalMass_prediction = 0.;
  double criticalTime_prediction = 3.9;

  qoiRoutine_Data calQoiRoutine_Data;
  calQoiRoutine_Data.m_beta         = beta_prediction;
  calQoiRoutine_Data.m_criticalMass = criticalMass_prediction;
  calQoiRoutine_Data.m_criticalTime = criticalTime_prediction;

  cycle.instantiateCalFP(NULL,
                         qoiRoutine,
                         (void *) &calQoiRoutine_Data);

  // Forward problem: solve it, that is, set 'realizer' and 'cdf' of the qoi rv
  cycle.calFP().solveWithMonteCarlo(NULL); // no extra user entities needed for Monte Carlo algorithm

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'calibration stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'calibration stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //********************************************************
  // Task 3 of 5: validation stage
  //********************************************************

  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'validation stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  // Inverse problem: no need to instantiate the prior rv (= posterior rv of calibration inverse problem)

  // Inverse problem: instantiate the likelihood function object
  Likelihood<> valLikelihood("val_like_",
                             paramDomain,
                             "inputData/scenario_100_K_min.dat",
                             NULL,
                             NULL);

  // Inverse problem: instantiate it (posterior rv is instantiated internally)
  cycle.instantiateValIP(NULL,valLikelihood);

  // Inverse problem: solve it, that is, set 'pdf' and 'realizer' of the posterior rv
  // NOTE(review): the dynamic_cast result is dereferenced below without a
  // null check; it is only valid if the calibration posterior's realizer
  // really is a SequentialVectorRealizer -- confirm.
  const QUESO::SequentialVectorRealizer<QUESO::GslVector,QUESO::GslMatrix>* tmpRealizer = dynamic_cast< const QUESO::SequentialVectorRealizer<QUESO::GslVector,QUESO::GslMatrix>* >(&(cycle.calIP().postRv().realizer()));
  QUESO::GslMatrix* valProposalCovMatrix = cycle.calIP().postRv().imageSet().vectorSpace().newProposalMatrix(&tmpRealizer->unifiedSampleVarVector(),  // Use 'realizer()' because post. rv was computed with MH
                                                                                                             &tmpRealizer->unifiedSampleExpVector()); // Use these values as the initial values
  cycle.valIP().solveWithBayesMetropolisHastings(NULL,
                                                 tmpRealizer->unifiedSampleExpVector(),
                                                 valProposalCovMatrix);
  delete valProposalCovMatrix;

  // Forward problem: instantiate it (parameter rv = posterior rv of inverse problem; qoi rv is instantiated internally)
  qoiRoutine_Data valQoiRoutine_Data;
  valQoiRoutine_Data.m_beta         = beta_prediction;
  valQoiRoutine_Data.m_criticalMass = criticalMass_prediction;
  valQoiRoutine_Data.m_criticalTime = criticalTime_prediction;

  cycle.instantiateValFP(NULL,
                         qoiRoutine,
                         (void *) &valQoiRoutine_Data);

  // Forward problem: solve it, that is, set 'realizer' and 'cdf' of the qoi rv
  cycle.valFP().solveWithMonteCarlo(NULL); // no extra user entities needed for Monte Carlo algorithm

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'validation stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'validation stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //********************************************************
  // Task 4 of 5: comparison stage
  //********************************************************

  /*iRC = */gettimeofday(&timevalRef, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Beginning 'comparison stage' at " << ctime(&timevalRef.tv_sec)
              << std::endl;
  }

  uqAppl_LocalComparisonStage(cycle);
  // The unified comparison is only run when there is more than one sub-environment.
  if (env.numSubEnvironments() > 1) {
    uqAppl_UnifiedComparisonStage(cycle);
  }

  /*iRC = */gettimeofday(&timevalNow, NULL);
  if (env.fullRank() == 0) {
    std::cout << "Ending 'comparison stage' at "        << ctime(&timevalNow.tv_sec)
              << "Total 'comparison stage' run time = " << timevalNow.tv_sec - timevalRef.tv_sec
              << " seconds\n"
              << std::endl;
  }

  //******************************************************
  // Task 5 of 5: release memory before leaving routine.
  //******************************************************

  if (env.fullRank() == 0) {
    std::cout << "Finishing run of 'uqTgaExample' example"
              << std::endl;
  }

  return;
}
void pLSA(double **n_dw1, double **n_dw2, double **p_w_z, double **p_z_d, int iter) { int i,j,d,z,w,zz; int step, index, startCount, endCount; int d_count, w_count; double sum, sumz; double freq; double beta = 1; // parameters in Tempered EM double eta = 0.95; // parameters in Tempered EM double **temp_p_w_z; // has different meanings double **temp_p_z_d; // has differnet meanings temp_p_w_z = dmatrix(V, K); temp_p_z_d = dmatrix(K, D); double **sum_w; // sum_{w}{p(z|d,w)*n(d,w)} double **sum_d; // sum_{d}{p(z|d,w)*n(d,w)} sum_w = dmatrix(K, D); sum_d = dmatrix(K, V); double likelihood[2] = {-1,-1}; // old, new: for early stopping for (step = 0; step < iter; step++) { // if (step%10 == 9) printf("%d%s%d\n", step+1, "/", iter); // E-step: //printf("%s\n", "E-step(1) begins..."); // sum_{d}{p(z|d,w)*n(d,w)} startCount = 0; endCount = 0; for (w = 0; w < V; w++) { // if (w%100 == 0) printf("%d\n", w); if (startCount >= N_Bill_Term) break; while ((int)(n_dw2[endCount][0]) == w) { endCount += 1; if (endCount >= N_Bill_Term) break; } if (endCount == startCount) printf("%s%d%s\n", "Word ", w, " doesn't appear in any bill"); if (endCount > N_Bill_Term) printf("%s\n", "Index error."); d_count = endCount - startCount; // if (w%100 == 0) printf("%s%d\n", "d_count ", d_count); int *d_for_w = (int *)calloc(d_count, sizeof(int)); double *dfreq_for_w = (double *)calloc(d_count, sizeof(double)); for (i = 0; i < d_count; i++) { d_for_w[i] = (int)(n_dw2[startCount+i][1]); dfreq_for_w[i] = n_dw2[startCount+i][2]; } for (i = 0; i < d_count; i++) { d = d_for_w[i]; for (z = 0; z < K; z++) // actually temp_p_z_d is p(z|d,w), but we omit w in order to save space temp_p_z_d[z][d] = p_w_z[w][z] * p_z_d[z][d]; sum = 0; for (z = 0; z < K; z++) sum += temp_p_z_d[z][d]; if (sum != 0) for (z = 0; z < K; z++) temp_p_z_d[z][d] /= sum; else printf("%s\n", "sum = 0 in E-step(1)!"); } for (z = 0; z < K; z++) { sum_d[z][w] = 0; for (i = 0; i < d_count; i++) { d = d_for_w[i]; freq = dfreq_for_w[i]; 
sum_d[z][w] += temp_p_z_d[z][d] * freq; // sum_{d}{p(z|d,w)*n(d,w)} } } startCount = endCount; free(d_for_w); free(dfreq_for_w); } //printf("%s\n", "E-step(2) begins..."); // sum_{w}{p(z|d,w)*n(d,w)} startCount = 0; endCount = 0; for (d = 0; d < D; d++) { // if (d%100 == 0) printf("%d\n", d); if (startCount >= N_Bill_Term) break; while ((int)(n_dw1[endCount][0]) == d) { endCount += 1; if (endCount >= N_Bill_Term) break; } w_count = endCount - startCount; int *w_for_d = (int *)calloc(w_count, sizeof(int)); double *wfreq_for_d = (double *)calloc(w_count, sizeof(double)); for (i = 0; i < w_count; i++) { w_for_d[i] = (int)(n_dw1[startCount+i][1]); wfreq_for_d[i] = n_dw1[startCount+i][2]; } for (i = 0; i < w_count; i++) { w = w_for_d[i]; for (z = 0; z < K; z++) // actually temp_p_w_z is p(z|d,w), but we omit d in order to save space temp_p_w_z[w][z] = p_w_z[w][z] * p_z_d[z][d]; sum = 0; for (z = 0; z < K; z++) sum += temp_p_w_z[w][z]; if (sum != 0) for (z = 0; z < K; z++) temp_p_w_z[w][z] /= sum; else printf("%s\n", "sum = 0 in E-step(2)!"); } for (z = 0; z < K; z++) { sum_w[z][d] = 0; for (i = 0; i < w_count; i++) { w = w_for_d[i]; freq = wfreq_for_d[i]; sum_w[z][d] += temp_p_w_z[w][z] * freq; // sum_{w}{p(z|d,w)*n(d,w)} } } startCount = endCount; free(w_for_d); free(wfreq_for_d); } // M-step(1): update p(z|d) //printf("%s\n", "M-step(1) begins..."); for (d = 0; d < D; d++) { // if (d%100 == 0) printf("%d\n",d); for (z = 0; z < K; z++) temp_p_z_d[z][d] = sum_w[z][d]; // Normalize sum = 0; // denominator for (z = 0; z < K; z++) sum += temp_p_z_d[z][d]; if (sum != 0) for (z = 0; z < K; z++) temp_p_z_d[z][d] /= sum; else printf("%s\n", "sum = 0 in M-step(1)!"); } // M-step(2): update p(w|z) // can add seeds here //printf("%s\n", "M-step(2) begins..."); for (z = 0; z < K; z++) { // printf("%d\n", z); for (w = 0; w < V; w++) temp_p_w_z[w][z] = sum_d[z][w]; // Normalize sum = 0; // denominator for (w = 0; w < V; w++) sum += temp_p_w_z[w][z]; if (sum != 0) for (w = 0; w < V; 
w++) temp_p_w_z[w][z] /= sum; else printf("%s\n", "sum = 0 in M-step(2)!"); } // update p(z|d) and p(w|z) for (z = 0; z < K; z++) { for (d = 0; d < D; d++) p_z_d[z][d] = temp_p_z_d[z][d]; for (w = 0; w < V; w++) p_w_z[w][z] = temp_p_w_z[w][z]; } // examine whether sum(p(z|d), z) = 1 and sum(p(w|z), w) = 1 for (d = 0; d < D; d++) { sum = 0; for (z = 0; z < K; z++) sum += p_z_d[z][d]; if (fabs(sum-1) > 0.0001) { printf("%s\n", "p(d|z) abs(sum-1)>0.0001!"); //printf("%s%d\n", "sum != 1 in step ", step+1); //printf("%s%f\n", "sum = ", sum); } } for (z = 0; z < K; z++) { sum = 0; for (w = 0; w < V; w++) sum += p_w_z[w][z]; if (fabs(sum-1) > 0.0001) { printf("%s\n", "p(z|w) abs(sum-1)>0.0001!"); } } // early stopping if (step%10 == 0) { likelihood[1] = calLikelihood(n_dw1, p_z_d, p_w_z); printf("%s%f\n%f\n", "likelihoods: (old/new)\n", likelihood[0], likelihood[1]); double rate = fabs((likelihood[1]-likelihood[0])/likelihood[0]); printf("%s%f\n", "rate = ", rate); if (rate < pow(10.0, -5.0) && step > 1) { printf("%s\n", "early stop"); return; } likelihood[0] = likelihood[1]; FILE *fp = fopen("../Result/result6479/likelihood_NoSeeds", "a"); fprintf(fp, "%lf\t%lf\n", likelihood[1], rate); fclose(fp); } } free_dmatrix(temp_p_w_z, V); free_dmatrix(temp_p_z_d, K); free_dmatrix(sum_w, K); free_dmatrix(sum_d, K); return; }