void parameter_cloud(Tree &T, Model &Mod, long Nrep, long length, double eps, Parameters &Parsim){ long iter; float likel; Parameters Par; Alignment align; Counts data; double eps_pseudo = 0.001; // Amount added to compute the pseudo-counts. // Initialize the parameters for simulation of K81 data for testing Par = create_parameters(T); // Obtaining the distribution of estimated parameters with EM std::ofstream estpar; estpar.open("est-par.dat", std::ios::out); estpar.precision(15); std::vector<double> param; for (iter=0; iter < Nrep; iter++) { random_data(T, Mod, Parsim, length, align); get_counts(align, data); add_pseudocounts(eps_pseudo, data); // Runs EM std::tie(likel, iter)= EMalgorithm(T, Mod, Par, data, eps); // Choses the best permutation. guess_permutation(T, Mod, Par); get_free_param_vector(T, Mod, Par, param); for (unsigned long k=0; k < param.size(); k++) { estpar << param[k] << " "; } estpar << std::endl; } }
// Clusters the sequences described by m_ID_vec / m_Path_vec with a mixture of
// first-order Markov chains fitted by EMalgorithm.
//
// Steps:
//   1. Preprocessing: consecutive rows that share the same m_ID_vec value are
//      grouped into one sequence (start row, id, length).
//   2. Model selection (only when m_bAutoCluster is set): for every candidate
//      cluster count the model is trained on the first `cutting` sequences and
//      scored on the remaining ones; m_nCluster becomes the candidate with the
//      lowest score.
//   3. Training with m_nCluster clusters on all sequences.
//   4. Hard assignment of every sequence into m_VecClus, serialisation of the
//      transition matrices into m_strModel and optional save to m_ModelFile.
//
// Always returns TRUE.
BOOL CMarkovChain::Main()
{
    // Candidate cluster counts tried by the automatic model selection:
    // kMinClusters <= K < kMaxClusters.
    const int kMinClusters = 3;
    const int kMaxClusters = 15;
    const int nCandidates = kMaxClusters - kMinClusters;

    int id = 0;
    int L = 0;
    int i = 0;
    int j,k,l,t;

    m_nPageNum = m_Path_vec.Max();

    CDoubleVector start_vec(m_nRow);
    CDoubleVector pairID_vec(m_nRow);
    CDoubleVector length_vec(m_nRow, 0);

    // (Preprocessing) begin
    // Walk the rows once; every run of equal ids becomes sequence number `id`.
    while (i < m_nRow)
    {
        start_vec(id) = i;
        pairID_vec(id) = m_ID_vec(i);
        while (pairID_vec(id) == m_ID_vec(i))
        {
            length_vec(id)++;
            i++;
            if (i >= m_nRow)
            {
                break;
            }
        }
        id++;
    }
    m_nTotalID = id;
    start_vec.resize(m_nTotalID);
    pairID_vec.resize(m_nTotalID);
    length_vec.resize(m_nTotalID);
    // (Preprocessing) end

    // (Model selection) begin
    if (m_bAutoCluster) // if the number of clusters is to be chosen automatically
    {
        // One score per evaluated candidate; slot c belongs to K = c + kMinClusters.
        CDoubleVector score(nCandidates);

        int cutting = min(0.4 * m_nTotalID, 5000);
        int KClusters;
        for (KClusters=kMinClusters; KClusters<kMaxClusters; KClusters++)
        {
            CDoubleMatrix responsibility(cutting, KClusters);

            // (Initialization) begin
            CDoubleVector Pi_vec(KClusters, 1.0/KClusters);
            CDoubleMatrix Theta_Init_mx(KClusters, m_nPageNum);
            srand(GetTickCount());
            for (k=0; k<KClusters; k++)
            {
                for (j=0; j<m_nPageNum; j++)
                {
                    Theta_Init_mx(k,j) = rand();
                }
            }
            for (k=0; k<KClusters; k++) // normalise each row to sum 1
            {
                double s = 0;
                for (j=0; j<m_nPageNum; j++)
                {
                    s += Theta_Init_mx(k,j);
                }
                for (j=0; j<m_nPageNum; j++)
                {
                    Theta_Init_mx(k,j) = Theta_Init_mx(k,j)/s;
                }
            }

            CDoubleMatrix * Theta_Trans = new CDoubleMatrix[KClusters];
            for (k=0; k<KClusters; k++)
            {
                CDoubleMatrix Theta_Trans_mx(m_nPageNum, m_nPageNum);
                for (i=0; i<m_nPageNum; i++)
                {
                    for (j=0; j<m_nPageNum; j++)
                    {
                        Theta_Trans_mx(i, j) = rand();
                    }
                }
                Theta_Trans[k] = Theta_Trans_mx;
            }
            for (k=0; k<KClusters; k++) // normalise each row of each matrix
            {
                for (j=0; j<m_nPageNum; j++)
                {
                    double s = 0;
                    for (l=0; l<m_nPageNum; l++)
                    {
                        s += Theta_Trans[k](j,l);
                    }
                    for (l=0; l<m_nPageNum; l++)
                    {
                        Theta_Trans[k](j,l) = Theta_Trans[k](j,l)/s;
                    }
                }
            }
            // (Initialization) end

            EMalgorithm(Pi_vec, Theta_Init_mx, Theta_Trans, responsibility,
                        start_vec, length_vec, pairID_vec, KClusters, cutting);

            // (Compute Score) begin
            // Base-2 log-likelihood of the sequences not used for training.
            double LS2 = 0;
            double total_length = 0;
            for (i=cutting; i<m_nTotalID; i++)
            {
                double like = 0;
                for (k=0; k<KClusters; k++)
                {
                    L = length_vec(i);
                    double temp = 1;
                    for (t=start_vec(i); t<start_vec(i)+L-1; t++)
                    {
                        temp *= Theta_Trans[k](m_Path_vec(t)-1,m_Path_vec(t+1)-1);
                    }
                    like += temp * Pi_vec(k) * Theta_Init_mx(k,m_Path_vec(start_vec(i))-1);
                }
                LS2 += log(like)/log(2);
            }
            // The normaliser sums the lengths of ALL sequences, as before.
            for (i=0; i<m_nTotalID; i++)
            {
                total_length += length_vec(i);
            }
            score(KClusters-kMinClusters) = -(LS2/total_length)+(0.01*KClusters);
            // (Compute Score) end

            // The candidate's transition matrices are no longer needed;
            // previously this array leaked on every candidate.
            delete[] Theta_Trans;
        }

        // Pick the candidate with the lowest score. Only slots that were
        // actually written are inspected (the old scan read 28 slots although
        // just 12 had been filled, and ran even without auto-clustering).
        int best = 0;
        for (i=1; i<nCandidates; i++)
        {
            if (score(i) < score(best))
            {
                best = i;
            }
        }
        m_nCluster = best + kMinClusters;
    }
    // (Model selection) end

    // (Training) begin
    CDoubleMatrix responsibility(m_nTotalID, m_nCluster);

    // (Initialization)
    CDoubleVector Pi_vec(m_nCluster, 1.0/m_nCluster);          // Pi
    CDoubleMatrix Theta_Init_mx(m_nCluster, m_nPageNum);       // Theta_Init
    srand(GetTickCount());
    for (k=0; k<m_nCluster; k++)
    {
        for (j=0; j<m_nPageNum; j++)
        {
            Theta_Init_mx(k,j) = rand();
        }
    }
    for (k=0; k<m_nCluster; k++) // normalization
    {
        double s = 0;
        for (j=0; j<m_nPageNum; j++)
        {
            s += Theta_Init_mx(k,j);
        }
        for (j=0; j<m_nPageNum; j++)
        {
            Theta_Init_mx(k,j) = Theta_Init_mx(k,j)/s;
        }
    }

    CDoubleMatrix * Theta_Trans = new CDoubleMatrix[m_nCluster]; // Theta_Trans
    for (k=0; k<m_nCluster; k++)
    {
        CDoubleMatrix Theta_Trans_mx(m_nPageNum, m_nPageNum);
        for (i=0; i<m_nPageNum; i++)
        {
            for (j=0; j<m_nPageNum; j++)
            {
                Theta_Trans_mx(i, j) = rand();
            }
        }
        Theta_Trans[k] = Theta_Trans_mx;
    }
    for (k=0; k<m_nCluster; k++) // normalization
    {
        for (j=0; j<m_nPageNum; j++)
        {
            double s = 0;
            for (l=0; l<m_nPageNum; l++)
            {
                s += Theta_Trans[k](j,l);
            }
            for (l=0; l<m_nPageNum; l++)
            {
                Theta_Trans[k](j,l) = Theta_Trans[k](j,l)/s;
            }
        }
    }

    // EM training
    EMalgorithm(Pi_vec, Theta_Init_mx, Theta_Trans, responsibility,
                start_vec, length_vec, pairID_vec, m_nCluster, m_nTotalID);
    // (Training) end

    // (Hard assignment) begin
    m_VecClus.create(m_nTotalID);
    for (i=0; i<m_nTotalID; i++)
    {
        int which = 0;
        for (k=0; k<m_nCluster; k++)
        {
            // NOTE(review): this keeps the cluster with the SMALLEST
            // responsibility value. If EMalgorithm stores posterior
            // probabilities the comparison is probably inverted; confirm
            // against EMalgorithm before changing it.
            if (responsibility(i, which) > responsibility(i,k))
            {
                which = k;
            }
        }
        m_VecClus(i) = which;
    }

    // Write the model information to m_strModel: cluster count, page count,
    // then every transition matrix with the second index (j) as the outer loop.
    CTString strTemp;
    m_strModel = "";
    strTemp.Format("%d", m_nCluster);
    m_strModel += strTemp;
    m_strModel += " ";
    strTemp.Format("%d", m_nPageNum);
    m_strModel += strTemp;
    m_strModel += " ";
    for (k=0; k<m_nCluster; k++)
    {
        for (j=0; j<m_nPageNum; j++)
        {
            for (i=0; i<m_nPageNum; i++)
            {
                strTemp.Format("%.9f", Theta_Trans[k](i,j));
                m_strModel += strTemp;
                m_strModel += " ";
            }
        }
    }
    // Model information written to m_strModel.

    // Release the trained transition matrices (previously leaked).
    delete[] Theta_Trans;

    if (m_bSaveModel) // save the model
        CFileReadWrite::WriteStrToFile(m_ModelFile, m_strModel);
    // (Hard assignment) end

    return TRUE;
}
void parameter_test(Tree &T, Model &Mod, long Nrep, long length, double eps, std::vector<double> &pvals, std::string data_prefix, bool save_mc_exact){ long iter; long i, r; double df, C; double distance, KL; KL=0; distance=0; double likel; Parameters Parsim, Par, Par_noperm; Alignment align; Counts data; double eps_pseudo = 0.001; // Amount added to compute the pseudo-counts. StateList sl; bool save_data = (data_prefix != ""); std::string output_filename; std::stringstream output_index; std::ofstream logfile; std::ofstream logdistfile; std::ofstream out_chi2; std::ofstream out_br; std::ofstream out_brPerc; std::ofstream out_pvals; std::ofstream out_pvals_noperm; std::ofstream out_qvals; std::ofstream out_bound; std::ofstream out_variances; std::ofstream out_qvalsComb; std::ofstream out_qvalsCombzscore; std::ofstream out_covmatrix; std::ofstream out_parest; std::ofstream out_parsim; std::vector<double> KLe; std::vector<std::vector<double> > chi2_array; // an array of chi2 for every edge. std::vector<std::vector<double> > mult_array; // an array of mult for every edge. std::vector<std::vector<double> > br_array; // an array of br. length for every edge. std::vector<std::vector<double> > br_arrayPerc; // an array of br. length for every edge. std::vector<std::vector<double> > cota_array; // an array of upper bounds of the diff in lengths for every edge. std::vector<std::vector<double> > pval_array; // an array of pvals for every edge. std::vector<std::vector<double> > pval_noperm_array; std::vector<std::vector<double> > qval_array; // an array of qvalues for every edge. std::vector<std::vector<double> > variances_array; // an array of theoretical variances. 
std::vector<std::vector<double> > parest_array; // array of estimated parameters std::vector<std::vector<double> > parsim_array; // array of simulation parameters // ci_binom ci_bin; // condfidence interval std::vector<std::vector<ci_binom> > CIbinomial ; // vector of CIs std::list<long> produced_nan; long npars = T.nedges*Mod.df + Mod.rdf; // Initializing pvals pvals.resize(T.nedges); // Initialize the parameters for simulation of K81 data for testing Par = create_parameters(T); Parsim = create_parameters(T); // Initializing data structures KLe.resize(T.nedges); pval_array.resize(T.nedges); pval_noperm_array.resize(T.nedges); qval_array.resize(T.nedges); chi2_array.resize(T.nedges); mult_array.resize(T.nedges); br_array.resize(T.nedges); br_arrayPerc.resize(T.nedges); cota_array.resize(T.nedges); variances_array.resize(npars); parest_array.resize(npars); parsim_array.resize(npars); // initialize to 0's for (i=0; i < T.nedges; i++) { pval_array[i].resize(Nrep, 0); pval_noperm_array[i].resize(Nrep, 0); qval_array[i].resize(Nrep, 0); chi2_array[i].resize(Nrep, 0); mult_array[i].resize(Nrep, 0); br_array[i].resize(Nrep, 0); br_arrayPerc[i].resize(Nrep, 0); cota_array[i].resize(Nrep, 0); } for(i=0; i < npars; i++) { variances_array[i].resize(Nrep, 0); parest_array[i].resize(Nrep, 0); parsim_array[i].resize(Nrep, 0); } // Information about the chi^2. 
df = Mod.df; C = get_scale_constant(Mod); if (save_data) { logfile.open((data_prefix + ".log").c_str(), std::ios::out); logfile << "model: " << Mod.name << std::endl; logfile << "length: " << length << std::endl; logfile << "eps: " << eps << std::endl; logfile << "nalpha: " << T.nalpha << std::endl; logfile << "leaves: " << T.nleaves << std::endl; logfile << "tree: " << T.tree_name << std::endl; logfile << std::endl; logdistfile.open((data_prefix + ".dist.log").c_str(), std::ios::out); out_chi2.open(("out_chi2-" + data_prefix + ".txt").c_str(), std::ios::out); out_br.open(("out_br-" + data_prefix + ".txt").c_str(), std::ios::out); out_brPerc.open(("out_brPerc-" + data_prefix + ".txt").c_str(), std::ios::out); out_pvals.open(("out_pvals-" + data_prefix + ".txt").c_str(), std::ios::out); out_pvals_noperm.open(("out_pvals_noperm-" + data_prefix + ".txt").c_str(), std::ios::out); out_qvals.open(("out_qvals-" + data_prefix + ".txt").c_str(), std::ios::out); out_variances.open(("out_variances-" + data_prefix + ".txt").c_str(), std::ios::out); out_parest.open(("out_params-est-" + data_prefix + ".txt").c_str(), std::ios::out); out_parsim.open(("out_params-sim-" + data_prefix + ".txt").c_str(), std::ios::out); out_bound.open(("out_bound-" + data_prefix + ".txt").c_str(), std::ios::out); out_qvalsComb.open(("out_qvalsComb-" + data_prefix + ".txt").c_str(), std::ios::out); out_qvalsCombzscore.open(("out_qvalsCombzscore-" + data_prefix + ".txt").c_str(), std::ios::out); out_parsim.precision(15); out_parest.precision(15); out_variances.precision(15); } // uncomment the 2 following lines if want to fix the parameters // random_parameters_length(T, Mod, Parsim); //random_data(T, Mod, Parsim, length, align); for (iter=0; iter < Nrep; iter++) { std::cout << "iteration: " << iter << " \n"; // Produces an alignment from random parameters random_parameters_length(T, Mod, Parsim); random_data(T, Mod, Parsim, length, align); get_counts(align, data); add_pseudocounts(eps_pseudo, data); 
// Saving data if (save_data) { output_index.str(""); output_index << iter; output_filename = data_prefix + "-" + output_index.str(); save_alignment(align, output_filename + ".fa"); save_parameters(Parsim, output_filename + ".sim.dat"); } // Runs the EM std::tie(likel, iter) = EMalgorithm(T, Mod, Par, data, eps); // If algorithm returns NaN skip this iteration. if (boost::math::isnan(likel)) { produced_nan.push_back(iter); continue; } copy_parameters(Par, Par_noperm); // Chooses the best permutation. guess_permutation(T, Mod, Par); distance = parameters_distance(Parsim, Par); // estimated counts: Par ; original: Parsim std::vector<double> counts_est; counts_est.resize(T.nalpha, 0); // calculate the cov matrix std::vector<std::vector<double> > Cov; Array2 Cov_br; full_MLE_covariance_matrix(T, Mod, Parsim, length, Cov); if(save_data) { save_matrix(Cov, output_filename + ".cov.dat"); } // Save the covariances in an array std::vector<double> param; std::vector<double> param_sim; param.resize(npars); param_sim.resize(npars); get_free_param_vector(T, Mod, Par, param); get_free_param_vector(T, Mod, Parsim, param_sim); for(i=0; i < npars; i++) { variances_array[i][iter] = Cov[i][i]; parsim_array[i][iter] = param_sim[i]; parest_array[i][iter] = param[i]; } std::vector<double> xbranca, xbranca_noperm, mubranca; double chi2_noperm; xbranca.resize(Mod.df); xbranca_noperm.resize(Mod.df); mubranca.resize(Mod.df); for (i=0; i < T.nedges; i++) { r = 0; // row to be fixed // Extracts the covariance matrix, 1 edge branch_inverted_covariance_matrix(Mod, Cov, i, Cov_br); get_branch_free_param_vector(T, Mod, Parsim, i, mubranca); get_branch_free_param_vector(T, Mod, Par, i, xbranca); get_branch_free_param_vector(T, Mod, Par_noperm, i, xbranca_noperm); chi2_array[i][iter] = chi2_mult(mubranca, xbranca, Cov_br); chi2_noperm = chi2_mult(mubranca, xbranca_noperm, Cov_br); pval_array[i][iter] = pvalue_chi2(chi2_array[i][iter], Mod.df); pval_noperm_array[i][iter] = pvalue_chi2(chi2_noperm, 
Mod.df); br_array[i][iter] = T.edges[i].br - branch_length(Par.tm[i], T.nalpha); br_arrayPerc[i][iter] = branch_length(Par.tm[i], T.nalpha)/T.edges[i].br; // Upper bound on the parameter distance using multinomial: // cota_array[i][iter] = bound_mult(Parsim.tm[i], Xm, length); // and using the L2 bound cota_array[i][iter] = branch_length_error_bound_mult(Parsim.tm[i], Par.tm[i]); out_br << br_array[i][iter] << " "; out_brPerc << br_arrayPerc[i][iter] << " "; out_bound << cota_array[i][iter] << " "; out_chi2 << chi2_array[i][iter] << " "; } out_chi2 << std::endl; out_bound << std::endl; out_br << std::endl; out_brPerc << std::endl; // Saves more data. if (save_data) { logfile << iter << ": " << distance << " " << KL << std::endl; save_parameters(Par, output_filename + ".est.dat"); logdistfile << iter << ": "; logdistfile << parameters_distance_root(Par, Parsim) << " "; for(int j=0; j < T.nedges; j++) { logdistfile << parameters_distance_edge(Par, Parsim, j) << " "; } logdistfile << std::endl; } } // close iter loop here // Correct the p-values for(i=0; i < T.nedges; i++) { BH(pval_array[i], qval_array[i]); //save them } if (save_mc_exact) { for(long iter=0; iter < Nrep; iter++) { for(long i=0; i < T.nedges; i++) { out_pvals << pval_array[i][iter] << " "; out_pvals_noperm << pval_noperm_array[i][iter] << " "; out_qvals << qval_array[i][iter] << " "; } out_pvals << std::endl; out_pvals_noperm << std::endl; out_qvals << std::endl; for(long i=0; i < npars; i++) { out_variances << variances_array[i][iter] << " "; out_parsim << parsim_array[i][iter] << " "; out_parest << parest_array[i][iter] << " "; } out_variances << std::endl; out_parsim << std::endl; out_parest << std::endl; } } // now combine the pvalues for(i=0; i < T.nedges; i++) { pvals[i] = Fisher_combined_pvalue(pval_array[i]); //using the Zscore it goes like this: pvals[i] = Zscore_combined_pvalue(pval_array[i]); if (save_mc_exact) { out_qvalsComb << pvals[i] << " " ; out_qvalsCombzscore << 
Zscore_combined_pvalue(pval_array[i]) << " "; } } // Close files if (save_data) { logdistfile.close(); logfile.close(); } if (save_mc_exact) { out_chi2.close(); out_bound.close(); out_variances.close(); out_parest.close(); out_parsim.close(); out_br.close(); out_brPerc.close(); out_pvals.close(); out_qvals.close(); out_qvalsComb.close(); out_qvalsCombzscore.close(); out_covmatrix.close(); } // Warn if some EM's produced NaN. if (produced_nan.size() > 0) { std::cout << std::endl; std::cout << "WARNING: Some iterations produced NaN." << std::endl; std::list<long>::iterator it; for (it = produced_nan.begin(); it != produced_nan.end(); it++) { std::cout << *it << ", "; } std::cout << std::endl; } }
void run(std::string tree_filename, std::string fasta_filename, std::string model_name) { Model Mod; // The model Counts data; // the counts Parameters Par; // the parameters std::vector<double> br; // branch lengths double eps = 1e-8; // The threshold for the EM algorithm. Parameters Parsim; // used for simulating data. std::vector<double> brsim; // branch lengths of simulated data. std::vector<std::vector<double> > Cov; // Covariance matrix std::vector<double> variances; // The variances bool simulate; bool nonident; std::string parameters_filename; std::string covariances_filename; // initialize random number generator with time(0). random_initialize(); parameters_filename = strip_extension(fasta_filename) + ".dat"; covariances_filename = strip_extension(fasta_filename) + ".cov"; // Creates the pointers to the model-specific functions. Mod = create_model(model_name); std::cout << "Model: " << Mod.name << std::endl; // Reads the tree. Tree T = read_tree(tree_filename); // Prints the Tree std::cout << "Tree:" << std::endl; print_tree(T); // Check for possible nonidentifiability issues. nonident = nonident_warning(T); // Initialize the parameters for simulation of K81 data for testing Parsim = create_parameters(T); if (fasta_filename == ":test") { // if fasta file is :test generate random data. simulate = true; // Warn std::cout << "WARNING: Using simulated data " << std::endl << std::endl; // Generate random parameters random_parameters_length(T, Mod, Parsim); // Simulate the data data = random_fake_counts(T, 1000, Parsim); // Prints branch-lengths for future check. branch_lengths(Parsim, brsim); std::cout << "Simulated branch lengths:" << std::endl; print_vector(brsim); } else { // otherwise read the data simulate = false; // Read the counts. std::cout << "Reading fasta file:" << std::endl; read_counts(T, data, fasta_filename); add_pseudocounts(0.01, data); std::cout << std::endl; } // Check whether the data and the tree match. 
if (T.nalpha != data.nalpha || T.nleaves != data.nspecies) { throw std::invalid_argument("The order of the sequences or their number and the phylogenetic tree do not match."); } //Par = create_parameters(T); //print_parameters(Par); //print_vector(Par.r); //clock_t long start_time, end_time; // Runs the EM algorithm. Par is used as initial parameters. // After execution, Par contains the MLE computed by the algorithm. // for local max over multiple iterations Parameters Parmax = Par; Model Modmax = Mod; float likelL = 0.0; float likelMax = -1000000.0; float timerec; float timemax; int outfiles; //whether to save output std::cout << "Starting the EM algorithm: " << std::endl; int s; int S = 0; //count of cases with neg branches int iter; int iterMax; for (int it_runs = 0; it_runs < 10; it_runs++) { Par = create_parameters(T); Mod = create_model(model_name); std::cout << it_runs << ", " ; start_time = clock(); std::tie(likelL, iter) = EMalgorithm(T, Mod, Par, data, eps); end_time = clock(); //print_parameters(Par); // Choses the best permutation. guess_permutation(T, Mod, Par); branch_lengths(Par, br); //print_vector(br); s = find_negative(br); S +=s; timerec = ((float)end_time - start_time) / CLOCKS_PER_SEC; //assign the 1st iter time value, inc ase it's the best if (it_runs == 0){ timemax = timerec; iterMax = iter; } if (likelL > likelMax){ Parmax = Par; Modmax = Mod; timemax = timerec; likelMax = likelL; iterMax = iter; } } // If parameters are not identifiable, the computation of the covariance matrix will // fail as the Fisher info matrix will not be invertible. if (!nonident) { // Compute the covariance matrix using observed Fisher. full_MLE_observed_covariance_matrix(T, Modmax, Parmax, data, Cov); variances.resize(Cov.size()); for(unsigned int i=0; i < Cov.size(); i++) { variances[i] = Cov[i][i]; } // OUTPUT Save the sigmas into a file //save_sigmas_to(covariances_filename, Cov); } std::cout << std::endl; std::cout << "Finished." 
<< std::endl; std::cout << "Likelihood: " << log_likelihood(T, Parmax, data) << std::endl ; std::cout << "Time: " << timemax << std::endl << std::endl; std::cout << "negative branches: " << S << std::endl; std::cout << "Iter: " << iterMax << std::endl; //std::cout << "Branch lengths: " << std::endl; //print_vector(br); outfiles = 0; if (!nonident && outfiles) { std::cout << "Parameter variances: " << std::endl; print_vector(variances); } std::cout << "Newick Tree:" << std::endl; print_newick_tree(T, br); // if is a simulation, print the L2 distance ! if (simulate) { std::cout << "L2 distance: " << parameters_distance(Par, Parsim) << std::endl; std::cout << "KL divergence: " << KL_divergence(T, Par, Parsim) << std::endl; std::cout << std::endl; } // if it is not a simulation, store the parameters in a file ! if (!simulate && outfiles) { std::fstream st; st.precision(15); st.setf(std::ios::fixed,std::ios::floatfield); st.open(parameters_filename.c_str(), std::ios::out); print_parameters(Par, st); } }