void TaggerMergingData::calculate_parameters_merging(istream& fcounts, int corpus_length) { map<int, map<int, double> > tags_pair; map<int, map<int, double> > emis; map<int, double> tags_count; map<int, double> ambclass_count; map<int, double> tags_count_for_emis; map<int, map<int, double> > coarse_tags_pair; map<int, map<int, double> > coarse_emis; map<int, double> coarse_tags_count; map<int, double> coarse_ambclass_count; map<int, double> coarse_tags_count_for_emis; set<int>::iterator iti, itj, itk, itcoarse; read_counts(fcounts, tags_pair, emis, tags_count, ambclass_count, tags_count_for_emis); cerr<<"Calculating new parameters ... "<<flush; //Calculate coarse tag counts from fine-grained tag counts for(int i=0; i<COARSE_N; i++) { for(int j=0; j<COARSE_N; j++) { coarse_tags_pair[i][j]=0.0; for(iti=coarse2fine[i].begin(); iti!=coarse2fine[i].end(); iti++) { for(itj=coarse2fine[j].begin(); itj!=coarse2fine[j].end(); itj++) { coarse_tags_pair[i][j]+=tags_pair[*iti][*itj]; } } //cerr<<"coarse_tags_pair["<<i<<"]["<<j<<"]="<<coarse_tags_pair[i][j]<<"\n"; } } //Calculate coarse amb. class emission counts for(int k=0; k<COARSE_M; k++) { for(itcoarse=coarse_output[k].begin(); itcoarse!=coarse_output[k].end(); itcoarse++) { coarse_emis[*itcoarse][k]=0.0; for(iti=coarse2fine[*itcoarse].begin(); iti!=coarse2fine[*itcoarse].end(); iti++) { for(itk=coarseamb2fineamb[k].begin(); itk!=coarseamb2fineamb[k].end(); itk++) { if (TaggerData::getOutput()[*itk].find(*iti)!=TaggerData::getOutput()[*itk].end()) { coarse_emis[*itcoarse][k]+=emis[*iti][*itk]; } } } //cerr<<"coarse_emis["<<*itcoarse<<"]["<<k<<"]="<<coarse_emis[*itcoarse][k]<<"\n"; } } for(int i=0; i<COARSE_N; i++) { coarse_tags_count[i]=0.0; coarse_tags_count_for_emis[i]=0.0; for(iti=coarse2fine[i].begin(); iti!=coarse2fine[i].end(); iti++) { coarse_tags_count[i]+=tags_count[*iti]; coarse_tags_count_for_emis[i]+=tags_count_for_emis[*iti]; } //cerr<<"coarse_tags_count["<<i<<"]="<<coarse_tags_count[i]<<"\n"; //cerr<<"coarse_tags_count_for_emis["<<"i"<<"]="<<coarse_tags_count_for_emis[i]<<"\n"; } for(int k=0; k<COARSE_M; k++) { coarse_ambclass_count[k]=0.0; //cerr<<"k="<<k<<" "; for(itk=coarseamb2fineamb[k].begin(); itk!=coarseamb2fineamb[k].end(); itk++) { //cerr<<*itk<<"("<<ambclass_count[*itk]<<") "; coarse_ambclass_count[k]+=ambclass_count[*itk]; } //cerr<<"coarse_ambclass_count["<<k<<"]="<<coarse_ambclass_count[k]<<"\n"; } SmoothUtils::calculate_smoothed_parameters(*this, coarse_tags_count, coarse_tags_pair, coarse_ambclass_count, coarse_emis, coarse_tags_count_for_emis, corpus_length); cerr<<"done.\n"; }
void run(std::string tree_filename, std::string fasta_filename, std::string model_name) { Model Mod; // The model Counts data; // the counts Parameters Par; // the parameters std::vector<double> br; // branch lengths double eps = 1e-8; // The threshold for the EM algorithm. Parameters Parsim; // used for simulating data. std::vector<double> brsim; // branch lengths of simulated data. std::vector<std::vector<double> > Cov; // Covariance matrix std::vector<double> variances; // The variances bool simulate; bool nonident; std::string parameters_filename; std::string covariances_filename; // initialize random number generator with time(0). random_initialize(); parameters_filename = strip_extension(fasta_filename) + ".dat"; covariances_filename = strip_extension(fasta_filename) + ".cov"; // Creates the pointers to the model-specific functions. Mod = create_model(model_name); std::cout << "Model: " << Mod.name << std::endl; // Reads the tree. Tree T = read_tree(tree_filename); // Prints the Tree std::cout << "Tree:" << std::endl; print_tree(T); // Check for possible nonidentifiability issues. nonident = nonident_warning(T); // Initialize the parameters for simulation of K81 data for testing Parsim = create_parameters(T); if (fasta_filename == ":test") { // if fasta file is :test generate random data. simulate = true; // Warn std::cout << "WARNING: Using simulated data " << std::endl << std::endl; // Generate random parameters random_parameters_length(T, Mod, Parsim); // Simulate the data data = random_fake_counts(T, 1000, Parsim); // Prints branch-lengths for future check. branch_lengths(Parsim, brsim); std::cout << "Simulated branch lengths:" << std::endl; print_vector(brsim); } else { // otherwise read the data simulate = false; // Read the counts. std::cout << "Reading fasta file:" << std::endl; read_counts(T, data, fasta_filename); add_pseudocounts(0.01, data); std::cout << std::endl; } // Check whether the data and the tree match. if (T.nalpha != data.nalpha || T.nleaves != data.nspecies) { throw std::invalid_argument("The order of the sequences or their number and the phylogenetic tree do not match."); } //Par = create_parameters(T); //print_parameters(Par); //print_vector(Par.r); //clock_t long start_time, end_time; // Runs the EM algorithm. Par is used as initial parameters. // After execution, Par contains the MLE computed by the algorithm. // for local max over multiple iterations Parameters Parmax = Par; Model Modmax = Mod; float likelL = 0.0; float likelMax = -1000000.0; float timerec; float timemax; int outfiles; //whether to save output std::cout << "Starting the EM algorithm: " << std::endl; int s; int S = 0; //count of cases with neg branches int iter; int iterMax; for (int it_runs = 0; it_runs < 10; it_runs++) { Par = create_parameters(T); Mod = create_model(model_name); std::cout << it_runs << ", " ; start_time = clock(); std::tie(likelL, iter) = EMalgorithm(T, Mod, Par, data, eps); end_time = clock(); //print_parameters(Par); // Choses the best permutation. guess_permutation(T, Mod, Par); branch_lengths(Par, br); //print_vector(br); s = find_negative(br); S +=s; timerec = ((float)end_time - start_time) / CLOCKS_PER_SEC; //assign the 1st iter time value, inc ase it's the best if (it_runs == 0){ timemax = timerec; iterMax = iter; } if (likelL > likelMax){ Parmax = Par; Modmax = Mod; timemax = timerec; likelMax = likelL; iterMax = iter; } } // If parameters are not identifiable, the computation of the covariance matrix will // fail as the Fisher info matrix will not be invertible. if (!nonident) { // Compute the covariance matrix using observed Fisher. full_MLE_observed_covariance_matrix(T, Modmax, Parmax, data, Cov); variances.resize(Cov.size()); for(unsigned int i=0; i < Cov.size(); i++) { variances[i] = Cov[i][i]; } // OUTPUT Save the sigmas into a file //save_sigmas_to(covariances_filename, Cov); } std::cout << std::endl; std::cout << "Finished." << std::endl; std::cout << "Likelihood: " << log_likelihood(T, Parmax, data) << std::endl ; std::cout << "Time: " << timemax << std::endl << std::endl; std::cout << "negative branches: " << S << std::endl; std::cout << "Iter: " << iterMax << std::endl; //std::cout << "Branch lengths: " << std::endl; //print_vector(br); outfiles = 0; if (!nonident && outfiles) { std::cout << "Parameter variances: " << std::endl; print_vector(variances); } std::cout << "Newick Tree:" << std::endl; print_newick_tree(T, br); // if is a simulation, print the L2 distance ! if (simulate) { std::cout << "L2 distance: " << parameters_distance(Par, Parsim) << std::endl; std::cout << "KL divergence: " << KL_divergence(T, Par, Parsim) << std::endl; std::cout << std::endl; } // if it is not a simulation, store the parameters in a file ! if (!simulate && outfiles) { std::fstream st; st.precision(15); st.setf(std::ios::fixed,std::ios::floatfield); st.open(parameters_filename.c_str(), std::ios::out); print_parameters(Par, st); } }