void SequenceCleaner::read_sequences (istream* pios) { Sequence seq; string retstring; int ft = test_seq_filetype_stream(*pios, retstring); int num_current_char = 0; bool first = true; while (read_next_seq_from_stream(*pios, ft, retstring, seq)) { sequences_[seq.get_id()] = seq.get_sequence(); num_current_char = seq.get_sequence().size(); if (first) { num_char_ = num_current_char; // just getting this from an arbitrary (first) sequence for now if (is_dna_) { string alpha_name = seq.get_alpha_name(); if (alpha_name == "AA") { is_dna_ = false; //cout << "I believe this is a protein!" << endl; } } first = false; continue; } else { if (num_current_char != num_char_) { cout << "Error: sequences are not all of the same length. Exiting." << endl; exit(0); } } } if (ft == 2) { sequences_[seq.get_id()] = seq.get_sequence(); num_current_char = seq.get_sequence().size(); if (num_current_char != num_char_) { cout << "Error: sequences are not all of the same length. Exiting." << endl; exit(0); } } num_taxa_ = sequences_.size(); }
/**
 * Build a neighbour-joining analysis from the sequences in a stream.
 *
 * All sequences are stored in sequences_ keyed by id. The first sequence
 * delivered inside the read loop defines nchar_; any other sequence of a
 * different length aborts the program with exit status 1. Afterwards the
 * name key and distance matrix are built and TREEMAKE is invoked.
 *
 * @param pios     stream to read from; must not be null.
 * @param threads  stored in nthreads_.
 */
NJOI::NJOI (istream* pios, int & threads):ntax_(0), nchar_(0), nthreads_(threads) {
    string retstring;
    int ft = test_seq_filetype_stream(*pios, retstring);
    Sequence seq;
    int n_read = 0;
    bool have_reference_length = false;

    // some error checking. should be in general seq reader class
    auto die_on_length_mismatch = [&] () {
        cout << "Error: sequence " << seq.get_id() << " has "
            << seq.get_length() << " characters, was expecting "
            << nchar_ << "." << endl << "Exiting." << endl;
        exit(1);
    };

    while (read_next_seq_from_stream(*pios, ft, retstring, seq)) {
        sequences_[seq.get_id()] = seq.get_sequence();
        if (have_reference_length) {
            if (static_cast<int>(seq.get_length()) != nchar_) {
                die_on_length_mismatch();
            }
        } else {
            // first sequence seen in the loop sets the expected length
            nchar_ = seq.get_length();
            have_reference_length = true;
        }
        ++n_read;
    }

    // fasta has a trailing one
    if (ft == 2) {
        sequences_[seq.get_id()] = seq.get_sequence();
        if (static_cast<int>(seq.get_length()) != nchar_) {
            die_on_length_mismatch();
        }
        ++n_read;
    }

    ntax_ = n_read;
    set_name_key ();
    Matrix = BuildMatrix(sequences_);
    TREEMAKE(names_, name_key_, Matrix);
}
int main(int argc, char * argv[]) { log_call(argc, argv); bool cfileset = false; bool tfileset = false; bool outfileset = false; char * treef = NULL; char * charf = NULL; char * outf = NULL; int analysis = 0; while (1) { int oi = -1; int c = getopt_long(argc, argv, "c:t:o:hV", long_options, &oi); if (c == -1) { break; } switch(c) { case 'c': cfileset = true; charf = strdup(optarg); check_file_exists(charf); break; case 't': tfileset = true; treef = strdup(optarg); check_file_exists(treef); break; case 'o': outfileset = true; outf = strdup(optarg); break; case 'h': print_help(); exit(0); case 'V': cout << versionline << endl; exit(0); default: print_error(argv[0], (char)c); exit(0); } } istream * pios = NULL; istream * poos = NULL; ifstream * cfstr = NULL; ifstream * tfstr = NULL; ostream * poouts = NULL; ofstream * ofstr = NULL; if (tfileset == true) { tfstr = new ifstream(treef); poos = tfstr; } else { poos = &cin; } if (cfileset == true) { cfstr = new ifstream(charf); pios = cfstr; } else { cout << "you have to set a character file. 
Only a tree file can be read in through the stream;" << endl; } //out file // if (outfileset == true){ ofstr = new ofstream(outf); poouts = ofstr; } else{ poouts = &cout; } // string retstring; int ft = test_char_filetype_stream(*pios, retstring); if (ft != 1 && ft != 2) { cout << "only fasta and phylip (with spaces) supported so far" << endl; exit(0); } Sequence seq; vector <Sequence> seqs; map <string, int> seq_map; int y = 0; int nchars = 0 ; while (read_next_seq_char_from_stream(*pios, ft, retstring, seq)) { seqs.push_back(seq); nchars = seq.get_num_cont_char(); seq_map[seq.get_id()] = y; seq.clear_cont_char(); y++; } cout << "nchars: " << nchars << endl; if (ft == 2) { seqs.push_back(seq); seq_map[seq.get_id()] = y; seq.clear_cont_char(); } //read trees TreeReader tr; vector<Tree *> trees; while (getline(*poos,retstring)) { if (retstring.size()<4){ continue; } trees.push_back(tr.readTree(retstring)); } int x = 0; //conduct analyses for each character for (int i=0; i < trees[x]->getExternalNodeCount(); i++) { vector<Superdouble> tv (nchars); for (int c=0; c < nchars; c++) { tv[c] = seqs[seq_map[trees[x]->getExternalNode(i)->getName()]].get_cont_char(c); } trees[x]->getExternalNode(i)->assocDoubleVector("val",tv); } for (int i=0; i < trees[x]->getInternalNodeCount(); i++) { vector<Superdouble> tv (nchars); for (int c=0; c < nchars; c++) { tv[c] = 0; } trees[x]->getInternalNode(i)->assocDoubleVector("val",tv); } float sigma = 1; cout << calc_bm_prune(trees[x], sigma) << endl; optimize_single_rate_bm_bl(trees[x]); (*poouts) << trees[x]->getRoot()->getNewick(true) << ";" << endl; cout << calc_bm_prune(trees[x], sigma) << endl; if (cfileset) { cfstr->close(); delete pios; } if (tfileset) { tfstr->close(); delete poos; } if (outfileset) { ofstr->close(); delete poouts; } return EXIT_SUCCESS; }
void SequenceConcatenater::read_sequences (string & seqf) { filename_ = seqf; string retstring; istream * pios = new ifstream(filename_); ft_ = test_seq_filetype_stream(*pios, retstring); Sequence seq; int counter = 0; int length = 0; // phylip (1) NEXUS (0) if (ft_ == 1 || ft_ == 0) { if (ft_ == 1) { vector <string> fileDim = tokenize(retstring); num_taxa_ = stoi(fileDim[0]); num_char_ = stoi(fileDim[1]); } else { get_nexus_dimensions_file(seqf, num_taxa_, num_char_, interleave_); } if (!interleave_) { while (read_next_seq_from_stream(*pios, ft_, retstring, seq)) { length = (int)seq.get_sequence().size(); if (length != num_char_) { cout << "Sequence '" << seq.get_id() << "' has " << length << " characters, but the file '" << filename_ << "' specified " << num_char_ << " characters. Exiting." << endl; delete pios; exit(1); } if (toupcase_) { seq.set_sequence(seq.seq_to_upper()); } seqs_.push_back(seq); counter++; } if (counter != num_taxa_) { cout << "Read " << counter << " taxa, but the file '" << filename_ << "' specified " << num_taxa_ << " taxa. Exiting." << endl; delete pios; exit(1); } } else { seqs_ = read_interleaved_nexus_file(seqf, num_taxa_, num_char_); if (toupcase_) { for (int i = 0; i < num_taxa_; i++) { seqs_[i].set_sequence(seqs_[i].seq_to_upper()); } } } } else if (ft_ == 2) { // fasta bool first = true; while (read_next_seq_from_stream(*pios, ft_, retstring, seq)) { int curr = (int)seq.get_sequence().size(); if (!first) { if (curr != length) { cout << "Error: current sequence has " << curr << " characters, but previous sequence had " << length << " characters. Exiting." 
<< endl; delete pios; exit(1); } } else { length = curr; first = false; } if (toupcase_) { seq.set_sequence(seq.seq_to_upper()); } seqs_.push_back(seq); counter++; } // fasta has a trailing one if (toupcase_) { seq.set_sequence(seq.seq_to_upper()); } seqs_.push_back(seq); counter++; num_taxa_ = counter; num_char_ = length; } else { cout << "I don't know what that alignment file format is! Exiting." << endl; exit(0); } num_partitions_ = 1; partition_sizes_.push_back(num_char_); delete pios; }
int main(int argc, char * argv[]) { log_call(argc, argv); bool outfileset = false; bool sfileset = false; bool cfileset = false; bool nfileset = false; bool verbose = false; char * outf = NULL; char * seqf = NULL; string cnamef = ""; string nnamef = ""; while (1) { int oi = -1; int c = getopt_long(argc, argv, "s:c:n:o:vhV", long_options, &oi); if (c == -1) { break; } switch(c) { case 's': sfileset = true; seqf = strdup(optarg); check_file_exists(seqf); break; case 'c': cfileset = true; cnamef = strdup(optarg); check_file_exists(cnamef.c_str()); break; case 'n': nfileset = true; nnamef = strdup(optarg); check_file_exists(nnamef.c_str()); break; case 'o': outfileset = true; outf = strdup(optarg); break; case 'v': verbose = true; break; case 'h': print_help(); exit(0); case 'V': cout << versionline << endl; exit(0); default: print_error(argv[0], (char)c); exit(0); } } if (sfileset && outfileset) { check_inout_streams_identical(seqf, outf); } istream * pios = NULL; ostream * poos = NULL; ifstream * fstr = NULL; ofstream * ofstr = NULL; if (!nfileset | !cfileset) { cout << "Must supply both name files (-c for current, -n for new)." << endl; exit(0); } if (sfileset == true) { fstr = new ifstream(seqf); pios = fstr; } else { pios = &cin; if (check_for_input_to_stream() == false) { print_help(); exit(1); } } if (outfileset == true) { ofstr = new ofstream(outf); poos = ofstr; } else { poos = &cout; } Relabel rl (cnamef, nnamef, verbose); set <string> orig = rl.get_names_to_replace(); Sequence seq; string retstring; int ft = test_seq_filetype_stream(*pios, retstring); bool success = false; while (read_next_seq_from_stream(*pios, ft, retstring, seq)) { string terp = seq.get_id(); success = rl.relabel_sequence(seq); if (success) { orig.erase(terp); } (*poos) << ">" << seq.get_id() << endl; (*poos) << seq.get_sequence() << endl; } // have to deal with last sequence outside while loop. fix this. 
if (ft == 2) { string terp = seq.get_id(); success = rl.relabel_sequence(seq); if (success) { orig.erase(terp); } (*poos) << ">" << seq.get_id() << endl; (*poos) << seq.get_sequence() << endl; } if (orig.size() > 0) { if (verbose) { cerr << "The following names to match were not found in the alignment:" << endl; for (auto elem : orig) { cerr << elem << endl; } } } if (sfileset) { fstr->close(); delete pios; } if (outfileset) { ofstr->close(); delete poos; } return EXIT_SUCCESS; }
int main(int argc, char * argv[]) { log_call(argc, argv); bool outfileset = false; bool fileset = false; bool printpost = false; bool showancs = false; bool is_dna = true; float pinvar = 0.0; double tot; string yorn = "n"; int seqlen = 1000; int pos = 0; int pos2 = 0; string infreqs; string inrates; string holdrates; string ancseq; char * outf = NULL; char * treef = NULL; vector <double> diag(20, 0.0); vector <double> basefreq(4, 0.25); vector <double> aabasefreq(20, 0.05); vector <double> userrates; vector <double> multirates; int nreps = 1; // not implemented at the moment int seed = -1; int numpars = 0; float alpha = -1.0; vector<vector <double>> dmatrix; vector< vector <double> > aa_rmatrix(20, vector<double>(20, 1)); for (unsigned int i = 0; i < aa_rmatrix.size(); i++) { for (unsigned int j = 0; j < aa_rmatrix.size(); j++) { if (i == j) { // Fill Diagonal aa_rmatrix[i][j] = -19.0; } } } vector< vector <double> > rmatrix(4, vector<double>(4, 0.33)); for (unsigned int i = 0; i < rmatrix.size(); i++) { for (unsigned int j = 0; j < rmatrix.size(); j++) { if (i == j) { // Fill Diagonal rmatrix[i][j] = -0.99; } } } /*dmatrix = aa_rmatrix; for (unsigned int i = 0; i < dmatrix.size(); i++) { for (unsigned int j = 0; j < dmatrix.size(); j++) { cout << dmatrix[i][j] << " "; } cout << "\n"; }*/ while (1) { int oi = -1; int c = getopt_long(argc, argv, "t:o:l:b:g:i:r:w:q:n:x:apcm:k:hV", long_options, &oi); if (c == -1) { break; } switch(c) { case 't': fileset = true; treef = strdup(optarg); check_file_exists(treef); break; case 'o': outfileset = true; outf = strdup(optarg); break; case 'b': infreqs = strdup(optarg); parse_comma_list(infreqs, basefreq); if (basefreq.size() != 4) { cout << "Error: must provide 4 base frequencies (" << basefreq.size() << " provided). Exiting." << endl; exit(0); } if (!essentially_equal(sum(basefreq), 1.0)) { cout << "Error: base frequencies must sum to 1.0. Exiting." 
<< endl; exit(0); } break; case 'l': seqlen = atoi(strdup(optarg)); break; case 'a': showancs = true; break; case 'r': inrates = strdup(optarg); parse_comma_list(inrates, userrates); // NOTE: will have to alter this check for a.a., non-reversible, etc. if (userrates.size() != 6) { cout << "Error: must provide 6 substitution parameters. " << "Only " << userrates.size() << " provided. Exiting." << endl; exit(0); } // NOTE: this uses order: A,T,C,G for matrix, but // A<->C,A<->G,A<->T,C<->G,C<->T,G<->T for subst. params. rmatrix[0][2] = userrates[0]; rmatrix[2][0] = userrates[0]; rmatrix[0][3] = userrates[1]; rmatrix[3][0] = userrates[1]; rmatrix[0][1] = userrates[2]; rmatrix[1][0] = userrates[2]; rmatrix[2][3] = userrates[3]; rmatrix[3][2] = userrates[3]; rmatrix[1][2] = userrates[4]; rmatrix[2][1] = userrates[4]; rmatrix[1][3] = userrates[5]; rmatrix[3][1] = userrates[5]; rmatrix[0][0] = (userrates[0]+userrates[1]+userrates[2]) * -1; rmatrix[1][1] = (userrates[2]+userrates[4]+userrates[5]) * -1; rmatrix[2][2] = (userrates[0]+userrates[3]+userrates[4]) * -1; rmatrix[3][3] = (userrates[1]+userrates[3]+userrates[5]) * -1; /*//Turn on to check matrix for (unsigned int i = 0; i < rmatrix.size(); i++) { for (unsigned int j = 0; j < rmatrix.size(); j++) { cout << rmatrix[i][j] << " "; } cout << "\n"; }*/ break; case 'w': inrates = strdup(optarg); parse_comma_list(inrates, userrates); is_dna = false; // NOTE: will have to alter this check for a.a., non-reversible, etc. if (userrates.size() != 190) { cout << "Error: must provide 190 substitution parameters, I know its a stupidly large amount. " << "Only " << userrates.size() << " provided. Exiting." 
<< endl; exit(0); } pos = 0; pos2 = 1; //Fill the Matrix for (unsigned int i = 0; i < userrates.size(); i++){ aa_rmatrix[pos][pos2] = userrates[i]; aa_rmatrix[pos2][pos] = userrates[i]; pos2++; if (pos2 == 20){ pos += 1; pos2 = (pos + 1); } } //Replace Diagonal for (unsigned int i = 0; i < aa_rmatrix.size(); i++) { for (unsigned int j = 0; j < aa_rmatrix.size(); j++) { if (i != j){ tot += aa_rmatrix[i][j]; } } aa_rmatrix[i][i] = (tot*-1); tot = 0.0; } /* for (unsigned int i = 0; i < aa_rmatrix.size(); i++) { for (unsigned int j = 0; j < aa_rmatrix.size(); j++) { cout << aa_rmatrix[i][j] << " "; } cout << "\n"; }*/ break; case 'n': nreps = atoi(strdup(optarg)); break; case 'x': seed = atoi(strdup(optarg)); break; case 'q': is_dna = false; infreqs = strdup(optarg); parse_comma_list(infreqs, aabasefreq); if (aabasefreq.size() != 20) { cout << "Error: must provide 20 base frequencies (" << aabasefreq.size() << " provided). Exiting." << endl; exit(0); } if (!essentially_equal(sum(aabasefreq), 1.0)) { cout << "Error: base frequencies must sum to 1.0. Exiting." << endl; exit(0); } break; case 'g': alpha = atof(strdup(optarg)); break; case 'i': pinvar = atof(strdup(optarg)); break; case 'p': printpost = true; break; case 'c': is_dna = false; break; case 'm': holdrates = strdup(optarg); parse_comma_list(holdrates, multirates); numpars = multirates.size(); if ((numpars - 6) % 7 != 0) { cout << "Error: must provide 6 background substitution " << "parameters and 7 values (1 node id + 6 subst. par.) " << "for each piecewise model. Exiting." 
<< endl; exit(0); } break; case 'k': ancseq = strdup(optarg); break; case 'h': print_help(); exit(0); case 'V': cout << versionline << endl; exit(0); default: print_error(argv[0], (char)c); exit(0); } } if (fileset && outfileset) { check_inout_streams_identical(treef, outf); } if (is_dna) { dmatrix = rmatrix; } else { dmatrix = aa_rmatrix; } istream * pios = NULL; ostream * poos = NULL; ifstream * fstr = NULL; ofstream * ofstr = NULL; if (outfileset == true) { ofstr = new ofstream(outf); poos = ofstr; } else { poos = &cout; } if (fileset == true) { fstr = new ifstream(treef); pios = fstr; } else { pios = &cin; if (check_for_input_to_stream() == false) { print_help(); exit(1); } } /* * Default Base Frequencies and Rate Matrix * */ //vector <double> basefreq(4, 0.0); //basefreq[0] = .25; //basefreq[1] = .25; //basefreq[2] = .25; //basefreq[3] = 1.0 - basefreq[0] - basefreq[1] - basefreq[2]; /* vector< vector <double> > rmatrix(4, vector<double>(4, 0.33)); for (unsigned int i = 0; i < rmatrix.size(); i++) { for (unsigned int j = 0; j < rmatrix.size(); j++) { if (i == j) {//Fill Diagnol rmatrix[i][j] = -1.0; } } } */ string retstring; int ft = test_tree_filetype_stream(*pios, retstring); if (ft != 0 && ft != 1) { cerr << "this really only works with nexus or newick" << endl; exit(0); } // allow > 1 tree in input. passing but not yet using nreps int treeCounter = 0; bool going = true; if (ft == 1) { // newick. 
easy Tree * tree; while (going) { tree = read_next_tree_from_stream_newick (*pios, retstring, &going); if (tree != NULL) { //cout << "Working on tree #" << treeCounter << endl; SequenceGenerator SGen(seqlen, basefreq, dmatrix, tree, showancs, nreps, seed, alpha, pinvar, ancseq, printpost, multirates, aabasefreq, is_dna); vector <Sequence> seqs = SGen.get_sequences(); for (unsigned int i = 0; i < seqs.size(); i++) { Sequence seq = seqs[i]; (*poos) << ">" << seq.get_id() << endl; //cout << "Here" << endl; (*poos) << seq.get_sequence() << endl; } delete tree; treeCounter++; } } } else if (ft == 0) { // Nexus. need to worry about possible translation tables map <string, string> translation_table; bool ttexists; ttexists = get_nexus_translation_table(*pios, &translation_table, &retstring); Tree * tree; while (going) { tree = read_next_tree_from_stream_nexus(*pios, retstring, ttexists, &translation_table, &going); if (going == true) { cout << "Working on tree #" << treeCounter << endl; SequenceGenerator SGen(seqlen, basefreq, dmatrix, tree, showancs, nreps, seed, alpha, pinvar, ancseq, printpost, multirates, aabasefreq, is_dna); vector <Sequence> seqs = SGen.get_sequences(); for (unsigned int i = 0; i < seqs.size(); i++) { Sequence seq = seqs[i]; (*poos) << ">" << seq.get_id() << endl; (*poos) << seq.get_sequence() << endl; } delete tree; treeCounter++; } } } return EXIT_SUCCESS; }