void SequenceConcatenater::write_partition_information (vector <string> const& inputFiles, string & partfile) { ofstream outfile(partfile.c_str()); int charIndex = 1; int stopIndex = 1; // need to check seq type when writing this // use infer_alpha / get_alpha_name // but: are mixed seq types allowed? prolly... // - so: need to check each one for (unsigned int i = 0; i < partition_sizes_.size(); i++) { stopIndex = charIndex + partition_sizes_[i] - 1; bool going = true; string alpha = ""; int j = 0; while (going) { Sequence terp = seqs_[j]; string subseq = terp.get_sequence().substr((charIndex - 1), partition_sizes_[i]); // check if all are the same character (presumably all N, but useful either way) if (subseq.find_first_not_of(subseq.front()) != std::string::npos) { terp.set_sequence(subseq); alpha = terp.get_alpha_name(); going = false; } j++; } outfile << alpha << ", " << inputFiles[i] << " = " << charIndex << "-" << stopIndex << endl; charIndex = stopIndex + 1; } outfile.close(); }
/**
 * Read every sequence from `pios` into sequences_ (keyed by sequence id),
 * record the alignment width in num_char_ and the number of stored sequences
 * in num_taxa_.
 *
 * The first sequence read defines num_char_; if is_dna_ is currently true and
 * that first sequence reports the alphabet name "AA", is_dna_ is switched to
 * false. Any later sequence whose length differs from num_char_ causes an
 * error message on stdout and process termination.
 *
 * @param pios  input stream to read from; must not be null
 */
void SequenceCleaner::read_sequences (istream* pios) {
    Sequence seq;
    string retstring;
    int ft = test_seq_filetype_stream(*pios, retstring);
    bool first = true;

    for (;;) {
        const bool more = read_next_seq_from_stream(*pios, ft, retstring, seq);
        // For file type 2 the reader returns false while `seq` still holds the
        // final record, so that record must be processed too. It now goes
        // through the same path as every other record, so a stream holding a
        // single such record correctly initialises num_char_ and is_dna_
        // instead of being compared against a stale num_char_.
        if (!more && ft != 2) {
            break;
        }

        const string residues = seq.get_sequence();
        sequences_[seq.get_id()] = residues;
        const int num_current_char = static_cast<int>(residues.size());

        if (first) {
            // alignment width is taken from an arbitrary (the first) sequence for now
            num_char_ = num_current_char;
            if (is_dna_) {
                string alpha_name = seq.get_alpha_name();
                if (alpha_name == "AA") {
                    is_dna_ = false;
                }
            }
            first = false;
        } else if (num_current_char != num_char_) {
            // NOTE(review): exits with status 0 even though this is an error
            // path; kept as-is in case callers rely on it -- confirm.
            cout << "Error: sequences are not all of the same length. Exiting." << endl;
            exit(0);
        }

        if (!more) {
            break;
        }
    }
    num_taxa_ = sequences_.size();
}