/// Reads one named record from a (multi-)FASTA file.
///
/// The file is scanned line by line for a header line that is exactly
/// ">" + name. Once it is found, the stream (positioned on the line after the
/// header) is handed to read_sequence() and its result is returned.
///
/// @param filename  Path of the FASTA file to open.
/// @param name      Record name; the header line must equal ">" + name exactly.
/// @return          Whatever read_sequence() produces for the matched record.
/// @throws std::domain_error      if the file cannot be opened.
/// @throws std::invalid_argument  if no header line matching the name is found.
SeqType read_fasta(const std::string& filename, const std::string& name) {
    std::ifstream fastafile(filename.c_str(), std::ifstream::in);
    if (!fastafile.is_open()) {
        throw std::domain_error("Fasta file " + filename + " cannot be opened.");
    }

    const std::string begin_contig = ">" + name;

    // Drive the loop with the result of getline itself: any read failure
    // (end of file or a stream error) ends the search instead of spinning
    // forever, and a header on a final line without a trailing newline is
    // still compared before the search gives up.
    std::string line;
    bool found = false;
    while (std::getline(fastafile, line)) {
        if (line == begin_contig) {
            found = true;
            break;
        }
    }
    if (!found) {
        throw std::invalid_argument("Sequence of \"" + name + "\" not found.");
    }

    // The ifstream closes itself when it goes out of scope.
    return read_sequence(fastafile);
}
int main(int argc, char** argv) { if (argc < 4) { std::cout << "Usage: get_TrieArray <fastafile> <matrix> <peptideLength> <outfile> " << std::endl; return -1; } string fastafile(argv[1]); string matrix(argv[2]); cout << "test" << endl; int peptideLength(atoi(argv[3])); string outname(argv[4]); //std::cout << fastafile << "\t" << outname << std::endl; //---------------------------------------------------------------------------------------- cout << "Reading FASTA file..." << endl; Sequences s(fastafile); cout << "Read " << s.size() << " sequences." << endl; cout << "Generating peptides..." << endl; Sequences ninemers; generateAllSubstrings(ninemers, s, peptideLength); cout << "Generated " << ninemers.size() << " peptides." << endl; s.clear(); //Matrix m("/abi-projects/dist2self/matrices/BLOSUM45_distance_normal.dat"); cout << "Initializing trie. " << endl; Trie t; Matrix m(matrix); cout << "Initializing trie. " << endl; Trie t; Matrix::IndexSequence indices; for (size_t i = 0; i < ninemers.size(); ++i) { m.translate(ninemers[i], indices); t.add(indices); } t.dump(); cout << "Converting to trie array." << endl; TrieArray ta(t, peptideLength); cout << "Done." << endl; // std::ofstream ofs("test.trie"); std::ofstream ofs(outname.c_str()); boost::archive::text_oarchive oa(ofs); ta.save(oa,1); }
int main(int argc, char** argv) { if (argc < 2) { std::cout << "Usage: get_TrieArray <fastafile> <outfile> " << std::endl; return -1; } ofstream myfile; string fastafile(argv[1]); const char* outname(argv[2]); myfile.open(outname); std::cout << fastafile << "\t" << outname << std::endl; //---------------------------------------------------------------------------------------- cout << "Reading FASTA file..." << endl; Sequences s(fastafile); cout << "Read " << s.size() << " sequences." << endl; cout << "Generating ninemers..." << endl; Sequences ninemers; generateAllSubstrings(ninemers, s, 9); cout << "Generated " << ninemers.size() << " ninemers." << endl; s.clear(); for (size_t i = 0; i < ninemers.size(); ++i) { myfile << "> " << i << endl; myfile << ninemers[i] << endl; } myfile.close(); cout << "Done." << endl; }
/// Parses a FASTA-style file into sequences plus an optional probability table.
///
/// Line handling, as implemented below:
///  - A line starting with '>' is a header. It closes the record collected so
///    far (if it has any sequence data) and starts a new one.
///  - The first line starting with '#' closes the current record and switches
///    the parser to the probability section. Further '#' lines are skipped.
///  - In the sequence section every other line is appended to the current
///    record's sequence data (carriage returns removed).
///  - In the probability section every other line must split into exactly two
///    fields on tab or space: "<key> <value>". The value is stored under the
///    key "m_" + <key>.
///
/// A record is only emitted when it has sequence data, so an empty file or a
/// trailing header without data produces no sequence.
///
/// @param filename      Path of the file to parse.
/// @param codon_length  Forwarded unchanged to fasta::make_sequence.
/// @return              The collected sequences and probabilities.
/// @throws std::invalid_argument  if the file does not exist (std::stod may
///                                also throw it for a non-numeric value).
/// @throws std::runtime_error     if the file cannot be opened, or a
///                                probability line does not have two fields.
fasta::FastaData fasta::parse_fasta(std::string const& filename, int codon_length) {
    fs::path p(filename);
    if (!fs::exists(p)) {
        throw std::invalid_argument("File not found: " + filename);
    }

    std::ifstream fastafile(filename.c_str());
    if (!fastafile.is_open()) {
        // The path exists but cannot be read (e.g. permissions).
        throw std::runtime_error("Cannot open file: " + filename);
    }

    fasta::FastaData fd;
    std::string header;
    std::string seq_line;
    auto in_sequence_section = true;

    // Emits the record collected so far and resets the buffer, so the same
    // data can never be pushed twice.
    auto flush_sequence = [&]() {
        if (seq_line.empty()) {
            return;
        }
        fd.sequences.push_back(fasta::make_sequence(header, seq_line, codon_length));
        seq_line.clear();
    };

    auto strip_carriage_returns = [](std::string& text) {
        text.erase(std::remove(text.begin(), text.end(), '\r'), text.end());
    };

    std::string line;
    while (std::getline(fastafile, line)) {
        const char first = line.empty() ? '\0' : line[0];

        if (first == '>') {
            flush_sequence();
            in_sequence_section = true;
            strip_carriage_returns(line);
            header = line;
            continue;
        }

        if (first == '#') {
            if (in_sequence_section) {
                // The probability section is expected to follow a record
                // that actually has sequence data.
                assert(!seq_line.empty());
                flush_sequence();
                in_sequence_section = false;
            }
            continue;
        }

        if (in_sequence_section) {
            strip_carriage_returns(line);
            seq_line += line;
        } else {
            std::vector<std::string> result;
            boost::split(result, line, boost::is_any_of("\t "));
            if (result.size() != 2) {
                throw std::runtime_error("Invalid probability format: " + line);
            }
            fd.probabilities["m_" + result[0]] = std::stod(result[1]);
        }
    }

    // Without a probability section the last record is still pending.
    if (in_sequence_section) {
        flush_sequence();
    }
    return fd;
}